Пример #1
0
    def test_combined_iterator_reporting(self, mnist_factory):
        """Combine the MNIST train and test iterators and check the report.

        The combined report must cover all 70k samples, the two
        sub-reports must match the original split sizes (60k / 10k),
        and neither sub-report may be nested any further.
        """
        iterator_train, iterator_train_meta = mnist_factory.get_dataset_iterator(
            split="train")
        iterator_test, iterator_test_meta = mnist_factory.get_dataset_iterator(
            split="test")
        meta_train = MetaFactory.get_dataset_meta(
            identifier="id x",
            dataset_name="MNIST",
            dataset_tag="train",
            iterator_meta=iterator_train_meta)
        # Fix: the test split's meta was mistakenly tagged "train".
        meta_test = MetaFactory.get_dataset_meta(
            identifier="id x",
            dataset_name="MNIST",
            dataset_tag="test",
            iterator_meta=iterator_test_meta)

        informed_iterator_train = InformedDatasetFactory.get_dataset_iterator(
            iterator_train, meta_train)
        informed_iterator_test = InformedDatasetFactory.get_dataset_iterator(
            iterator_test, meta_test)

        meta_combined = MetaFactory.get_dataset_meta_from_existing(
            informed_iterator_train.dataset_meta, dataset_tag="full")

        iterator = InformedDatasetFactory.get_combined_dataset_iterator(
            [informed_iterator_train, informed_iterator_test], meta_combined)
        report = DatasetIteratorReportGenerator.generate_report(iterator)
        assert report.length == 70000 and report.sub_reports[
            0].length == 60000 and report.sub_reports[1].length == 10000
        assert not report.sub_reports[
            0].sub_reports and not report.sub_reports[1].sub_reports
Пример #2
0
 def dataset_meta(self) -> DatasetMeta:
     """Build the meta description for the synthetic test dataset."""
     positions = MetaFactory.get_iterator_meta(
         sample_pos=0, target_pos=1, tag_pos=2)
     return MetaFactory.get_dataset_meta(
         identifier="identifier_1",
         dataset_name="TEST DATASET",
         dataset_tag="train",
         iterator_meta=positions)
Пример #3
0
    def iterator(self) -> "InformedDatasetIteratorIF":
        """Build a small synthetic dataset iterator fixture.

        The dataset holds 600 samples (all ones) with targets
        1 (x100), 2 (x200) and 3 (x300).

        Fix: the return annotation was `str`, but the function returns
        the informed dataset iterator produced by the factory.
        """
        targets = [1]*100 + [2]*200 + [3]*300
        sequence_targets = torch.Tensor(targets)
        sequence_samples = torch.ones_like(sequence_targets)

        iterator = SequenceDatasetIterator([sequence_samples, sequence_targets])
        # NOTE(review): tag_pos reuses the target position — presumably there
        # is no separate tag channel in this fixture; confirm intent.
        iterator_meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=1)
        meta = MetaFactory.get_dataset_meta(identifier="dataset id",
                                            dataset_name="dataset",
                                            dataset_tag="full",
                                            iterator_meta=iterator_meta)
        return InformedDatasetFactory.get_dataset_iterator(iterator, meta)
Пример #4
0
 def get_shuffled_iterator(identifier: str,
                           iterator: InformedDatasetIteratorIF,
                           seed: int) -> InformedDatasetIteratorIF:
     """Return a seeded shuffled variant of `iterator` under a fresh identifier."""
     return InformedDatasetFactory.get_shuffled_dataset_iterator(
         iterator,
         MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                    identifier=identifier),
         seed)
Пример #5
0
 def get_in_memory_iterator(
         identifier: str,
         iterator: InformedDatasetIteratorIF) -> InformedDatasetIteratorIF:
     """Wrap `iterator` via the in-memory dataset factory under a fresh identifier."""
     return InformedDatasetFactory.get_in_memory_dataset_iterator(
         iterator,
         MetaFactory.get_dataset_meta_from_existing(iterator.dataset_meta,
                                                    identifier=identifier))
Пример #6
0
 def _get_iterator(self, split: str):
     """Fetch the Reuters HDF5 resource and return (iterator, iterator_meta)."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     resource = self.storage_connector.get_resource(
         identifier=self._get_resource_id(element="reuters.hdf5"))
     return ReutersIterator(resource, split), positions
Пример #7
0
 def _get_iterator(self):
     """Fetch the NewsGroups HDF5 resource and return (iterator, iterator_meta)."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     resource = self.storage_connector.get_resource(
         identifier=self._get_resource_id(element="news_groups.hdf5"))
     return NewsGroupsIterator(resource), positions
Пример #8
0
 def _get_iterator(self, split: str):
     """Fetch the per-split KDD resource and return (iterator, iterator_meta)."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     resource = self.storage_connector.get_resource(
         identifier=self._get_resource_id(element=f"{split}.pd"))
     return KDDIterator(resource), positions
Пример #9
0
 def _get_iterator(self,
                   split: str,
                   length: float,
                   num_samples: List[int],
                   seed: int = 1,
                   translation: List[int] = None):
     """Build an XOR-squares iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     squares = XORSquaresIterator(seed, length, num_samples, translation)
     return squares, positions
Пример #10
0
 def _get_iterator(self, split: str):
     """Return an ATIS iterator plus its positional meta.

     Supported splits: train, val, test.
     """
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     resource = self.storage_connector.get_resource(
         identifier=self._get_resource_id(element="atis_dataset.hdf5"))
     return AtisIterator(resource, split), positions
Пример #11
0
 def _get_iterator(self,
                   noise_std: float,
                   interval: List[float],
                   num_samples: int,
                   seed: int = 1):
     """Build a noisy x-cubed iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     cubic = NoisyXCubedIterator(seed, noise_std, interval, num_samples)
     return cubic, positions
Пример #12
0
 def get_iterator_view(
         identifier: str, iterator: InformedDatasetIteratorIF,
         selection_fun: Callable[[DatasetIteratorIF], List[int]],
         view_tags: Dict[str, Any]) -> InformedDatasetIteratorIF:
     """Create a tagged view of `iterator` restricted to the indices
     chosen by `selection_fun`.
     """
     selected_indices = selection_fun(iterator)
     view_meta = MetaFactory.get_dataset_meta_from_existing(
         iterator.dataset_meta, identifier=identifier)
     return InformedDatasetFactory.get_dataset_iterator_view(
         iterator, view_meta, selected_indices, view_tags)
Пример #13
0
 def get_filtered_labels_iterator(
         identifier: str, iterator: InformedDatasetIteratorIF,
         filtered_labels: List[Any]) -> InformedDatasetIteratorIF:
     """Return a view of `iterator` containing only the samples whose
     label (value at the meta's target position) is in `filtered_labels`.
     """
     # Hoist the invariant attribute-chain lookup out of the selection loop.
     target_pos = iterator.dataset_meta.target_pos
     valid_indices = [
         i for i in range(len(iterator))
         if iterator[i][target_pos] in filtered_labels
     ]
     meta = MetaFactory.get_dataset_meta_from_existing(
         iterator.dataset_meta, identifier=identifier)
     return InformedDatasetFactory.get_dataset_iterator_view(
         iterator, meta, valid_indices)
Пример #14
0
 def test_combined_iterator_reporting(self, informed_dataset_iterator):
     """Combining an iterator with itself doubles the reported length."""
     combined_meta = MetaFactory.get_dataset_meta_from_existing(
         informed_dataset_iterator.dataset_meta, dataset_tag="full")
     combined = InformedDatasetFactory.get_combined_dataset_iterator(
         [informed_dataset_iterator, informed_dataset_iterator],
         combined_meta)
     report = DatasetIteratorReportGenerator.generate_report(combined)
     # Two copies of a 1090-sample iterator -> 2180 total, flat sub-reports.
     assert report.length == 2180
     assert report.sub_reports[0].length == 1090
     assert report.sub_reports[1].length == 1090
     assert not report.sub_reports[0].sub_reports
     assert not report.sub_reports[1].sub_reports
Пример #15
0
 def get_mapped_labels_iterator(
         identifier: str, iterator: DatasetIteratorIF,
         mappings: Dict) -> InformedDatasetIteratorIF:
     """Apply `mappings` to the iterator's labels via a post-processing step."""
     source_meta = iterator.dataset_meta
     mapper = LabelMapperPostProcessor(
         mappings=mappings,
         target_position=source_meta.target_pos,
         tag_position=source_meta.tag_pos)
     new_meta = MetaFactory.get_dataset_meta_from_existing(
         source_meta, identifier=identifier)
     mapped = PostProcessedDatasetIterator(iterator, mapper)
     return InformedDatasetFactory.get_dataset_iterator(mapped, new_meta)
Пример #16
0
 def _get_iterator(self,
                   split: str,
                   scale_factor: float,
                   noise_std: float,
                   num_samples: List[int],
                   seed: int = 1,
                   translation: List[int] = None):
     """Build a circles iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     circles = CirclesIterator(seed, noise_std, num_samples, scale_factor,
                               translation)
     return circles, positions
Пример #17
0
 def _get_iterator(self,
                   split: str,
                   noise_std: float,
                   num_samples: List[int],
                   seed: int = 1,
                   translation: List[float] = None,
                   scaling: List[int] = None):
     """Build a half-moon iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     moons = HalfMoonIterator(seed, noise_std, num_samples, translation,
                              scaling)
     return moons, positions
Пример #18
0
 def _get_iterator(self,
                   split: str,
                   num_samples: List[int],
                   classes: List[int],
                   hypercube: List[Tuple[int, int]],
                   seed: int = 1):
     """Build a uniform-noise iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     noise_iterator = UniformNoiseIterator(seed=seed,
                                           num_samples=num_samples,
                                           classes=classes,
                                           hypercube=hypercube)
     return noise_iterator, positions
Пример #19
0
 def get_one_hot_encoded_target_iterators(
         identifier: str, iterators: Dict[str, InformedDatasetIteratorIF],
         target_vector_size: int) -> Dict[str, DatasetIteratorIF]:
     """One-hot encode the targets of every iterator in `iterators`.

     The target position is read from an arbitrary member — assumes all
     iterators share the same target position (TODO confirm).
     Returns a dict with the same keys mapped to post-processed iterators.
     """
     # next(iter(...)) reads one value without materializing items() as a list.
     target_position = next(iter(iterators.values())).dataset_meta.target_pos
     postprocessor = OneHotEncodedTargetPostProcessor(
         target_vector_size=target_vector_size,
         target_position=target_position)
     return {
         name: InformedDatasetFactory.get_dataset_iterator(
             PostProcessedDatasetIterator(iterator, postprocessor),
             MetaFactory.get_dataset_meta_from_existing(
                 iterator.dataset_meta, identifier=identifier))
         for name, iterator in iterators.items()
     }
Пример #20
0
    def test_plain_iterator_reporting(self, mnist_factory):
        """A plain (non-combined) iterator reports its full length and
        carries no sub-reports."""
        iterator, iterator_meta = mnist_factory.get_dataset_iterator(
            split="train")
        dataset_meta = MetaFactory.get_dataset_meta(
            identifier="id x",
            dataset_name="MNIST",
            dataset_tag="train",
            iterator_meta=iterator_meta)
        report = DatasetIteratorReportGenerator.generate_report(
            InformedDatasetIterator(iterator, dataset_meta))
        print(report)
        assert report.length == 60000
        assert not report.sub_reports
Пример #21
0
 def _get_iterator(self):
     """Load the arrhythmia sample/target resources as text streams and
     return (iterator, iterator_meta).
     """
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     # NOTE(review): "from_streamed_resouce" is the upstream API's spelling;
     # the typo lives in the resource class, not here.
     samples = StreamedTextResource.from_streamed_resouce(
         self.storage_connector.get_resource(
             identifier=self._get_resource_id(element="samples.pt")))
     targets = StreamedTextResource.from_streamed_resouce(
         self.storage_connector.get_resource(
             identifier=self._get_resource_id(element="targets.pt")))
     return ArrhythmiaIterator(samples, targets), positions
Пример #22
0
 def _get_iterator(self,
                   split: str,
                   class_label: int,
                   radius: float,
                   start_degree: float,
                   end_degree: float,
                   num_samples: int,
                   seed: int = 1,
                   translation: List[int] = None,
                   noise_std: int = 0):
     """Build a circular-segment iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1, tag_pos=2)
     segment = CircularSegmentIterator(seed, class_label, radius, start_degree,
                                       end_degree, num_samples, noise_std,
                                       translation)
     return segment, positions
Пример #23
0
 def get_feature_encoded_iterators(
     identifier: str, iterators: Dict[str, InformedDatasetIteratorIF],
     feature_encoding_configs: Dict[str, List[Any]]
 ) -> Dict[str, DatasetIteratorIF]:
     """Fit a feature encoder on all iterators and wrap each with it.

     The sample position is read from an arbitrary member — assumes all
     iterators share the same sample position (TODO confirm).
     Returns a dict with the same keys mapped to post-processed iterators.
     """
     # next(iter(...)) reads one value without materializing items() as a list.
     sample_position = next(iter(iterators.values())).dataset_meta.sample_pos
     feature_encoder_post_processor = FeatureEncoderPostProcessor(
         sample_position=sample_position,
         feature_encoding_configs=feature_encoding_configs)
     # Fit once on the full iterator collection before wrapping.
     feature_encoder_post_processor.fit(iterators)
     return {
         name: InformedDatasetFactory.get_dataset_iterator(
             PostProcessedDatasetIterator(iterator,
                                          feature_encoder_post_processor),
             MetaFactory.get_dataset_meta_from_existing(
                 iterator.dataset_meta, identifier=identifier))
         for name, iterator in iterators.items()
     }
Пример #24
0
 def _split(iterator: InformedDatasetIteratorIF, seed: int,
            split_config: Dict) -> Dict[str, InformedDatasetIteratorIF]:
     """Split `iterator` into named sub-iterators according to `split_config`.

     `split_config` maps new split names to ratios. Returns a dict mapping
     each split name to an informed iterator over that split's share.

     NOTE(review): `stratified` and `identifier` are free variables not
     defined in this function — presumably captured from an enclosing
     scope (this looks like a nested helper); confirm at the call site.
     """
     names = list(split_config.keys())
     ratios = list(split_config.values())
     # Choose the splitting strategy; both honor the same ratios and seed.
     if stratified:
         splitter = SplitterFactory.get_stratified_splitter(
             ratios=ratios, seed=seed)
     else:
         splitter = SplitterFactory.get_random_splitter(ratios=ratios,
                                                        seed=seed)
     splitted_iterators = splitter.split(iterator)
     # One meta per new split, derived from the source meta and tagged
     # with the split's name.
     dataset_metas = [
         MetaFactory.get_dataset_meta_from_existing(
             iterator.dataset_meta,
             identifier=identifier,
             dataset_tag=name) for name in names
     ]
     return {
         name: InformedDatasetFactory.get_dataset_iterator(
             splitted_iterators[i], dataset_metas[i])
         for i, name in enumerate(names)
     }
Пример #25
0
    def get_combined_iterators(
            identifier: str, iterators: Dict[str,
                                             Dict[str,
                                                  InformedDatasetIteratorIF]],
            combine_configs: Dict) -> Dict[str, InformedDatasetIteratorIF]:
        """Build combined iterators from existing splits.

        Args:
            identifier (str): identifier given to every combined dataset meta.
            iterators (Dict[str, Dict[str, InformedDatasetIteratorIF]]):
                mapping iterator_name -> split_name -> iterator.
            combine_configs (Dict): per-entry config with "old_splits"
                (which iterators/splits to merge) and "new_split" (result name).

        Returns:
            Dict[str, InformedDatasetIteratorIF]: new_split name -> combined iterator.
        """
        def _collect_members(
                iterators: Dict[str, Dict[str, InformedDatasetIteratorIF]],
                split_config: List):
            # Flatten the configured (iterator_name, split) pairs in order.
            members = []
            for element in split_config:
                for split_name in element["splits"]:
                    members.append(
                        iterators[element["iterators_name"]][split_name])
            return members

        combined_iterators = {}
        for split_config in combine_configs:
            members = _collect_members(iterators, split_config["old_splits"])
            # Derive the combined meta from the first member's meta.
            combined_meta = MetaFactory.get_dataset_meta_from_existing(
                dataset_meta=members[0].dataset_meta,
                identifier=identifier,
                dataset_name="combined_dataset",
                dataset_tag=None)
            new_split = split_config["new_split"]
            combined_iterators[new_split] = \
                InformedDatasetFactory.get_dataset_iterator(
                    CombinedDatasetIterator(members), combined_meta)
        return combined_iterators
Пример #26
0
 def _get_iterator(self, split: str, class_label: int, seed: int, num_samples: int, covariance: np.array, mean: Tuple[int, int]):
     """Build a Gaussian iterator and return it with its positional meta."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0,
                                               target_pos=1,
                                               tag_pos=2)
     gaussian = GaussianIterator(seed, class_label, num_samples, covariance,
                                 mean)
     return gaussian, positions
Пример #27
0
 def _get_iterator(self, split: str, high_level_targets: bool = True):
     """Fetch the TREC HDF5 resource and return (iterator, iterator_meta)."""
     positions = MetaFactory.get_iterator_meta(sample_pos=0,
                                               target_pos=1,
                                               tag_pos=2)
     resource = self.storage_connector.get_resource(
         identifier=self._get_resource_id(element="trec_dataset.hdf5"))
     return TrecIterator(resource, split, high_level_targets), positions