def test_combined_iterator_reporting(self, mnist_factory):
    """Combining MNIST train (60k) and test (10k) yields a flat 70k report.

    The combined report must expose one sub-report per constituent split and
    no deeper nesting.
    """
    iterator_train, iterator_train_meta = mnist_factory.get_dataset_iterator(
        split="train")
    iterator_test, iterator_test_meta = mnist_factory.get_dataset_iterator(
        split="test")
    meta_train = MetaFactory.get_dataset_meta(
        identifier="id x", dataset_name="MNIST", dataset_tag="train",
        iterator_meta=iterator_train_meta)
    # FIX: the test split was previously tagged "train" (copy/paste error);
    # it now carries the correct "test" tag.
    meta_test = MetaFactory.get_dataset_meta(
        identifier="id x", dataset_name="MNIST", dataset_tag="test",
        iterator_meta=iterator_test_meta)
    informed_iterator_train = InformedDatasetFactory.get_dataset_iterator(
        iterator_train, meta_train)
    informed_iterator_test = InformedDatasetFactory.get_dataset_iterator(
        iterator_test, meta_test)
    meta_combined = MetaFactory.get_dataset_meta_from_existing(
        informed_iterator_train.dataset_meta, dataset_tag="full")
    iterator = InformedDatasetFactory.get_combined_dataset_iterator(
        [informed_iterator_train, informed_iterator_test], meta_combined)
    report = DatasetIteratorReportGenerator.generate_report(iterator)
    assert report.length == 70000 and report.sub_reports[
        0].length == 60000 and report.sub_reports[1].length == 10000
    assert not report.sub_reports[
        0].sub_reports and not report.sub_reports[1].sub_reports
def get_shuffled_iterator(identifier: str, iterator: InformedDatasetIteratorIF,
                          seed: int) -> InformedDatasetIteratorIF:
    """Return a deterministically shuffled view of `iterator`.

    The resulting iterator carries a copy of the original meta data with its
    identifier replaced by `identifier`; `seed` fixes the shuffle order.
    """
    shuffled_meta = MetaFactory.get_dataset_meta_from_existing(
        iterator.dataset_meta, identifier=identifier)
    return InformedDatasetFactory.get_shuffled_dataset_iterator(
        iterator, shuffled_meta, seed)
def get_in_memory_iterator(
        identifier: str,
        iterator: InformedDatasetIteratorIF) -> InformedDatasetIteratorIF:
    """Return an in-memory copy of `iterator` under a new identifier."""
    cached_meta = MetaFactory.get_dataset_meta_from_existing(
        iterator.dataset_meta, identifier=identifier)
    return InformedDatasetFactory.get_in_memory_dataset_iterator(
        iterator, cached_meta)
def get_iterator_view(
        identifier: str, iterator: InformedDatasetIteratorIF,
        selection_fun: Callable[[DatasetIteratorIF], List[int]],
        view_tags: Dict[str, Any]) -> InformedDatasetIteratorIF:
    """Return a view of `iterator` restricted to indices chosen by `selection_fun`.

    Args:
        identifier: Identifier for the new view's meta data.
        iterator: Source iterator; its meta data is cloned for the view.
        selection_fun: Maps the iterator to the list of indices to keep.
        view_tags: Tags attached to the resulting view.

    Returns:
        An informed dataset-iterator view over the selected indices.
    """
    # The selection function owns the filtering logic; this factory only
    # wires its result into a view.
    valid_indices = selection_fun(iterator)
    meta = MetaFactory.get_dataset_meta_from_existing(
        iterator.dataset_meta, identifier=identifier)
    return InformedDatasetFactory.get_dataset_iterator_view(
        iterator, meta, valid_indices, view_tags)
def test_combined_iterator_reporting(self, informed_dataset_iterator):
    """Combining the same 1090-sample iterator twice yields a flat 2180 report."""
    combined_meta = MetaFactory.get_dataset_meta_from_existing(
        informed_dataset_iterator.dataset_meta, dataset_tag="full")
    combined = InformedDatasetFactory.get_combined_dataset_iterator(
        [informed_dataset_iterator, informed_dataset_iterator], combined_meta)
    report = DatasetIteratorReportGenerator.generate_report(combined)
    first, second = report.sub_reports[0], report.sub_reports[1]
    assert report.length == 2180
    assert first.length == 1090
    assert second.length == 1090
    # constituent reports must be leaves — no further nesting
    assert not first.sub_reports
    assert not second.sub_reports
def get_filtered_labels_iterator(
        identifier: str, iterator: InformedDatasetIteratorIF,
        filtered_labels: List[Any]) -> InformedDatasetIteratorIF:
    """Return a view of `iterator` keeping only samples whose target is in
    `filtered_labels`."""
    target_pos = iterator.dataset_meta.target_pos
    valid_indices = []
    for sample_index in range(len(iterator)):
        if iterator[sample_index][target_pos] in filtered_labels:
            valid_indices.append(sample_index)
    filtered_meta = MetaFactory.get_dataset_meta_from_existing(
        iterator.dataset_meta, identifier=identifier)
    return InformedDatasetFactory.get_dataset_iterator_view(
        iterator, filtered_meta, valid_indices)
def iterator(self) -> DatasetIteratorIF:
    """Fixture: informed iterator of 600 constant samples with targets
    1 (x100), 2 (x200) and 3 (x300).

    FIX: the return annotation previously claimed `str`; the fixture returns
    an informed dataset iterator (matching the sibling fixture's annotation).
    """
    targets = [1]*100 + [2]*200 + [3]*300
    sequence_targets = torch.Tensor(targets)
    sequence_samples = torch.ones_like(sequence_targets)
    iterator = SequenceDatasetIterator([sequence_samples, sequence_targets])
    # NOTE(review): tag_pos is deliberately set equal to target_pos here —
    # targets double as tags for this fixture; confirm if that is intended.
    iterator_meta = MetaFactory.get_iterator_meta(sample_pos=0, target_pos=1,
                                                  tag_pos=1)
    meta = MetaFactory.get_dataset_meta(identifier="dataset id",
                                        dataset_name="dataset",
                                        dataset_tag="full",
                                        iterator_meta=iterator_meta)
    return InformedDatasetFactory.get_dataset_iterator(iterator, meta)
def get_mapped_labels_iterator(
        identifier: str, iterator: InformedDatasetIteratorIF,
        mappings: Dict) -> InformedDatasetIteratorIF:
    """Return an iterator whose labels are rewritten according to `mappings`.

    FIX: the `iterator` parameter was annotated `DatasetIteratorIF`, but the
    body reads `iterator.dataset_meta`, which only an informed iterator
    provides; the annotation now matches the sibling factory functions.

    Args:
        identifier: Identifier for the new iterator's meta data.
        iterator: Informed source iterator whose meta data supplies the
            target and tag positions.
        mappings: Label mapping configuration passed to the post processor.

    Returns:
        An informed, post-processed iterator with remapped labels.
    """
    label_mapper_post_processor = LabelMapperPostProcessor(
        mappings=mappings,
        target_position=iterator.dataset_meta.target_pos,
        tag_position=iterator.dataset_meta.tag_pos)
    meta = MetaFactory.get_dataset_meta_from_existing(
        iterator.dataset_meta, identifier=identifier)
    return InformedDatasetFactory.get_dataset_iterator(
        PostProcessedDatasetIterator(iterator, label_mapper_post_processor),
        meta)
def get_one_hot_encoded_target_iterators(
        identifier: str, iterators: Dict[str, InformedDatasetIteratorIF],
        target_vector_size: int) -> Dict[str, DatasetIteratorIF]:
    """One-hot encode the targets of every iterator in `iterators`.

    Args:
        identifier: Identifier stamped onto each resulting iterator's meta.
        iterators: Mapping split name -> informed iterator; all are assumed
            to share the same target position.
        target_vector_size: Length of the one-hot target vectors.

    Returns:
        Mapping split name -> post-processed iterator. Empty input yields an
        empty mapping (previously this raised an IndexError).
    """
    if not iterators:
        return {}
    # next(iter(...)) reads the first iterator's meta without materializing
    # the whole items() list as list(iterators.items())[0][1] did.
    target_position = next(iter(iterators.values())).dataset_meta.target_pos
    postprocessor = OneHotEncodedTargetPostProcessor(
        target_vector_size=target_vector_size,
        target_position=target_position)
    return {
        name: InformedDatasetFactory.get_dataset_iterator(
            PostProcessedDatasetIterator(iterator, postprocessor),
            MetaFactory.get_dataset_meta_from_existing(
                iterator.dataset_meta, identifier=identifier))
        for name, iterator in iterators.items()
    }
def get_feature_encoded_iterators(
        identifier: str, iterators: Dict[str, InformedDatasetIteratorIF],
        feature_encoding_configs: Dict[str, List[Any]]
) -> Dict[str, DatasetIteratorIF]:
    """Fit a feature encoder over all iterators and apply it to each.

    Args:
        identifier: Identifier stamped onto each resulting iterator's meta.
        iterators: Mapping split name -> informed iterator; all are assumed
            to share the same sample position.
        feature_encoding_configs: Encoder configuration passed through to the
            post processor.

    Returns:
        Mapping split name -> post-processed iterator. Empty input yields an
        empty mapping (previously this raised an IndexError).
    """
    if not iterators:
        return {}
    # next(iter(...)) reads the first iterator's meta without materializing
    # the whole items() list as list(iterators.items())[0][1] did.
    sample_position = next(iter(iterators.values())).dataset_meta.sample_pos
    feature_encoder_post_processor = FeatureEncoderPostProcessor(
        sample_position=sample_position,
        feature_encoding_configs=feature_encoding_configs)
    # the encoder is fitted on ALL splits before being applied to each
    feature_encoder_post_processor.fit(iterators)
    return {
        name: InformedDatasetFactory.get_dataset_iterator(
            PostProcessedDatasetIterator(iterator,
                                         feature_encoder_post_processor),
            MetaFactory.get_dataset_meta_from_existing(
                iterator.dataset_meta, identifier=identifier))
        for name, iterator in iterators.items()
    }
def _split(iterator: InformedDatasetIteratorIF, seed: int,
           split_config: Dict) -> Dict[str, InformedDatasetIteratorIF]:
    """Split `iterator` into named parts according to `split_config`
    (mapping split name -> ratio).

    NOTE(review): reads `stratified` and `identifier` from the enclosing
    scope — this function appears to be nested inside a factory method not
    visible here; confirm against the full file.
    """
    names = list(split_config.keys())
    ratios = list(split_config.values())
    if stratified:
        splitter = SplitterFactory.get_stratified_splitter(ratios=ratios,
                                                           seed=seed)
    else:
        splitter = SplitterFactory.get_random_splitter(ratios=ratios,
                                                       seed=seed)
    parts = splitter.split(iterator)
    result = {}
    for position, name in enumerate(names):
        part_meta = MetaFactory.get_dataset_meta_from_existing(
            iterator.dataset_meta, identifier=identifier, dataset_tag=name)
        result[name] = InformedDatasetFactory.get_dataset_iterator(
            parts[position], part_meta)
    return result
def get_combined_iterators(
        identifier: str,
        iterators: Dict[str, Dict[str, InformedDatasetIteratorIF]],
        combine_configs: Dict) -> Dict[str, InformedDatasetIteratorIF]:
    """Combine existing splits into new ones.

    Args:
        identifier: Identifier stamped onto each combined iterator's meta.
        iterators: Mapping iterator_name -> split_name -> iterator.
        combine_configs: Iterable of configs, each with an "old_splits" list
            (entries holding "iterators_name" and "splits") and a
            "new_split" name for the combination.

    Returns:
        Mapping new split name -> combined informed iterator.
    """

    def _collect(old_splits: List) -> List[InformedDatasetIteratorIF]:
        # flatten the referenced (iterators_name, split) pairs in order
        members = []
        for entry in old_splits:
            for split_name in entry["splits"]:
                members.append(iterators[entry["iterators_name"]][split_name])
        return members

    combined_iterators = {}
    for config in combine_configs:
        members = _collect(config["old_splits"])
        # the first member's meta serves as the template for the combination
        combined_meta = MetaFactory.get_dataset_meta_from_existing(
            dataset_meta=members[0].dataset_meta,
            identifier=identifier,
            dataset_name="combined_dataset",
            dataset_tag=None)
        combined_iterators[config["new_split"]] = \
            InformedDatasetFactory.get_dataset_iterator(
                CombinedDatasetIterator(members), combined_meta)
    return combined_iterators
def informed_dataset_iterator(self, dataset_iterator,
                              dataset_meta) -> DatasetIteratorIF:
    """Fixture: wrap the plain iterator and its meta into an informed iterator."""
    return InformedDatasetFactory.get_dataset_iterator(dataset_iterator,
                                                       dataset_meta)