def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Build the sample pipeline from the image archive and the JSON meta file.

    Without annotations only the raw images are yielded. Otherwise the meta
    file is demultiplexed into image records and annotation records, the
    annotations are grouped per image, and both are joined back to the image
    files.
    """
    images_dp, meta_dp = resource_dps

    if self._annotations is None:
        # Shuffle before sharding so shuffling covers the full dataset.
        # NOTE: the original applied hint_shuffling a second time after
        # hint_sharding, which was redundant and has been removed.
        dp = hint_shuffling(images_dp)
        dp = hint_sharding(dp)
        return Mapper(dp, self._prepare_image)

    meta_dp = Filter(meta_dp, self._filter_meta_files)
    meta_dp = JsonParser(meta_dp)
    meta_dp = Mapper(meta_dp, getitem(1))
    meta_dp: IterDataPipe[Dict[str, Dict[str, Any]]] = MappingIterator(meta_dp)
    images_meta_dp, anns_meta_dp = Demultiplexer(
        meta_dp,
        2,
        self._classify_meta,
        drop_none=True,
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    images_meta_dp = Mapper(images_meta_dp, getitem(1))
    images_meta_dp = UnBatcher(images_meta_dp)

    anns_meta_dp = Mapper(anns_meta_dp, getitem(1))
    anns_meta_dp = UnBatcher(anns_meta_dp)
    # Collect every annotation that belongs to the same image into one group.
    anns_meta_dp = Grouper(anns_meta_dp, group_key_fn=getitem("image_id"), buffer_size=INFINITE_BUFFER_SIZE)
    anns_meta_dp = hint_shuffling(anns_meta_dp)
    anns_meta_dp = hint_sharding(anns_meta_dp)

    # First join: annotation groups with the image meta record they describe.
    anns_dp = IterKeyZipper(
        anns_meta_dp,
        images_meta_dp,
        key_fn=getitem(0, "image_id"),
        ref_key_fn=getitem("id"),
        buffer_size=INFINITE_BUFFER_SIZE,
    )
    # Second join: the combined meta with the actual image file by file name.
    dp = IterKeyZipper(
        anns_dp,
        images_dp,
        key_fn=getitem(1, "file_name"),
        ref_key_fn=path_accessor("name"),
        buffer_size=INFINITE_BUFFER_SIZE,
    )
    return Mapper(dp, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Join the split file, the joint-categories CSV, and the image files into samples."""
    archive_dp = resource_dps[0]
    splits_dp, joint_categories_dp, images_dp = Demultiplexer(
        archive_dp, 3, self._classify_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE
    )

    # Keep only the list file of the requested split/fold, e.g. "train1.txt".
    splits_dp = Filter(splits_dp, path_comparator("name", f"{config.split}{config.fold}.txt"))
    splits_dp = LineReader(splits_dp, decode=True, return_path=False)
    splits_dp = Shuffler(splits_dp, buffer_size=INFINITE_BUFFER_SIZE)
    splits_dp = hint_sharding(splits_dp)

    joint_categories_dp = CSVParser(joint_categories_dp, delimiter=" ")

    # First join: each split line (the image key) with its joint-categories row.
    dp = IterKeyZipper(
        splits_dp,
        joint_categories_dp,
        key_fn=getitem(),
        ref_key_fn=getitem(0),
        buffer_size=INFINITE_BUFFER_SIZE,
    )
    # Second join: the (key, categories) pair with the actual image file.
    dp = IterKeyZipper(
        dp,
        images_dp,
        key_fn=getitem(0),
        ref_key_fn=self._image_key_fn,
        buffer_size=INFINITE_BUFFER_SIZE,
    )
    return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Split the archive into split-list, images, and annotations, and join them."""
    archive_dp = resource_dps[0]
    split_dp, images_dp, anns_dp = Demultiplexer(
        archive_dp,
        3,
        self._classify_archive,
        drop_none=True,
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    # Narrow down to the configured split folder and split file, then read the ids.
    split_dp = Filter(split_dp, functools.partial(self._is_in_folder, name=self._split_folder))
    split_dp = Filter(split_dp, path_comparator("name", f"{self._split}.txt"))
    split_dp = LineReader(split_dp, decode=True)
    split_dp = hint_shuffling(split_dp)
    split_dp = hint_sharding(split_dp)

    dp = split_dp
    for level, data_dp in enumerate((images_dp, anns_dp)):
        dp = IterKeyZipper(
            dp,
            data_dp,
            # Every zip nests the previous result one tuple level deeper, so the
            # image id sits at index (0,) * level followed by 1.
            key_fn=getitem(*[0] * level, 1),
            ref_key_fn=path_accessor("stem"),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
    return Mapper(dp, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
) -> IterDataPipe[Dict[str, Any]]:
    """Join the split id list with the image and annotation files of the archive.

    FIX: the original re-assigned ``archive_dp = resource_dps[0]`` immediately
    after unpacking ``archive_dp, extra_split_dp = resource_dps`` — a redundant
    no-op statement that has been removed.
    """
    archive_dp, extra_split_dp = resource_dps

    split_dp, images_dp, anns_dp = Demultiplexer(
        archive_dp,
        3,
        self._classify_archive,
        buffer_size=INFINITE_BUFFER_SIZE,
        drop_none=True,
    )

    if config.split == "train_noval":
        # "train_noval" is not part of the main archive; it ships as a separate resource.
        split_dp = extra_split_dp
    split_dp = Filter(split_dp, path_comparator("name", f"{config.split}.txt"))
    split_dp = LineReader(split_dp, decode=True)
    split_dp = hint_sharding(split_dp)
    split_dp = hint_shuffling(split_dp)

    dp = split_dp
    for level, data_dp in enumerate((images_dp, anns_dp)):
        dp = IterKeyZipper(
            dp,
            data_dp,
            # The id sits one tuple level deeper after every zip.
            key_fn=getitem(*[0] * level, 1),
            ref_key_fn=path_accessor("stem"),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
    return Mapper(dp, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Pair each CLEVR image with its scene description (empty for the test split)."""
    archive_dp = resource_dps[0]
    images_dp, scenes_dp = Demultiplexer(
        archive_dp,
        2,
        self._classify_archive,
        drop_none=True,
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    # Images live in a per-split folder, e.g. ".../train/...".
    images_dp = Filter(images_dp, path_comparator("parent.name", self._split))
    images_dp = hint_shuffling(images_dp)
    images_dp = hint_sharding(images_dp)

    if self._split != "test":
        scenes_dp = Filter(scenes_dp, path_comparator("name", f"CLEVR_{self._split}_scenes.json"))
        scenes_dp = JsonParser(scenes_dp)
        # The JSON holds all scenes under the "scenes" key; unbatch to one per item.
        scenes_dp = Mapper(scenes_dp, getitem(1, "scenes"))
        scenes_dp = UnBatcher(scenes_dp)

        dp = IterKeyZipper(
            images_dp,
            scenes_dp,
            key_fn=path_accessor("name"),
            ref_key_fn=getitem("image_filename"),
            buffer_size=INFINITE_BUFFER_SIZE,
        )
    else:
        # The test split has no scene annotations.
        dp = Mapper(images_dp, self._add_empty_anns)

    return Mapper(dp, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Pair each .ppm image with its CSV annotation row and decode the pair.

    FIX: shuffle before sharding (consistent with the sibling ``_datapipe``
    implementation of this dataset) and repair the "the the" comment typo.
    """
    if config.split == "train":
        # The train resource is one archive containing both images and annotations.
        images_dp, ann_dp = Demultiplexer(
            resource_dps[0], 2, self._classify_train_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE
        )
    else:
        images_dp, ann_dp = resource_dps

    images_dp = Filter(images_dp, path_comparator("suffix", ".ppm"))

    # The order of the image files in the .zip archives perfectly match the order of the entries in
    # the (possibly concatenated) .csv files. So we're able to use Zipper here instead of a IterKeyZipper.
    ann_dp = CSVDictParser(ann_dp, delimiter=";")
    dp = Zipper(images_dp, ann_dp)

    # Shuffle first so that shuffling happens over the full dataset, then shard.
    dp = hint_shuffling(dp)
    dp = hint_sharding(dp)

    dp = Mapper(dp, partial(self._collate_and_decode, decoder=decoder))
    return dp
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Join each non-background image with its annotation file and decode the pair."""
    images_dp, anns_dp = resource_dps

    images_dp = Filter(images_dp, self._is_not_background_image)
    images_dp = hint_shuffling(hint_sharding(images_dp))

    anns_dp = Filter(anns_dp, self._is_ann)

    joined = IterKeyZipper(
        images_dp,
        anns_dp,
        key_fn=self._images_key_fn,
        ref_key_fn=self._anns_key_fn,
        buffer_size=INFINITE_BUFFER_SIZE,
        keep_key=True,
    )
    return Mapper(joined, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def from_data_folder(
    root: Union[str, pathlib.Path],
    *,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]] = None,
    valid_extensions: Optional[Collection[str]] = None,
    recursive: bool = True,
) -> Tuple[IterDataPipe, List[str]]:
    """Build a datapipe over an image-folder layout.

    Returns the sample datapipe together with the sorted category names, one
    per direct sub-directory of ``root``.
    """
    root = pathlib.Path(root).expanduser().resolve()
    categories = sorted(entry.name for entry in os.scandir(root) if entry.is_dir())
    # An empty string mask makes FileLister match every file.
    masks: Union[List[str], str] = [f"*.{ext}" for ext in valid_extensions] if valid_extensions is not None else ""
    dp = FileLister(str(root), recursive=recursive, masks=masks)
    # Files directly under root do not belong to any category folder.
    dp: IterDataPipe = Filter(dp, functools.partial(_is_not_top_level_file, root=root))
    dp = hint_sharding(dp)
    dp = Shuffler(dp, buffer_size=INFINITE_BUFFER_SIZE)
    dp = FileOpener(dp, mode="rb")
    return (
        Mapper(dp, functools.partial(_collate_and_decode_data, root=root, categories=categories, decoder=decoder)),
        categories,
    )
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Read the packed image/label file, unbatch it, and emit prepared samples."""
    pipe = UnBatcher(Mapper(resource_dps[0], self._read_images_and_labels))
    pipe = hint_sharding(hint_shuffling(pipe))
    return Mapper(pipe, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Shuffle and shard the single resource, then emit prepared samples."""
    pipe = hint_sharding(hint_shuffling(resource_dps[0]))
    return Mapper(pipe, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Decompress the resource, read it line by line, and emit prepared samples."""
    lines = LineReader(Decompressor(resource_dps[0]), decode=True, return_path=False)
    lines = hint_sharding(hint_shuffling(lines))
    return Mapper(lines, self._prepare_sample)
def _make_datapipe(
    self, resource_dps: List[IterDataPipe], *, config: DatasetConfig
) -> IterDataPipe[Dict[str, Any]]:
    """Keep only the files of the configured split and emit prepared samples."""
    split_folder = self._SPLIT_NAME_MAPPER[config.split]
    pipe = Filter(resource_dps[0], path_comparator("parent.parent.name", split_folder))
    pipe = hint_shuffling(hint_sharding(pipe))
    return Mapper(pipe, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Unpickle the CIFAR data batches and emit prepared (image, label) samples."""
    batches = Filter(resource_dps[0], self._is_data_file)
    batches = Mapper(batches, self._unpickle)
    pairs = CifarFileReader(batches, labels_key=self._LABELS_KEY)
    pairs = hint_sharding(hint_shuffling(pairs))
    return Mapper(pairs, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Keep only the files under the folder of the selected split and prepare samples."""
    files = Filter(resource_dps[0], path_comparator("parent.parent.name", self._split_folder_name))
    files = hint_sharding(hint_shuffling(files))
    return Mapper(files, self._prepare_sample)
def _make_datapipe(self, resource_dps: List[IterDataPipe], *, config: DatasetConfig) -> IterDataPipe[Dict[str, Any]]:
    """Prepare ImageNet samples; val labels are joined from the devkit archive."""
    if config.split in {"train", "test"}:
        dp = resource_dps[0]

        # the train archive is a tar of tars
        if config.split == "train":
            dp = TarArchiveReader(dp)
        dp = hint_sharding(dp)
        dp = hint_shuffling(dp)
        dp = Mapper(dp, self._prepare_train_data if config.split == "train" else self._prepare_test_data)
    else:  # config.split == "val":
        images_dp, devkit_dp = resource_dps

        meta_dp, label_dp = Demultiplexer(
            devkit_dp, 2, self._classifiy_devkit, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE
        )

        # The meta file yields (categories, wnids) pairs; only the wnids are needed
        # here to translate numeric labels.
        meta_dp = Mapper(meta_dp, self._extract_categories_and_wnids)
        _, wnids = zip(*next(iter(meta_dp)))

        label_dp = LineReader(label_dp, decode=True, return_path=False)
        label_dp = Mapper(label_dp, functools.partial(self._imagenet_label_to_wnid, wnids=wnids))
        # Enumerate from 1 so the index matches the numbering in the image file names.
        label_dp: IterDataPipe[Tuple[int, str]] = Enumerator(label_dp, 1)
        label_dp = hint_sharding(label_dp)
        label_dp = hint_shuffling(label_dp)

        dp = IterKeyZipper(
            label_dp,
            images_dp,
            key_fn=getitem(0),
            ref_key_fn=self._val_test_image_key,
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        dp = Mapper(dp, self._prepare_val_data)

    return Mapper(dp, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Collate and decode ImageNet samples; val labels come from the devkit ground truth."""
    images_dp, devkit_dp = resource_dps

    if config.split == "train":
        # the train archive is a tar of tars
        dp = TarArchiveReader(images_dp)
        dp = hint_sharding(dp)
        dp = hint_shuffling(dp)
        dp = Mapper(dp, self._collate_train_data)
    elif config.split == "val":
        devkit_dp = Filter(devkit_dp, path_comparator("name", "ILSVRC2012_validation_ground_truth.txt"))
        devkit_dp = LineReader(devkit_dp, return_path=False)
        devkit_dp = Mapper(devkit_dp, int)
        # Enumerate from 1 so the index matches the numbering in the image file names.
        devkit_dp = Enumerator(devkit_dp, 1)
        devkit_dp = hint_sharding(devkit_dp)
        devkit_dp = hint_shuffling(devkit_dp)

        dp = IterKeyZipper(
            devkit_dp,
            images_dp,
            key_fn=getitem(0),
            ref_key_fn=self._val_test_image_key,
            buffer_size=INFINITE_BUFFER_SIZE,
        )
        dp = Mapper(dp, self._collate_val_data)
    else:  # config.split == "test"
        dp = hint_sharding(images_dp)
        dp = hint_shuffling(dp)
        dp = Mapper(dp, self._collate_test_data)

    return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Pair every image with its annotation row and emit prepared samples."""
    images_dp, targets_dp = resource_dps

    if self._split == "train":
        # For the train split the annotations live in a dedicated .mat file.
        targets_dp = Filter(targets_dp, path_comparator("name", "cars_train_annos.mat"))
    labels = StanfordCarsLabelReader(targets_dp)

    paired = Zipper(images_dp, labels)
    paired = hint_sharding(hint_shuffling(paired))
    return Mapper(paired, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Zip the image ("x") and target ("y") HDF5 streams and prepare each pair."""
    images_dp, targets_dp = resource_dps
    paired = Zipper(PCAMH5Reader(images_dp, key="x"), PCAMH5Reader(targets_dp, key="y"))
    paired = hint_sharding(hint_shuffling(paired))
    return Mapper(paired, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
) -> IterDataPipe[Dict[str, Any]]:
    """Drop known rogue files from the resource and emit prepared samples."""
    files = Filter(resource_dps[0], self._is_not_rogue_file)
    files = hint_sharding(hint_shuffling(files))
    return Mapper(files, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
) -> IterDataPipe[Dict[str, Any]]:
    """Parse the space-separated annotation file and emit prepared samples."""
    rows = CSVParser(resource_dps[0], delimiter=" ")
    rows = hint_sharding(hint_shuffling(rows))
    return Mapper(rows, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Parse the CSV annotation file and emit collated/decoded samples."""
    rows = CSVDictParser(resource_dps[0])
    rows = hint_shuffling(hint_sharding(rows))
    return Mapper(rows, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Join split/classification rows, segmentation masks, and images into samples."""
    images_dp, anns_dp = resource_dps

    images_dp = Filter(images_dp, self._filter_images)

    split_and_classification_dp, segmentations_dp = Demultiplexer(
        anns_dp,
        2,
        self._classify_anns,
        drop_none=True,
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    # Keep only the list file of the requested split, e.g. "trainval.txt".
    split_and_classification_dp = Filter(split_and_classification_dp, path_comparator("name", f"{config.split}.txt"))
    split_and_classification_dp = CSVDictParser(
        split_and_classification_dp, fieldnames=("image_id", "label", "species"), delimiter=" "
    )
    split_and_classification_dp = hint_sharding(split_and_classification_dp)
    split_and_classification_dp = hint_shuffling(split_and_classification_dp)

    segmentations_dp = Filter(segmentations_dp, self._filter_segmentations)

    # First join: classification row with its segmentation mask by image id.
    anns_dp = IterKeyZipper(
        split_and_classification_dp,
        segmentations_dp,
        key_fn=getitem("image_id"),
        ref_key_fn=path_accessor("stem"),
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    # Second join: the combined annotations with the actual image file.
    dp = IterKeyZipper(
        anns_dp,
        images_dp,
        key_fn=getitem(0, "image_id"),
        ref_key_fn=path_accessor("stem"),
        buffer_size=INFINITE_BUFFER_SIZE,
    )
    return Mapper(dp, functools.partial(self._collate_and_decode_sample, decoder=decoder))
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
) -> IterDataPipe[Dict[str, Any]]:
    """Unpickle the CIFAR batches of the configured split and prepare samples."""
    batches = Filter(resource_dps[0], functools.partial(self._is_data_file, split=config.split))
    pairs = CifarFileReader(Mapper(batches, self._unpickle), labels_key=self._LABELS_KEY)
    pairs = hint_shuffling(hint_sharding(pairs))
    return Mapper(pairs, self._prepare_sample)
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Unpickle the CIFAR batches of the configured split and collate/decode samples."""
    batches = Filter(resource_dps[0], functools.partial(self._is_data_file, split=config.split))
    pairs = CifarFileReader(Mapper(batches, self._unpickle), labels_key=self._LABELS_KEY)
    pairs = hint_shuffling(hint_sharding(pairs))
    return Mapper(pairs, functools.partial(self._collate_and_decode, decoder=decoder))
def from_data_folder(
    root: Union[str, pathlib.Path],
    *,
    valid_extensions: Optional[Collection[str]] = None,
    recursive: bool = True,
) -> Tuple[IterDataPipe, List[str]]:
    """Build a datapipe over an image-folder layout.

    Returns the sample datapipe together with the sorted category names, one
    per direct sub-directory of ``root``.
    """
    root = pathlib.Path(root).expanduser().resolve()

    # Every direct sub-directory of root is one category.
    categories = sorted(entry.name for entry in os.scandir(root) if entry.is_dir())

    if valid_extensions is None:
        # An empty string mask makes FileLister match every file.
        masks: Union[List[str], str] = ""
    else:
        masks = [f"*.{ext}" for ext in valid_extensions]

    dp: IterDataPipe = FileLister(str(root), recursive=recursive, masks=masks)
    dp = Filter(dp, functools.partial(_is_not_top_level_file, root=root))
    dp = hint_shuffling(hint_sharding(dp))
    dp = FileOpener(dp, mode="rb")
    samples = Mapper(dp, functools.partial(_prepare_sample, root=root, categories=categories))
    return samples, categories
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Decode the raw image/label files, zip them pairwise, and prepare samples."""
    images_dp, labels_dp = resource_dps
    start, stop = self.start_and_stop()

    def _decode(dp: IterDataPipe) -> IterDataPipe:
        # Both resources are compressed IDX files restricted to [start, stop).
        return MNISTFileReader(Decompressor(dp), start=start, stop=stop)

    paired = Zipper(_decode(images_dp), _decode(labels_dp))
    paired = hint_sharding(hint_shuffling(paired))
    return Mapper(paired, self._prepare_sample)
def _make_datapipe(
    self, resource_dps: List[IterDataPipe], *, config: DatasetConfig
) -> IterDataPipe[Dict[str, Any]]:
    """Decode the raw image/label files, zip them pairwise, and prepare samples."""
    images_dp, labels_dp = resource_dps
    start, stop = self.start_and_stop(config)

    def _decode(dp: IterDataPipe) -> IterDataPipe:
        # Both resources are compressed IDX files restricted to [start, stop).
        return MNISTFileReader(Decompressor(dp), start=start, stop=stop)

    paired = Zipper(_decode(images_dp), _decode(labels_dp))
    paired = hint_sharding(hint_shuffling(paired))
    return Mapper(paired, functools.partial(self._prepare_sample, config=config))
def _make_datapipe(
    self,
    resource_dps: List[IterDataPipe],
    *,
    config: DatasetConfig,
    decoder: Optional[Callable[[io.IOBase], torch.Tensor]],
) -> IterDataPipe[Dict[str, Any]]:
    """Zip the image ("x") and target ("y") HDF5 streams and collate/decode each pair."""
    images_dp, targets_dp = resource_dps
    paired = Zipper(PCAMH5Reader(images_dp, key="x"), PCAMH5Reader(targets_dp, key="y"))
    paired = hint_shuffling(hint_sharding(paired))
    return Mapper(paired, self._collate_and_decode)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Join split/classification rows, segmentation masks, and images into samples."""
    images_dp, anns_dp = resource_dps

    images_dp = Filter(images_dp, self._filter_images)

    split_and_classification_dp, segmentations_dp = Demultiplexer(
        anns_dp,
        2,
        self._classify_anns,
        drop_none=True,
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    # Keep only the list file of the requested split, e.g. "trainval.txt".
    split_and_classification_dp = Filter(split_and_classification_dp, path_comparator("name", f"{self._split}.txt"))
    split_and_classification_dp = CSVDictParser(
        split_and_classification_dp, fieldnames=("image_id", "label", "species"), delimiter=" "
    )
    split_and_classification_dp = hint_shuffling(split_and_classification_dp)
    split_and_classification_dp = hint_sharding(split_and_classification_dp)

    segmentations_dp = Filter(segmentations_dp, self._filter_segmentations)

    # First join: classification row with its segmentation mask by image id.
    anns_dp = IterKeyZipper(
        split_and_classification_dp,
        segmentations_dp,
        key_fn=getitem("image_id"),
        ref_key_fn=path_accessor("stem"),
        buffer_size=INFINITE_BUFFER_SIZE,
    )

    # Second join: the combined annotations with the actual image file.
    dp = IterKeyZipper(
        anns_dp,
        images_dp,
        key_fn=getitem(0, "image_id"),
        ref_key_fn=path_accessor("stem"),
        buffer_size=INFINITE_BUFFER_SIZE,
    )
    return Mapper(dp, self._prepare_sample)
def _datapipe(self, resource_dps: List[IterDataPipe]) -> IterDataPipe[Dict[str, Any]]:
    """Pair each .ppm image with its CSV annotation row and prepare the sample."""
    if self._split == "train":
        # The train resource is one archive containing both images and annotations.
        images_dp, ann_dp = Demultiplexer(
            resource_dps[0], 2, self._classify_train_archive, drop_none=True, buffer_size=INFINITE_BUFFER_SIZE
        )
    else:
        images_dp, ann_dp = resource_dps

    images_dp = Filter(images_dp, path_comparator("suffix", ".ppm"))

    # The order of the image files in the .zip archives perfectly match the order of the entries in the
    # (possibly concatenated) .csv files. So we're able to use Zipper here instead of a IterKeyZipper.
    ann_dp = CSVDictParser(ann_dp, delimiter=";")
    dp = Zipper(images_dp, ann_dp)

    dp = hint_shuffling(dp)
    dp = hint_sharding(dp)

    return Mapper(dp, self._prepare_sample)