def test_dataloader(self):
    """Build the test-set dataflow.

    Pipeline: CLAHE on the image component, nearest-neighbour resize +
    float cast, batching, then conversion to single-channel torch
    float tensors (a channel axis is inserted for image and label).

    Returns:
        A tensorpack dataflow yielding ``[image, label]`` tensor pairs.
    """
    ds = CustomDataSet(folder=self.hparams.data,
                       train_or_valid='test',
                       size=np.inf,
                       hparams=self.hparams)
    ds.reset_state()

    # Contrast equalisation is applied to the image component only.
    ds = AugmentImageComponent(ds, [
        imgaug.Albumentations(AB.CLAHE(p=1)),
    ], 0)

    # Deterministic resize + float conversion, image component only.
    resize_and_cast = [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_NEAREST),
        imgaug.ToFloat32(),
    ]
    ds = AugmentImageComponent(ds, resize_and_cast, 0)

    ds = BatchData(ds, self.hparams.batch, remainder=True)
    # ds = MultiProcessRunner(ds, num_proc=4, num_prefetch=16)
    ds = PrintData(ds)
    ds = MapData(
        ds,
        lambda dp: [torch.tensor(dp[0][:, np.newaxis, :, :]).float(),
                    torch.tensor(dp[1][:, np.newaxis, :, :]).float()])
    return ds
def train_generator(ds, shape_aug=None, input_aug=None, label_aug=None,
                    batch_size=16, nr_procs=8):
    """Assemble the segmentation training dataflow.

    Shape augmentations are applied jointly to input and label
    (components 0 and 1); input/label augmentations only touch their
    own component. The flow is fanned out over ``nr_procs`` ZMQ worker
    processes.
    """
    # Joint geometric augmentation of image and label.
    if shape_aug is not None:
        ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=True)
    # Photometric augmentation of the input only (index 0 of each yield).
    if input_aug is not None:
        ds = AugmentImageComponent(ds, input_aug, index=0, copy=False)
    # Augmentation of the label only (index 1 of each yield).
    if label_aug is not None:
        ds = AugmentImageComponent(ds, label_aug, index=1, copy=True)
    # ds = BatchDataByShape(ds, batch_size, idx=0)
    ds = PrefetchDataZMQ(ds, nr_procs)
    return ds
def train_generator_class(ds, shape_aug=None, input_aug=None, batch_size=16,
                          nr_procs=8):
    """Assemble the classification training dataflow.

    Only the image (component 0) is augmented; the flow is fanned out
    over ``nr_procs`` ZMQ worker processes.
    """
    # Geometric/shape augmentation of the image.
    if shape_aug is not None:
        ds = AugmentImageComponent(ds, shape_aug, index=0, copy=True)
    # Photometric/input augmentation of the image.
    if input_aug is not None:
        ds = AugmentImageComponent(ds, input_aug, index=0, copy=False)
    # ds = BatchDataByShape(ds, batch_size, idx=0)
    ds = PrefetchDataZMQ(ds, nr_procs)
    return ds
def valid_generator(ds, shape_aug=None, input_aug=None, label_aug=None,
                    batch_size=16, nr_procs=1):
    """Assemble the segmentation validation dataflow.

    Optional augmentations mirror :func:`train_generator`; the result
    is cached in memory so later epochs re-read nothing from disk.
    """
    # Joint geometric augmentation of input (0) and label (1).
    if shape_aug is not None:
        ds = AugmentImageComponents(ds, shape_aug, (0, 1), copy=True)
    # Input-only augmentation.
    if input_aug is not None:
        ds = AugmentImageComponent(ds, input_aug, index=0, copy=False)
    # Label-only augmentation.
    if label_aug is not None:
        ds = AugmentImageComponent(ds, label_aug, index=1, copy=True)
    # ds = BatchData(ds, batch_size, remainder=True)
    ds = CacheData(ds)  # cache all inference images
    return ds
def valid_generator_class(ds, shape_aug=None, input_aug=None, batch_size=16,
                          nr_procs=1):
    """Assemble the classification validation dataflow.

    Only the image (component 0) is augmented; the result is cached in
    memory so later epochs re-read nothing from disk.
    """
    # Geometric/shape augmentation of the image.
    if shape_aug is not None:
        ds = AugmentImageComponent(ds, shape_aug, index=0, copy=True)
    # Photometric/input augmentation of the image.
    if input_aug is not None:
        ds = AugmentImageComponent(ds, input_aug, index=0, copy=False)
    # ds = BatchData(ds, batch_size, remainder=True)
    ds = CacheData(ds)  # cache all inference images
    return ds
def val_dataloader(self):
    """Build the validation dataflow.

    Pipeline: area-interpolated resize + float cast on the image,
    batching, 4-process prefetch, then NCHW torch tensors.

    Returns:
        A tensorpack dataflow yielding ``[image, label]`` tensor pairs.
    """
    ds = MultiLabelDataset(
        folder=self.hparams.data,
        is_train='valid',
        fname='covid_test_v5.csv',
        types=self.hparams.types,
        pathology=self.hparams.pathology,
        resize=int(self.hparams.shape),
    )
    ds.reset_state()

    ds = AugmentImageComponent(ds, [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_AREA),
        imgaug.ToFloat32(),
    ], 0)

    ds = BatchData(ds, self.hparams.batch, remainder=True)
    ds = MultiProcessRunner(ds, num_proc=4, num_prefetch=16)
    ds = PrintData(ds)
    # NHWC -> NCHW for torch.
    ds = MapData(
        ds,
        lambda dp: [torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                    torch.tensor(dp[1]).float()])
    return ds
def val_dataloader(self):
    """Build the validation dataflow.

    Pipeline: CLAHE on the image only, joint resize + float cast of
    image AND label (so geometry stays aligned), batching, prefetch,
    then single-channel torch float tensors.

    Returns:
        A tensorpack dataflow yielding ``[image, label]`` tensor pairs.
    """
    ds = CustomDataSet(folder=self.hparams.data,
                       train_or_valid='valid',
                       size=np.inf,
                       hparams=self.hparams)
    ds.reset_state()

    # Histogram equalisation touches the image component only.
    ds = AugmentImageComponent(ds, [
        imgaug.Albumentations(
            AB.CLAHE(tile_grid_size=(32, 32), always_apply=True, p=1),
        ),
    ], 0)

    # Resize + float cast must hit image and label identically.
    joint_aug = [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_NEAREST),
        imgaug.ToFloat32(),
    ]
    ds = AugmentImageComponents(ds, joint_aug, [0, 1])

    ds = BatchData(ds, self.hparams.batch, remainder=True)
    ds = MultiProcessRunner(ds, num_proc=4, num_prefetch=16)
    ds = PrintData(ds)
    ds = MapData(
        ds,
        lambda dp: [torch.tensor(dp[0][:, np.newaxis, :, :]).float(),
                    torch.tensor(dp[1][:, np.newaxis, :, :]).float()])
    return ds
def test_dataloader(self):
    """Build the test-set dataflow.

    Pipeline: smallest-max-size resize, gray->RGB conversion, CLAHE,
    float cast, batching, then NCHW torch tensors.

    Returns:
        A tensorpack dataflow yielding ``[image, label]`` tensor pairs.
    """
    ds_test = MultiLabelDataset(folder=self.hparams.data_path,
                                is_train='test',
                                fname='test.csv',
                                types=self.hparams.types,
                                pathology=self.hparams.pathology,
                                resize=int(self.hparams.shape))
    ds_test.reset_state()
    ag_test = [
        imgaug.Albumentations(
            AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
        # BUG FIX: was `iimgaug.ColorSpace(...)` — a typo'd module name
        # that raised NameError the first time this loader was used.
        imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
        imgaug.Albumentations(AB.CLAHE(p=1)),
        imgaug.ToFloat32(),
    ]
    ds_test = AugmentImageComponent(ds_test, ag_test, 0)
    ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
    # ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
    ds_test = PrintData(ds_test)
    # NHWC -> NCHW for torch.
    ds_test = MapData(ds_test,
                      lambda dp: [torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                                  torch.tensor(dp[1]).float()])
    return ds_test
def get_resnet_train_dataflow():
    """Training dataflow for the ship/no-ship ResNet classifier.

    Loads image records, decodes + resizes + strong-augments each image,
    applies the shared ResNet augmentors, batches, and prefetches over
    6 ZMQ processes.
    """
    # Valid training images should have at least one fg box.
    # But this filter shall not be applied for testing.
    records = list(ResnetDetection.load_many(config.BASEDIR, config.TRAIN_DATASET))
    ds = DataFromList(records, shuffle=True)
    augmentors = get_resnet_augmentor()

    def preprocess(record):
        path, _fname, label = record['image_data'], record['id'], record['with_ship']
        image = cv2.imread(path)
        # ============Aug================
        image = cv2.resize(image, (config.RESNET_SIZE, config.RESNET_SIZE))
        image = strong_aug()(image=image)['image']
        # im, multi_mask = do_flip_transpose2(im, multi_mask, type=random.randint(0,7))
        # ===============================
        return [image, label]

    ds = MapData(ds, preprocess)
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    ds = BatchData(ds, config.RESNET_BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds
def get_downsampled_imagenet_augmented_data(subset, options,
                                            do_multiprocess=True,
                                            do_validation=False,
                                            shuffle=None):
    """Dataflow for downsampled-ImageNet (``imagenet<size>`` datasets).

    Training applies pad/crop/flip plus mean normalisation; evaluation
    only normalises. Batch size is divided across GPUs.
    """
    isTrain = subset == 'train' and do_multiprocess
    shuffle = isTrain if shuffle is None else shuffle

    # The input resolution is encoded in the dataset name, e.g. imagenet32.
    match = re.search(r'^imagenet([0-9]*)$', options.ds_name)
    input_size = int(match.group(1))
    ds = DownsampledImageNet(_data_batch_dir(options.data_dir, input_size),
                             subset, shuffle, input_size,
                             do_validation=do_validation)

    pp_mean = ds.mean_img
    paste_size = ds.input_size * 5 // 4
    crop_size = ds.input_size
    normalize = imgaug.MapImage(lambda x: (x - pp_mean) / 128.0)
    if isTrain:
        augmentors = [
            imgaug.CenterPaste((paste_size, paste_size)),
            imgaug.RandomCrop((crop_size, crop_size)),
            imgaug.Flip(horiz=True),
            normalize,
        ]
    else:
        augmentors = [normalize]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, options.batch_size // options.nr_gpu,
                   remainder=not isTrain)
    if do_multiprocess:
        ds = PrefetchData(ds, 4, 2)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    num_workers = min(30, multiprocessing.cpu_count())
    meta_dir = './ilsvrc_metadata'

    if name == 'train':
        # Training: decode up front, augment, fan out, batch.
        flow = dataset.ILSVRC12(datadir, name, meta_dir=meta_dir, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        flow = PrefetchDataZMQ(flow, num_workers)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: read filenames, decode + augment in worker threads.
    flow = dataset.ILSVRC12Files(datadir, name, meta_dir=meta_dir, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def decode_and_augment(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        return aug.augment(im), cls

    flow = MultiThreadMapData(flow, num_workers, decode_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def test_dataloader(self):
    """Build the test dataflow over the validation CSV split.

    Pipeline: area-interpolated resize + float cast, batching,
    4-process prefetch, then NCHW torch tensors.
    """
    ds = MultiLabelDataset(folder=self.hparams.data,
                           is_train='valid',
                           fname='covid_test_v5.csv',
                           types=self.hparams.types,
                           pathology=self.hparams.pathology,
                           resize=int(self.hparams.shape),
                           fold_idx=None,
                           n_folds=1)
    ds.reset_state()

    ds = AugmentImageComponent(ds, [
        imgaug.Resize(self.hparams.shape, interp=cv2.INTER_AREA),
        imgaug.ToFloat32(),
    ], 0)

    ds = BatchData(ds, self.hparams.batch, remainder=True)
    ds = MultiProcessRunner(ds, num_proc=4, num_prefetch=16)
    ds = PrintData(ds)
    # NHWC -> NCHW for torch.
    ds = MapData(
        ds,
        lambda dp: [torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                    torch.tensor(dp[1]).float()])
    return ds
def get_iNaturalist_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # Assume hyperthreading: use half the logical CPUs, capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if name == 'train':
        flow = dataset.iNaturalist(datadir, name, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: read filenames, decode + augment in worker threads.
    flow = dataset.iNaturalistFiles(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def decode_and_augment(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        return aug.augment(im), cls

    flow = MultiThreadMapData(flow, parallel, decode_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def train_dataloader(self):
    """Build the training dataflow.

    Pipeline: gray->RGB, random blur / CLAHE / affine / rotation / crop
    augmentation, RGB->gray, float cast, batching, 4-process prefetch,
    then NCHW torch tensors.

    Returns:
        A tensorpack dataflow yielding ``[image, label]`` tensor pairs.
    """
    ds_train = MultiLabelDataset(folder=self.hparams.data,
                                 is_train='train',
                                 fname='covid_train_v5.csv',
                                 types=self.hparams.types,
                                 pathology=self.hparams.pathology,
                                 resize=int(self.hparams.shape),
                                 balancing=None)
    ds_train.reset_state()
    ag_train = [
        # imgaug.Albumentations(
        #     AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
        # Augmentors below operate on RGB, so lift grayscale to 3 channels.
        imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
        # imgaug.Affine(shear=10),
        # Pick exactly one blur variant per sample.
        imgaug.RandomChooseAug([
            imgaug.Albumentations(AB.Blur(blur_limit=4, p=0.25)),
            imgaug.Albumentations(AB.MotionBlur(blur_limit=4, p=0.25)),
            imgaug.Albumentations(AB.MedianBlur(blur_limit=4, p=0.25)),
        ]),
        imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), p=0.5)),
        # Shear, translation and scale in a random order.
        imgaug.RandomOrderAug([
            imgaug.Affine(shear=10,
                          border=cv2.BORDER_CONSTANT,
                          interp=cv2.INTER_AREA),
            imgaug.Affine(translate_frac=(0.01, 0.02),
                          border=cv2.BORDER_CONSTANT,
                          interp=cv2.INTER_AREA),
            imgaug.Affine(scale=(0.5, 1.0),
                          border=cv2.BORDER_CONSTANT,
                          interp=cv2.INTER_AREA),
        ]),
        imgaug.RotationAndCropValid(max_deg=10, interp=cv2.INTER_AREA),
        imgaug.GoogleNetRandomCropAndResize(crop_area_fraction=(0.8, 1.0),
                                            aspect_ratio_range=(0.8, 1.2),
                                            interp=cv2.INTER_AREA,
                                            target_shape=self.hparams.shape),
        # Back to single-channel for the model.
        imgaug.ColorSpace(mode=cv2.COLOR_RGB2GRAY),
        imgaug.ToFloat32(),
    ]
    ds_train = AugmentImageComponent(ds_train, ag_train, 0)
    # Label smoothing
    # NOTE(review): ag_label is defined but its application below is
    # commented out, so labels are currently NOT smoothed — confirm intent.
    ag_label = [
        imgaug.BrightnessScale((0.8, 1.2), clip=False),
    ]
    # ds_train = AugmentImageComponent(ds_train, ag_label, 1)
    ds_train = BatchData(ds_train, self.hparams.batch, remainder=True)
    if self.hparams.debug:
        # Tiny fixed-size flow for quick debugging runs.
        ds_train = FixedSizeData(ds_train, 2)
    ds_train = MultiProcessRunner(ds_train, num_proc=4, num_prefetch=16)
    ds_train = PrintData(ds_train)
    # NHWC -> NCHW for torch.
    ds_train = MapData(ds_train,
                       lambda dp: [
                           torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                           torch.tensor(dp[1]).float()
                       ])
    return ds_train
def get_data(batch, augmentors):
    """Build the ILSVRC12 training dataflow.

    Sec 3, Remark 4: the paper uses a single per-epoch shuffle divided
    amongst all k workers; we deviate since it does not seem to make a
    difference.
    """
    flow = dataset.ILSVRC12(args.data, 'train', shuffle=True)
    flow = AugmentImageComponent(flow, augmentors, copy=False)
    flow = BatchData(flow, batch, remainder=False)
    workers = min(50, mp.cpu_count())
    return PrefetchDataZMQ(flow, workers)
def get_imagenet_dataflow(datadir, name, batch_size, augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    For val/test, images are read out of per-class ``<dir>.zip``
    archives instead of unpacked files.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            jpeg_filename = os.path.basename(fname)
            jpeg_dirname = os.path.basename(os.path.dirname(fname))
            zip_filepath = os.path.dirname(fname) + '.zip'
            # BUG FIXES vs. original:
            #  * the ZipFile was never closed — one leaked handle per
            #    datapoint; use a context manager.
            #  * zip member names always use '/', so os.path.join would
            #    produce a wrong key on Windows.
            with zipfile.ZipFile(zip_filepath, 'r') as zf:
                raw = zf.read('{}/{}'.format(jpeg_dirname, jpeg_filename))
            # np.fromstring is deprecated for binary data; frombuffer is
            # the zero-copy replacement.
            compress_jpeg = np.frombuffer(raw, dtype=np.uint8)
            im = cv2.imdecode(compress_jpeg, cv2.IMREAD_COLOR)
            # im = cv2.imread(fname, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_data(batch, augmentors, workers):
    """Build the ILSVRC12 training dataflow.

    Sec 3, Remark 4: the paper uses a single per-epoch shuffle divided
    amongst all k workers. NOTE: we deviate, which makes little
    difference.
    """
    flow = dataset.ILSVRC12(args.data, 'train', shuffle=True)
    flow = AugmentImageComponent(flow, augmentors, copy=False)
    flow = PrefetchDataZMQ(flow, workers)
    return BatchData(flow, batch, remainder=False)
def get_imagenet_dataflow(datadir, name, batch_size,
                          augmentors=None, parallel=None):
    """
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`

    Returns:
        A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    is_train = name == 'train'
    if augmentors is None:
        augmentors = fbresnet_augmentor(is_train)
    assert isinstance(augmentors, list)
    if parallel is None:
        # Assume hyperthreading: use half the logical CPUs, capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if is_train:
        flow = dataset.ILSVRC12(datadir, name, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        flow = MultiProcessRunnerZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: read filenames, decode + augment in worker threads.
    flow = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def load_and_augment(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        return aug.augment(im), cls

    flow = MultiThreadMapData(flow, parallel, load_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return MultiProcessRunnerZMQ(flow, 1)
def get_imagenet_dataflow(datadir, name, batch_size,
                          augmentors, parallel=None):
    # Build the ImageNet dataflow.
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        # Default worker count: half the logical CPUs (assuming
        # hyperthreading), capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading
    if isTrain:
        # Training: load decoded images, augment in-process, fan out over
        # `parallel` ZMQ worker processes, then batch.
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        # Apply the image augmentors to the image component.
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            # Too few workers: warn that the dataflow may starve the GPU.
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)  # efficient multi-process pipeline
        ds = BatchData(ds, batch_size, remainder=False)  # drop the last partial batch
    else:
        # Evaluation: same as ILSVRC12, but yields filenames instead of
        # decoded arrays; decode + augment lazily in worker threads.
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR
                            )  # cv2.IMREAD_COLOR: load as a 3-channel color image, ignoring alpha
            im = aug.augment(im)  # augment the image
            return im, cls
        ds = MultiThreadMapData(ds, parallel, mapf,
                                buffer_size=2000,
                                strict=True)  # thread-parallel decode/augment
        ds = BatchData(ds, batch_size, remainder=True)  # keep the last partial batch
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 6)
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # Fallback image in case every read attempt fails.
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            # Retry transient read failures (e.g. flaky network FS).
            for _ in range(30):
                try:
                    im = cv2.imread(fname, cv2.IMREAD_COLOR)
                    im = aug.augment(im)
                    break
                except Exception as e:
                    # BUG FIX: extra positional args to logger.warning are
                    # %-format arguments; the old call
                    # `logger.warning(str(e), 'file=', fname)` raised a
                    # formatting error inside logging. Use a format string.
                    logger.warning("%s file=%s", e, fname)
                    time.sleep(1)
            return im, cls
        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
def generate_dataflow(dataset, option):
    """Build a batched, prefetched dataflow from ``dataset``.

    Args:
        dataset: the raw dataset handed to ``DataFlow``.
        option (dict): must contain 'number_of_cores', 'augmentors',
            'num_prefetch_for_dataset', 'batch_size', 'remainder' and
            'num_prefetch_for_batch'.

    Note:
        Mutates ``option['number_of_cores']`` in place when it is -1,
        so callers observe the resolved core count.
    """
    if option['number_of_cores'] == -1:
        option['number_of_cores'] = mp.cpu_count()

    ds = DataFlow(dataset, option)
    ds = AugmentImageComponent(ds, option['augmentors'], copy=False)
    if option['number_of_cores'] < 16:
        # BUG FIX: the message previously opened with '[!}' (mismatched
        # bracket typo).
        print('[!] Warning = DataFlow may become the bottleneck when too few processes are used.')
    ds = PrefetchData(ds, option['num_prefetch_for_dataset'], option['number_of_cores'])
    ds = BatchData(ds, option['batch_size'], remainder=option['remainder'])
    ds = PrefetchData(ds, option['num_prefetch_for_batch'], 2)
    return ds
def get_train_dataflow(datadir, batch, augmentors=None):
    """Build the ILSVRC12 training dataflow.

    Sec 3, Remark 4: the paper uses a single per-epoch shuffle divided
    amongst all k workers. NOTE: we deviate — each machine shuffles
    independently, which makes some differences.
    """
    if augmentors is None:
        augmentors = fbresnet_augmentor(True)
    flow = dataset.ILSVRC12(datadir, 'train', shuffle=True)
    flow = AugmentImageComponent(flow, augmentors, copy=False)
    flow = BatchData(flow, batch, remainder=False)
    workers = min(50, mp.cpu_count())
    return MultiProcessRunnerZMQ(flow, workers)
def get_imagenet_dataflow(datadir, is_train, batch_size, augmentors,
                          parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # Assume hyperthreading: use half the logical CPUs, capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if is_train:
        flow = dataset.ILSVRC12(datadir, "train", shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logging.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: read filenames, decode + augment in worker threads.
    flow = dataset.ILSVRC12Files(datadir, "val", shuffle=False)
    aug = imgaug.AugmentorList(augmentors)

    def mapf(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        im = np.flip(im, axis=2)  # reverse the channel order (BGR <-> RGB)
        # print("fname={}".format(fname))
        return aug.augment(im), cls

    flow = MultiThreadMapData(flow, parallel, mapf, buffer_size=2000, strict=True)
    # flow = MapData(flow, mapf)
    flow = BatchData(flow, batch_size, remainder=True)
    flow = PrefetchDataZMQ(flow, 1)
    # flow = PrefetchData(flow, 1)
    return flow
def get_data(self, name, num_gpu):
    """Per-GPU dataflow for the given split ('train', 'val' or 'test')."""
    gpu_batch = self.batch_size // num_gpu
    assert name in ['train', 'val', 'test']
    is_train = name == 'train'
    augmentors = fbresnet_augmentor(is_train)
    assert isinstance(augmentors, list)
    # Assume hyperthreading: half the logical CPUs, capped at 40.
    parallel = min(40, multiprocessing.cpu_count() // 2)

    if is_train:
        ds = dataset.ILSVRC12(self.datadir, name,
                              shuffle=True, dir_structure='train')
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        ds = MultiProcessRunnerZMQ(ds, parallel)
        ds = BatchData(ds, gpu_batch, remainder=False)
        # ds = QueueInput(ds)
    else:
        ds = dataset.ILSVRC12Files(self.datadir, name,
                                   shuffle=False, dir_structure='train')
        aug = imgaug.AugmentorList(augmentors)

        def load_and_augment(dp):
            fname, cls = dp
            im = cv2.imread(fname, cv2.IMREAD_COLOR)
            return aug.augment(im), cls

        ds = MultiThreadMapData(ds, parallel, load_and_augment,
                                buffer_size=2000, strict=True)
        ds = BatchData(ds, gpu_batch, remainder=True)
        ds = MultiProcessRunnerZMQ(ds, 1)
        if num_gpu == 1:
            ds = QueueInput(ds)
    return ds
def get_data(train_or_test, option):
    """Build the dataflow for tiny-ImageNet (64px) or ILSVRC12 (224px).

    Args:
        train_or_test (str): 'train' or 'test'.
        option: config object with ``data``, ``final_size`` (64 or 224)
            and ``batch`` attributes.

    Raises:
        ValueError: if ``option.final_size`` is not 64 or 224.
    """
    isTrain = train_or_test == 'train'
    datadir = option.data
    if option.final_size == 64:
        ds = dataset.tinyImagenetHaS(datadir, train_or_test, 'all', shuffle=isTrain)
    elif option.final_size == 224:
        ds = dataset.ILSVRC12(datadir, train_or_test, shuffle=isTrain)
    else:
        # BUG FIX: previously `ds` stayed unbound for any other size and
        # the function crashed later with a confusing NameError.
        raise ValueError('unsupported final_size: {}'.format(option.final_size))
    augmentors = fbresnet_augmentor(isTrain, option=option)
    augmentors.append(imgaug.ToUint8())

    ds = AugmentImageComponent(ds, augmentors, copy=False)
    if isTrain:
        ds = PrefetchDataZMQ(ds, min(25, multiprocessing.cpu_count()))
    ds = BatchData(ds, int(option.batch), remainder=not isTrain)
    return ds
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ Load a Single-File LMDB (Sequential Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`
    Returns:
        A LMDBData which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)
    if parallel is None:
        # Assume hyperthreading: half the logical CPUs, capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if isTrain:
        # Shuffle within a large in-memory window, decode, then augment.
        ds = LocallyShuffleData(ds, 50000)
        ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = BatchData(ds, batch_size, remainder=False, use_list=True)
        ds = MultiProcessRunnerZMQ(ds, parallel)
        return ds

    def decode_and_augment(data):
        im, label = data
        im = cv2.imdecode(im, cv2.IMREAD_COLOR)
        return aug.augment(im), label

    ds = MultiProcessMapDataZMQ(ds, parallel, decode_and_augment,
                                buffer_size=2000, strict=True)
    ds = BatchData(ds, batch_size, remainder=True, use_list=True)
    return ds
def get_cifar_augmented_data(subset, options, do_multiprocess=True,
                             do_validation=False, shuffle=None):
    """CIFAR-10/100 dataflow.

    Training uses the standard pad/crop/flip recipe plus cut-out and
    per-pixel mean normalisation; evaluation only normalises. Batch
    size is divided across GPUs.
    """
    isTrain = subset == 'train' and do_multiprocess
    shuffle = isTrain if shuffle is None else shuffle

    if options.num_classes == 10 and options.ds_name == 'cifar10':
        ds = dataset.Cifar10(subset, shuffle=shuffle, do_validation=do_validation)
        cutout_length, n_holes = 16, 1
    elif options.num_classes == 100 and options.ds_name == 'cifar100':
        ds = dataset.Cifar100(subset, shuffle=shuffle, do_validation=do_validation)
        cutout_length, n_holes = 8, 1
    else:
        raise ValueError(
            'Number of classes must be set to 10(default) or 100 for CIFAR')
    logger.info('{} set has n_samples: {}'.format(subset, len(ds.data)))

    pp_mean = ds.get_per_pixel_mean()
    normalize = imgaug.MapImage(lambda x: (x - pp_mean) / 128.0)
    if isTrain:
        logger.info('Will do cut-out with length={} n_holes={}'.format(
            cutout_length, n_holes))
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            normalize,
            Cutout(length=cutout_length, n_holes=n_holes),
        ]
    else:
        augmentors = [normalize]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, options.batch_size // options.nr_gpu,
                   remainder=not isTrain)
    if do_multiprocess:
        ds = PrefetchData(ds, 3, 2)
    return ds
def get_random_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """ DataFlow data (Random Read)
    Args:
        augmentors (list[imgaug.Augmentor]): Defaults to `fbresnet_augmentor(isTrain)`
    Returns:
        A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    aug = imgaug.AugmentorList(augmentors)
    if parallel is None:
        # Assume hyperthreading: half the logical CPUs, capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if isTrain:
        ds = AugmentImageComponent(ds, aug, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = MultiProcessRunnerZMQ(ds, parallel)
        return BatchData(ds, batch_size, remainder=False)

    def load_and_augment(dp):
        fname, cls = dp
        im = cv2.imread(fname, cv2.IMREAD_COLOR)
        return aug.augment(im), cls

    ds = MultiThreadMapData(ds, parallel, load_and_augment,
                            buffer_size=2000, strict=True)
    ds = BatchData(ds, batch_size, remainder=True)
    return MultiProcessRunnerZMQ(ds, 1)
def get_data(self, train_or_test):
    """CIFAR-10 dataflow: pad/crop/flip training augmentation and
    per-pixel mean subtraction."""
    isTrain = train_or_test == 'train'
    ds = dataset.Cifar10(train_or_test, dir='.')
    pp_mean = ds.get_per_pixel_mean()
    subtract_mean = imgaug.MapImage(lambda x: x - pp_mean)
    if isTrain:
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            # imgaug.Brightness(20),
            # imgaug.Contrast((0.6,1.4)),
            subtract_mean,
        ]
    else:
        augmentors = [subtract_mean]
    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, self.batch_size, remainder=not isTrain)
    if isTrain:
        ds = PrefetchData(ds, 3, 2)
    return ds
def get_inat_augmented_data(subset, options,
                            lmdb_dir=None, year='2018',
                            do_multiprocess=True, do_validation=False,
                            is_train=None, shuffle=None, n_allow=None):
    """Build an iNaturalist dataflow from an LMDB.

    Args:
        subset (str): split name, e.g. 'train' or 'val'.
        options: config with ``input_size``, ``data_dir``, ``batch_size``,
            ``nr_gpu``.
        lmdb_dir (str): sub-directory of ``options.data_dir`` holding the
            LMDBs; defaults to the 2018 'inat_lmdb' layout.
        year (str): dataset year used in the LMDB filename.
        do_multiprocess (bool): enable ZMQ prefetching.
        do_validation (bool): use the cross-validation train/val LMDBs.
        is_train / shuffle: explicit overrides of the split-derived defaults.
        n_allow: optional per-class cap encoded in the LMDB filename.

    Returns:
        A batched dataflow of augmented BGR uint8 images and labels.
    """
    input_size = options.input_size if options.input_size else 224
    isTrain = is_train if is_train is not None else (subset == 'train' and do_multiprocess)
    shuffle = shuffle if shuffle is not None else isTrain
    postfix = "" if n_allow is None else "_allow_{}".format(n_allow)
    #TODO: Parameterize the cv split to be consider
    #Currently hardcoding to 1
    cv = 1
    # When do_validation is True it will expect *cv_train and *cv_val lmdbs
    # Currently the cv_train split is always used
    if isTrain:
        postfix += '_cv_train_{}'.format(cv)
    elif do_validation:
        subset = 'train'
        postfix += '_cv_val_{}'.format(cv)
    # BUG FIX: comparison was `lmdb_dir == None` (PEP 8 E711); identity
    # comparison with None is the correct idiom.
    if lmdb_dir is None:
        lmdb_path = os.path.join(options.data_dir, 'inat_lmdb',
                                 'inat2018_{}{}.lmdb'.format(subset, postfix))
    else:
        lmdb_path = os.path.join(
            options.data_dir, lmdb_dir,
            'inat{}_{}{}.lmdb'.format(year, subset, postfix))

    ds = LMDBData(lmdb_path, shuffle=False)
    if shuffle:
        ds = LocallyShuffleData(ds, 1024 * 80)  # This is 64G~80G in memory images
        ds = PrefetchData(ds, 1024 * 8, 1)  # prefetch around 8 G
    ds = LMDBDataPoint(ds)
    # Decode JPEG bytes into BGR uint8 arrays (component 0).
    ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
    if isTrain:
        class Resize(imgaug.ImageAugmentor):
            """
            crop 8%~100% of the original image
            See `Going Deeper with Convolutions` by Google.
            """
            def _augment(self, img, _):
                h, w = img.shape[:2]
                area = h * w
                for _ in range(10):
                    targetArea = self.rng.uniform(0.08, 1.0) * area
                    aspectR = self.rng.uniform(0.75, 1.333)
                    ww = int(np.sqrt(targetArea * aspectR))
                    hh = int(np.sqrt(targetArea / aspectR))
                    if self.rng.uniform() < 0.5:
                        ww, hh = hh, ww
                    if hh <= h and ww <= w:
                        x1 = 0 if w == ww else self.rng.randint(0, w - ww)
                        y1 = 0 if h == hh else self.rng.randint(0, h - hh)
                        out = img[y1:y1 + hh, x1:x1 + ww]
                        out = cv2.resize(out, (input_size, input_size),
                                         interpolation=cv2.INTER_CUBIC)
                        return out
                # Fallback after 10 failed attempts: plain resize.
                out = cv2.resize(img, (input_size, input_size),
                                 interpolation=cv2.INTER_CUBIC)
                return out

        augmentors = [
            Resize(),
            imgaug.RandomOrderAug([
                imgaug.Brightness(30, clip=False),
                imgaug.Contrast((0.8, 1.2), clip=False),
                imgaug.Saturation(0.4),
                # rgb-bgr conversion
                imgaug.Lighting(0.1,
                                eigval=[0.2175, 0.0188, 0.0045][::-1],
                                eigvec=np.array([[-0.5675, 0.7192, 0.4009],
                                                 [-0.5808, -0.0045, -0.8140],
                                                 [-0.5836, -0.6948, 0.4203]],
                                                dtype='float32')[::-1, ::-1])
            ]),
            imgaug.Clip(),
            imgaug.Flip(horiz=True),
            imgaug.ToUint8()
        ]
    else:
        augmentors = [
            imgaug.ResizeShortestEdge(256),
            imgaug.CenterCrop((input_size, input_size)),
            imgaug.ToUint8()
        ]
    ds = AugmentImageComponent(ds, augmentors, copy=False)
    if do_multiprocess:
        ds = PrefetchDataZMQ(ds, min(24, multiprocessing.cpu_count()))
    ds = BatchData(ds, options.batch_size // options.nr_gpu,
                   remainder=not isTrain)
    return ds