def get_imagenet_dataflow(datadir, name, batch_size, augmentors):
    """
    Build the batched ILSVRC12 dataflow for one split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.

    Returns:
        The final dataflow. For 'train' the source is created with
        `shuffle=True` and batched with `remainder=False`; for the other
        splits it is created with `shuffle=False` and batched with
        `remainder=True`.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    num_workers = min(30, multiprocessing.cpu_count())
    meta_dir = './ilsvrc_metadata'

    if name == 'train':
        # Training: decode + augment inside the dataflow, parallelised by processes.
        flow = dataset.ILSVRC12(datadir, name, meta_dir=meta_dir, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        flow = PrefetchDataZMQ(flow, num_workers)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: iterate over (filename, label) pairs and load them in threads.
    file_flow = dataset.ILSVRC12Files(datadir, name, meta_dir=meta_dir, shuffle=False)
    pipeline = imgaug.AugmentorList(augmentors)

    def read_and_augment(dp):
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return pipeline.augment(image), label

    flow = MultiThreadMapData(file_flow, num_workers, read_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_imagenet_dataflow(
        datadir, name, batch_size,
        augmentors=None, parallel=None):
    """
    Build the batched ImageNet dataflow from image file names.

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): Defaults to
            `fbresnet_augmentor(isTrain)`.
        parallel (int): number of workers. Defaults to
            `min(40, cpu_count // 2)`.

    Returns:
        A DataFlow which produces BGR images and labels.
    """
    assert name in ['train', 'val', 'test']
    is_training = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(is_training)
    assert isinstance(augmentors, list)
    pipeline = AugmentorList(augmentors)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    def load_sample(dp):
        path, label = dp
        image = cv2.imread(path)
        return pipeline.augment(image), label

    if is_training:
        # Training maps in worker processes over a shuffled file list.
        flow = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        flow = MultiProcessMapDataZMQ(flow, parallel, load_sample, buffer_size=2000)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation maps in threads over the unshuffled file list.
    flow = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    flow = MultiThreadMapData(flow, parallel, load_sample, buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_iNaturalist_dataflow(
        datadir, name, batch_size,
        augmentors, parallel=None):
    """
    Build the batched iNaturalist dataflow for one split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.
        parallel (int): number of workers. Defaults to
            `min(40, cpu_count // 2)`.

    Returns:
        The final dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    if name == 'train':
        flow = dataset.iNaturalist(datadir, name, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn("DataFlow may become the bottleneck when too few processes are used.")
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Non-training splits read (filename, label) pairs and decode them in threads.
    file_flow = dataset.iNaturalistFiles(datadir, name, shuffle=False)
    pipeline = imgaug.AugmentorList(augmentors)

    def read_and_augment(dp):
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return pipeline.augment(image), label

    flow = MultiThreadMapData(file_flow, parallel, read_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_data():
    """
    Build the batched MNIST train and test dataflows.

    Every datapoint is mapped to `[image, onehot]`: the image gains a
    trailing axis and the label becomes row `label` of a 10x10 identity
    matrix.

    Returns:
        tuple: `(train, test)`, batched with sizes 128 and 256 respectively.
    """
    def to_image_and_onehot(dp):
        image, label = dp[0], dp[1]
        return [image[:, :, None], np.eye(10)[label]]

    def batched(split, batch_size):
        return BatchData(MapData(dataset.Mnist(split), to_image_and_onehot), batch_size)

    train = batched('train', 128)
    test = batched('test', 256)
    return train, test
def get_data():
    """
    Build the batched MNIST train and test dataflows.

    Every datapoint is mapped to `[image, onehot]`: the image gains a
    trailing axis and the label becomes a length-10 int32 vector with a
    single 1 at index `label`.

    Returns:
        tuple: `(train, test)`, batched with sizes 128 and 256 respectively.
    """
    def to_image_and_onehot(dp):
        image, label = dp[0], dp[1]
        image = image[:, :, None]
        encoded = np.zeros(10, dtype='int32')
        encoded[label] = 1
        return [image, encoded]

    def batched(split, batch_size):
        return BatchData(MapData(dataset.Mnist(split), to_image_and_onehot), batch_size)

    train = batched('train', 128)
    test = batched('test', 256)
    return train, test
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Build the batched ImageNet dataflow; evaluation images are read from
    per-class zip archives.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    For the non-training splits each file name `<dir>/<class>/<img>` is
    resolved to member `<class>/<img>` inside the archive `<dir>/<class>.zip`.

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.
        parallel (int): number of workers. Defaults to `min(40, cpu_count)`.

    Returns:
        The final dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchDataZMQ(ds, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            class_dir = os.path.dirname(fname)
            # Zip member names always use '/' regardless of the host OS,
            # so the member path is not built with os.path.join.
            member = os.path.basename(class_dir) + '/' + os.path.basename(fname)
            # The context manager closes the archive after every read; the
            # mapper runs once per image, so an unclosed handle would leak.
            with zipfile.ZipFile(class_dir + '.zip', 'r') as archive:
                jpeg_bytes = archive.read(member)
            # np.frombuffer replaces the deprecated binary mode of np.fromstring.
            encoded = np.frombuffer(jpeg_bytes, dtype=np.uint8)
            im = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
            im = aug.augment(im)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchDataZMQ(ds, 1)
    return ds
def get_imagenet_dataflow(datadir, name, batch_size, augmentors=None, parallel=None):
    """
    Build the batched ImageNet (ILSVRC12) dataflow for one split.

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): Defaults to
            `fbresnet_augmentor(isTrain)`.
        parallel (int): number of workers. Defaults to
            `min(40, cpu_count // 2)`.

    Returns:
        A DataFlow which produces BGR images and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert name in ['train', 'val', 'test']
    is_training = name == 'train'
    assert datadir is not None
    if augmentors is None:
        augmentors = fbresnet_augmentor(is_training)
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    if is_training:
        flow = dataset.ILSVRC12(datadir, name, shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        flow = MultiProcessRunnerZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Evaluation: read (filename, label) pairs and decode them in threads.
    file_flow = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    pipeline = imgaug.AugmentorList(augmentors)

    def read_and_augment(dp):
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return pipeline.augment(image), label

    flow = MultiThreadMapData(file_flow, parallel, read_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return MultiProcessRunnerZMQ(flow, 1)
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Build the batched ImageNet (ILSVRC12) dataflow for one split.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.
        parallel (int): number of workers. Defaults to
            `min(40, cpu_count // 2)`.

    Returns:
        The final dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        # No worker count given: use half the reported CPU count
        # (assuming hyperthreading), capped at 40.
        parallel = min(40, multiprocessing.cpu_count() // 2)

    if name == 'train':
        flow = dataset.ILSVRC12(datadir, name, shuffle=True)
        # Apply the augmentors to the image component of every datapoint.
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logger.warn(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        # Run the dataflow in `parallel` worker processes.
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    # Non-training splits: ILSVRC12Files yields (filename, label) pairs, so
    # the image is read and augmented inside the mapper below.
    file_flow = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    pipeline = imgaug.AugmentorList(augmentors)

    def read_and_augment(dp):
        path, label = dp
        # IMREAD_COLOR loads a colour image and ignores any transparency.
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return pipeline.augment(image), label

    flow = MultiThreadMapData(file_flow, parallel, read_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    Build the batched ImageNet dataflow, retrying failed image reads during
    evaluation.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    For the non-training splits each image load + augment is attempted up to
    30 times with a one-second pause after every failure. If all attempts
    fail, a 256x256x3 uint8 zero image is returned in its place.

    Args:
        datadir: dataset root directory; must not be None.
        name (str): one of 'train', 'val' or 'test'.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.
        parallel (int): number of workers. Defaults to
            `min(40, cpu_count // 6)`.

    Returns:
        The final dataflow for the requested split.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 6)
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # Placeholder returned only when every attempt below fails.
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            for _ in range(30):
                try:
                    # Keep the raw read in its own name so a failed attempt
                    # cannot overwrite the placeholder with a bad value.
                    loaded = cv2.imread(fname, cv2.IMREAD_COLOR)
                    im = aug.augment(loaded)
                    break
                except Exception as e:
                    # Deliberately broad: any read/augment failure is retried.
                    # Use lazy %-formatting; passing extra positional args
                    # without placeholders makes the log record fail to format.
                    logger.warning("%s file=%s", e, fname)
                    time.sleep(1)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
def make_data(self, ds,is_training=True):
    """
    Wrap `ds` with threaded mapping, batching and multi-process prefetching.

    Args:
        ds: source dataflow.
        is_training (bool): when True, `self.train_map_func` is applied with
            10 threads; otherwise `self.val_map_func` with 5 threads.

    Returns:
        The result of `get_data()` on the prefetching dataflow, after its
        `reset_state()` has been called.
    """
    if is_training:
        num_threads, map_func = 10, self.train_map_func
    else:
        num_threads, map_func = 5, self.val_map_func

    mapped = MultiThreadMapData(ds, num_threads, map_func, buffer_size=200, strict=True)
    batched = BatchData(mapped, cfg.TRAIN.num_gpu * cfg.TRAIN.batch_size,
                        remainder=True, use_list=False)
    prefetched = MultiProcessPrefetchData(batched, 100, 2)
    prefetched.reset_state()
    # NOTE: an earlier tf.data-based pipeline (shuffle/repeat/map/batch/prefetch)
    # used to live here as commented-out code.
    return prefetched.get_data()
def get_imagenet_dataflow(datadir, name, batch_size, parallel=None):
    """
    Get a standard imagenet training/evaluation dataflow, for linear
    classifier tuning.

    Args:
        datadir: dataset root directory; must not be None.
        name (str): 'train' or 'val'.
        batch_size (int): number of datapoints per batch.
        parallel (int): number of workers. Defaults to `min(50, cpu_count)`.

    Returns:
        The final dataflow yielding batches of (image, label).
    """
    assert name in ['train', 'val']
    is_training = name == 'train'
    assert datadir is not None
    pipeline = imgaug.AugmentorList(get_basic_augmentor(is_training))
    if parallel is None:
        parallel = min(50, mp.cpu_count())

    def load_sample(dp):
        path, label = dp
        image = cv2.imread(path)
        return pipeline.augment(image), label

    if is_training:
        # Mapping and batching happen together inside the worker processes.
        files = dataset.ILSVRC12Files(datadir, name, shuffle=True)
        return MultiProcessMapAndBatchDataZMQ(
            files, parallel, load_sample, batch_size, buffer_size=7000)

    files = dataset.ILSVRC12Files(datadir, name, shuffle=False)
    flow = MultiThreadMapData(files, parallel, load_sample, buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return MultiProcessRunnerZMQ(flow, 1)
def get_val_dataflow(
        datadir, batch_size,
        augmentors, parallel=None,
        num_splits=None, split_index=None):
    """
    Build the batched ILSVRC12 validation dataflow, optionally restricted to
    one contiguous split of the file list.

    Args:
        datadir: dataset root directory; must not be None.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.
        parallel (int): number of mapper threads. Defaults to
            `min(40, cpu_count)`.
        num_splits (int): when given, the file list is cut into this many
            equal chunks of `len(files) // num_splits` entries.
        split_index (int): which chunk to use; must be `< num_splits`.

    Returns:
        The batched dataflow (no extra prefetch process is added).

    Note:
        Because the chunk size is floored, up to `num_splits - 1` trailing
        files belong to no split when the count is not evenly divisible.
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())

    if num_splits is None:
        source = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
    else:
        assert split_index < num_splits
        listing = dataset.ILSVRC12Files(datadir, 'val', shuffle=False)
        listing.reset_state()
        all_files = list(listing.get_data())
        logger.info("#ValidationData = {}".format(len(all_files)))
        chunk = len(all_files) // num_splits
        first = chunk * split_index
        last = min(chunk * (split_index + 1), len(all_files))
        logger.info("#ValidationSplit = {} - {}".format(first, last))
        source = DataFromList(all_files[first:last], shuffle=False)

    pipeline = imgaug.AugmentorList(augmentors)

    def read_and_augment(dp):
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        return pipeline.augment(image), label

    mapped = MultiThreadMapData(source, parallel, read_and_augment,
                                buffer_size=2000, strict=True)
    # No PrefetchDataZMQ here: do not fork() under MPI.
    return BatchData(mapped, batch_size, remainder=True)
def val_dataloader(self):
    """Build the validation dataflow that yields batches as torch tensors.

    The dataset is read from `covid_test_v5.csv` under `self.hparams.data`,
    resized, converted to float32, batched, run through 4 worker processes
    and finally converted to tensors.

    Returns:
        A dataflow whose datapoints are `[images, labels]`: images are the
        batched array with axes permuted by (0, 3, 1, 2), labels are cast to
        float.
    """
    hp = self.hparams
    source = MultiLabelDataset(
        folder=hp.data,
        is_train='valid',
        fname='covid_test_v5.csv',
        types=hp.types,
        pathology=hp.pathology,
        resize=int(hp.shape),
    )
    source.reset_state()

    augmentors = [
        imgaug.Resize(hp.shape, interp=cv2.INTER_AREA),
        imgaug.ToFloat32(),
    ]

    def to_tensors(dp):
        images = torch.tensor(np.transpose(dp[0], (0, 3, 1, 2)))
        labels = torch.tensor(dp[1]).float()
        return [images, labels]

    flow = AugmentImageComponent(source, augmentors, 0)
    flow = BatchData(flow, hp.batch, remainder=True)
    flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
    flow = PrintData(flow)
    return MapData(flow, to_tensors)
def test_dataloader(self):
    """Build the test dataflow that yields batches as torch tensors.

    The dataset is read from `test.csv` under `self.hparams.data_path`. Each
    image is resized so its smallest side matches `self.hparams.shape`,
    converted from gray to RGB, passed through CLAHE and cast to float32.

    Returns:
        A dataflow whose datapoints are `[images, labels]`: images are the
        batched array with axes permuted by (0, 3, 1, 2), labels are cast to
        float.
    """
    ds_test = MultiLabelDataset(folder=self.hparams.data_path,
                                is_train='test',
                                fname='test.csv',
                                types=self.hparams.types,
                                pathology=self.hparams.pathology,
                                resize=int(self.hparams.shape))
    ds_test.reset_state()

    ag_test = [
        imgaug.Albumentations(
            AB.SmallestMaxSize(self.hparams.shape, p=1.0)),
        # Was `iimgaug.ColorSpace`, a misspelling of the `imgaug` module that
        # raised NameError as soon as this loader was built.
        imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
        imgaug.Albumentations(AB.CLAHE(p=1)),
        imgaug.ToFloat32(),
    ]
    ds_test = AugmentImageComponent(ds_test, ag_test, 0)
    ds_test = BatchData(ds_test, self.hparams.batch, remainder=True)
    # Multi-process prefetching is intentionally left disabled here:
    # ds_test = MultiProcessRunner(ds_test, num_proc=4, num_prefetch=16)
    ds_test = PrintData(ds_test)
    ds_test = MapData(ds_test,
                      lambda dp: [torch.tensor(np.transpose(dp[0], (0, 3, 1, 2))),
                                  torch.tensor(dp[1]).float()])
    return ds_test
def test_dataloader(self):
    """Build the test dataflow that yields batches as torch tensors.

    The dataset is read from `covid_test_v5.csv` under `self.hparams.data`
    (with `is_train='valid'`, a single fold and no fold index), resized,
    converted to float32, batched and run through 4 worker processes.

    Returns:
        A dataflow whose datapoints are `[images, labels]`: images are the
        batched array with axes permuted by (0, 3, 1, 2), labels are cast to
        float.
    """
    hp = self.hparams
    source = MultiLabelDataset(folder=hp.data,
                               is_train='valid',
                               fname='covid_test_v5.csv',
                               types=hp.types,
                               pathology=hp.pathology,
                               resize=int(hp.shape),
                               fold_idx=None,
                               n_folds=1)
    source.reset_state()

    augmentors = [
        imgaug.Resize(hp.shape, interp=cv2.INTER_AREA),
        imgaug.ToFloat32(),
    ]

    def to_tensors(dp):
        images = torch.tensor(np.transpose(dp[0], (0, 3, 1, 2)))
        labels = torch.tensor(dp[1]).float()
        return [images, labels]

    flow = AugmentImageComponent(source, augmentors, 0)
    flow = BatchData(flow, hp.batch, remainder=True)
    flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
    flow = PrintData(flow)
    return MapData(flow, to_tensors)
def get_downsampled_imagenet_augmented_data(subset, options,
        do_multiprocess=True, do_validation=False, shuffle=None):
    """
    Build the augmented, batched dataflow for downsampled ImageNet.

    The input resolution is parsed from `options.ds_name`, which is expected
    to look like `imagenet<digits>`.

    Args:
        subset (str): dataset subset; 'train' enables training augmentation
            (only when `do_multiprocess` is also truthy).
        options: object providing `ds_name`, `data_dir`, `batch_size` and
            `nr_gpu`.
        do_multiprocess (bool): wrap the result in `PrefetchData(ds, 4, 2)`.
        do_validation (bool): forwarded to `DownsampledImageNet`.
        shuffle: shuffle flag for the dataset; when None it follows the
            training flag.

    Returns:
        The dataflow batched with `options.batch_size // options.nr_gpu`.
    """
    training = subset == 'train' and do_multiprocess
    if shuffle is None:
        shuffle = training

    match = re.search(r'^imagenet([0-9]*)$', options.ds_name)
    input_size = int(match.group(1))
    flow = DownsampledImageNet(_data_batch_dir(options.data_dir, input_size),
                               subset, shuffle, input_size,
                               do_validation=do_validation)
    pp_mean = flow.mean_img
    paste_size = flow.input_size * 5 // 4
    crop_size = flow.input_size

    # Training pads to 5/4 of the input size, takes a random crop back to the
    # input size and flips horizontally before normalising.
    if training:
        augmentors = [
            imgaug.CenterPaste((paste_size, paste_size)),
            imgaug.RandomCrop((crop_size, crop_size)),
            imgaug.Flip(horiz=True),
        ]
    else:
        augmentors = []
    # Both modes subtract the dataset mean image and divide by 128.
    augmentors.append(imgaug.MapImage(lambda x: (x - pp_mean)/128.0))

    flow = AugmentImageComponent(flow, augmentors)
    flow = BatchData(flow, options.batch_size // options.nr_gpu, remainder=not training)
    if do_multiprocess:
        flow = PrefetchData(flow, 4, 2)
    return flow
def data_to_dataflow(x, y, config: DotMap) -> DataFlow:
    """
    Turn the `x`/`y` data into a batched dataflow that is ready to iterate.

    Args:
        x: inputs forwarded to `data_to_generator`.
        y: targets forwarded to `data_to_generator`.
        config: configuration providing `trainer.batch_size`.

    Returns:
        The batched dataflow, with `reset_state()` already called.
    """
    generator = data_to_generator(x, y, True)
    batched = BatchData(GeneratorToDataFlow(generator), config.trainer.batch_size)
    batched.reset_state()
    return batched
def test_dataloader(self):
    """Build the test dataflow that yields pairs of batched torch tensors.

    Components 0 and 1 of every datapoint go through the same augmentors:
    a nearest-neighbour resize to `self.hparams.shape` and a cast to float32.

    Returns:
        A dataflow whose datapoints are two float tensors, each with a new
        axis inserted at position 1 of the batched array.
    """
    hp = self.hparams
    source = CustomDataSet(folder=hp.data,
                           train_or_valid='test',
                           size=np.inf,
                           hparams=hp)
    source.reset_state()

    augmentors = [
        imgaug.Resize(hp.shape, interp=cv2.INTER_NEAREST),
        imgaug.ToFloat32(),
    ]

    def to_tensors(dp):
        first = torch.tensor(dp[0][:, np.newaxis, :, :]).float()
        second = torch.tensor(dp[1][:, np.newaxis, :, :]).float()
        return [first, second]

    # A CLAHE pre-augmentation of component 0 was previously tried here and
    # is currently disabled.
    flow = AugmentImageComponents(source, augmentors, [0, 1])
    flow = BatchData(flow, hp.batch, remainder=True)
    flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
    flow = PrintData(flow)
    return MapData(flow, to_tensors)
def get_simple_val_dataflow(src, batch_size=32, augmentors=None, parallel=None):
    """
    Convert an in-memory dataflow of images into batched uint8 images.

    Every image is multiplied by 255, has its last axis reversed and is cast
    to uint8.

    Args:
        src: source dataflow yielding `(image, label)`; must provide `size()`.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): Defaults to `fbresnet_augmentor(False)`.
        parallel (int): number of mapper threads. Defaults to
            `min(40, cpu_count)`.

    Returns:
        The batched dataflow (no prefetch process is added).

    NOTE(review): the augmentors are validated and wrapped in an
    `AugmentorList` but are never applied. The mapper has always returned
    before reaching its augment step. Confirm whether that is intended.
    """
    if augmentors is None:
        augmentors = fbresnet_augmentor(False)
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count())
    aug = imgaug.AugmentorList(augmentors)  # built but unused, see NOTE above

    def to_uint8_reversed(dp):
        image, label = dp
        converted = (image * 255)[:, :, ::-1].astype('uint8')
        # An unreachable `aug.augment(...)` call used to follow this return.
        return converted, label

    mapped = MultiThreadMapData(src, parallel, to_uint8_reversed,
                                buffer_size=min(2000, src.size()), strict=True)
    # do not fork() under MPI
    return BatchData(mapped, batch_size, remainder=True)
def get_resnet_train_dataflow():
    """
    Build the shuffled, batched training dataflow for the ResNet classifier.

    Records come from `ResnetDetection.load_many(config.BASEDIR,
    config.TRAIN_DATASET)`. Each record's `image_data` path is read with
    OpenCV, resized to `config.RESNET_SIZE` squared and passed through
    `strong_aug()`; its `with_ship` value is used as the label.

    Returns:
        The dataflow of `[image, label]` batches of size
        `config.RESNET_BATCH`, produced by 6 prefetch processes.
    """
    records = list(ResnetDetection.load_many(
        config.BASEDIR, config.TRAIN_DATASET))
    flow = DataFromList(records, shuffle=True)
    augmentors = get_resnet_augmentor()

    def load_record(record):
        path, _record_id, label = record['image_data'], record['id'], record['with_ship']
        image = cv2.imread(path)
        image = cv2.resize(image, (config.RESNET_SIZE, config.RESNET_SIZE))
        image = strong_aug()(image=image)['image']
        return [image, label]

    flow = MapData(flow, load_record)
    flow = AugmentImageComponent(flow, augmentors, copy=False)
    flow = BatchData(flow, config.RESNET_BATCH)
    return PrefetchDataZMQ(flow, 6)
def data_orig(num_readers=16, batch_size=16, is_vis=False, is_print=False):
    """
    Build the batched, prefetched dataflow, iterate over it once and time it.

    Args:
        num_readers (int): number of processes given to `PrefetchDataZMQ`.
        batch_size (int): batch size handed to `BatchData`.
        is_vis (bool): call `array_show` on every batch.
        is_print (bool): print the shape of every component of every batch.

    Returns:
        float: seconds spent in the iteration loop divided by the number of
        batches seen (divided by 1 when no batch was produced).
    """
    # Local import: the block used to mix `time()` and `time.time()`, one of
    # which must fail whichever way the file imports `time`.
    from time import time as now

    setup_start = now()
    dr = data_raw(video_dir, data_dir, is_print=True)
    df = MyDataFlow(raw_data=dr, num_steps=5, is_training=True)
    df = BatchData(df, batch_size)
    df = PrefetchDataZMQ(df, num_readers)
    df.reset_state()
    setup_end = now()
    print("data loader preparation costs {} seconds".format(setup_end - setup_start))

    step = 0
    loop_start = now()
    for datapoint in df:
        if is_print:
            for component in datapoint:
                print(component.shape)
        if is_vis:
            array_show(datapoint)
        step += 1
        print("now passed {} seconds".format(now() - setup_end))

    # Average over the batches actually consumed instead of a hard-coded 10;
    # max() guards against an empty dataflow.
    return (now() - loop_start) / max(step, 1)
def get_imagenet_dataflow(datadir, is_train, batch_size, augmentors, parallel=None):
    """
    Build the batched ImageNet dataflow for training or validation.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset root directory; must not be None.
        is_train (bool): use the "train" split when truthy, else "val".
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list): image augmentors applied to every image.
        parallel (int): number of workers. Defaults to
            `min(40, cpu_count // 2)`.

    Returns:
        The final dataflow. In the validation branch the channel axis of
        each image is reversed after reading and before augmentation.
    """
    assert datadir is not None
    assert isinstance(augmentors, list)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    if is_train:
        flow = dataset.ILSVRC12(datadir, "train", shuffle=True)
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        if parallel < 16:
            logging.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        flow = PrefetchDataZMQ(flow, parallel)
        return BatchData(flow, batch_size, remainder=False)

    file_flow = dataset.ILSVRC12Files(datadir, "val", shuffle=False)
    pipeline = imgaug.AugmentorList(augmentors)

    def read_and_augment(dp):
        path, label = dp
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        image = np.flip(image, axis=2)  # reverse the channel order
        return pipeline.augment(image), label

    flow = MultiThreadMapData(file_flow, parallel, read_and_augment,
                              buffer_size=2000, strict=True)
    flow = BatchData(flow, batch_size, remainder=True)
    return PrefetchDataZMQ(flow, 1)
def build_iter(self):
    """
    Create the data iterator backed by `self.generator`.

    The generator is batched with `self.batch_size` and prefetched using
    `self.prefetch_size` and `self.process_num`.

    Returns:
        The result of `get_data()` on the prefetching dataflow, after its
        `reset_state()` has been called.
    """
    batched = BatchData(DataFromGenerator(self.generator), self.batch_size)
    prefetched = MultiProcessPrefetchData(batched, self.prefetch_size, self.process_num)
    prefetched.reset_state()
    return prefetched.get_data()
def get_data(self, name, num_gpu):
    """
    Build the ImageNet input for one split, batched per GPU.

    Args:
        name (str): one of 'train', 'val' or 'test'.
        num_gpu (int): number of GPUs; each batch holds
            `self.batch_size // num_gpu` datapoints.

    Returns:
        The batched dataflow, wrapped in `QueueInput` when `num_gpu == 1`.

    NOTE(review): the `QueueInput` wrap is applied to both the training and
    the evaluation branch. Confirm this matches the intended nesting.
    """
    per_gpu_batch = self.batch_size // num_gpu
    assert name in ['train', 'val', 'test']
    is_training = name == 'train'
    augmentors = fbresnet_augmentor(is_training)
    assert isinstance(augmentors, list)
    parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    if is_training:
        flow = dataset.ILSVRC12(self.datadir, name, shuffle=True, dir_structure='train')
        flow = AugmentImageComponent(flow, augmentors, copy=False)
        flow = MultiProcessRunnerZMQ(flow, parallel)
        flow = BatchData(flow, per_gpu_batch, remainder=False)
    else:
        flow = dataset.ILSVRC12Files(self.datadir, name, shuffle=False, dir_structure='train')
        pipeline = imgaug.AugmentorList(augmentors)

        def read_and_augment(dp):
            path, label = dp
            image = cv2.imread(path, cv2.IMREAD_COLOR)
            return pipeline.augment(image), label

        flow = MultiThreadMapData(flow, parallel, read_and_augment,
                                  buffer_size=2000, strict=True)
        flow = BatchData(flow, per_gpu_batch, remainder=True)
        flow = MultiProcessRunnerZMQ(flow, 1)

    if num_gpu == 1:
        flow = QueueInput(flow)
    return flow
def prepared(self, num_gpu, batch_size, eval=False):
    """
    Apply `self.ex_process.train_process` to this dataflow and batch it.

    Args:
        num_gpu (int): lower bound for the number of mapper processes.
        batch_size (int): batch size handed to `BatchData`.
        eval: accepted but not used by this method.

    Returns:
        The batched dataflow. When `self.min_num_workers` is 0 the mapping
        runs in-process (useful for debugging); otherwise it uses
        `max(num_gpu, self.min_num_workers)` processes.
    """
    if self.min_num_workers == 0:
        # use a single process version to debug if needed
        mapped = MapData(self, self.ex_process.train_process)
        return BatchData(mapped, batch_size)

    num_workers = max(num_gpu, self.min_num_workers)
    mapped = MultiProcessMapData(self, num_workers, self.ex_process.train_process)
    return BatchData(mapped, batch_size)
def train_dataloader(self):
    """Build the augmented training dataflow that yields torch tensors.

    The dataset is read from `covid_train_v5.csv` under `self.hparams.data`
    with no balancing. Component 0 of each datapoint goes through the
    augmentation chain below; the result is batched, optionally truncated to
    2 batches in debug mode, run through 4 worker processes and converted to
    tensors.

    Returns:
        A dataflow whose datapoints are `[images, labels]`: images are the
        batched array with axes permuted by (0, 3, 1, 2), labels are cast to
        float.
    """
    hp = self.hparams
    source = MultiLabelDataset(folder=hp.data,
                               is_train='train',
                               fname='covid_train_v5.csv',
                               types=hp.types,
                               pathology=hp.pathology,
                               resize=int(hp.shape),
                               balancing=None)
    source.reset_state()

    # One of three blur variants, chosen at random.
    blur_choice = imgaug.RandomChooseAug([
        imgaug.Albumentations(AB.Blur(blur_limit=4, p=0.25)),
        imgaug.Albumentations(AB.MotionBlur(blur_limit=4, p=0.25)),
        imgaug.Albumentations(AB.MedianBlur(blur_limit=4, p=0.25)),
    ])
    contrast = imgaug.Albumentations(AB.CLAHE(tile_grid_size=(32, 32), p=0.5))
    # Shear, translate and scale, applied in random order.
    affine_mix = imgaug.RandomOrderAug([
        imgaug.Affine(shear=10, border=cv2.BORDER_CONSTANT,
                      interp=cv2.INTER_AREA),
        imgaug.Affine(translate_frac=(0.01, 0.02), border=cv2.BORDER_CONSTANT,
                      interp=cv2.INTER_AREA),
        imgaug.Affine(scale=(0.5, 1.0), border=cv2.BORDER_CONSTANT,
                      interp=cv2.INTER_AREA),
    ])
    image_augmentors = [
        imgaug.ColorSpace(mode=cv2.COLOR_GRAY2RGB),
        blur_choice,
        contrast,
        affine_mix,
        imgaug.RotationAndCropValid(max_deg=10, interp=cv2.INTER_AREA),
        imgaug.GoogleNetRandomCropAndResize(
            crop_area_fraction=(0.8, 1.0),
            aspect_ratio_range=(0.8, 1.2),
            interp=cv2.INTER_AREA,
            target_shape=hp.shape),
        imgaug.ColorSpace(mode=cv2.COLOR_RGB2GRAY),
        imgaug.ToFloat32(),
    ]
    flow = AugmentImageComponent(source, image_augmentors, 0)

    # Label augmentation is built but currently not applied to component 1.
    ag_label = [
        imgaug.BrightnessScale((0.8, 1.2), clip=False),
    ]

    flow = BatchData(flow, hp.batch, remainder=True)
    if hp.debug:
        flow = FixedSizeData(flow, 2)
    flow = MultiProcessRunner(flow, num_proc=4, num_prefetch=16)
    flow = PrintData(flow)

    def to_tensors(dp):
        images = torch.tensor(np.transpose(dp[0], (0, 3, 1, 2)))
        labels = torch.tensor(dp[1]).float()
        return [images, labels]

    return MapData(flow, to_tensors)
def preprocess_data_flow(ds, options, is_train, do_multiprocess=False):
    """
    Batch `ds` per GPU and optionally add prefetching.

    `options.batch_size` is halved in place until it no longer exceeds the
    dataflow size, so the caller's `options` object is modified.

    Args:
        ds: source dataflow; must provide `size()`.
        options: object providing `batch_size` and `nr_gpu`.
        is_train (bool): batching uses `remainder=not is_train`.
        do_multiprocess (bool): wrap the result in `PrefetchData(ds, 5, 5)`.

    Returns:
        The batched (and optionally prefetched) dataflow; the per-GPU batch
        size is at least 1.
    """
    total = ds.size()
    while options.batch_size > total:
        options.batch_size = options.batch_size // 2

    per_gpu_batch = max(1, options.batch_size // options.nr_gpu)
    batched = BatchData(ds, per_gpu_batch, remainder=not is_train)
    if not do_multiprocess:
        return batched
    return PrefetchData(batched, 5, 5)
def build_iter(self):
    """
    Create the data iterator backed by `self.generator`.

    The generator is batched with `self.num_gpu * self.batch_size` and
    prefetched using `self.prefetch_size` and `self.process_num`.

    Returns:
        The result of `get_data()` on the prefetching dataflow, after its
        `reset_state()` has been called.
    """
    # NOTE(review): this partial is built but never applied to the dataflow.
    map_func = partial(self._map_func, is_training=self.training_flag)

    batched = BatchData(DataFromGenerator(self.generator),
                        self.num_gpu * self.batch_size)
    prefetched = MultiProcessPrefetchData(batched, self.prefetch_size, self.process_num)
    prefetched.reset_state()
    return prefetched.get_data()
def get_sequential_loader(ds, isTrain, batch_size, augmentors, parallel=None):
    """
    Load a Single-File LMDB (Sequential Read)

    Args:
        ds: source dataflow yielding (encoded image buffer, label).
        isTrain (bool): selects the training or the evaluation pipeline.
        batch_size (int): batch size handed to `BatchData`.
        augmentors (list[imgaug.Augmentor]): required; must be a list.
        parallel (int): number of worker processes. Defaults to
            `min(40, cpu_count // 2)`.

    Returns:
        A dataflow producing batches (as lists) of decoded, augmented images
        and labels.

    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/tutorial/efficient-dataflow.html
    """
    assert isinstance(augmentors, list)
    pipeline = imgaug.AugmentorList(augmentors)
    if parallel is None:
        parallel = min(40, multiprocessing.cpu_count() // 2)  # assuming hyperthreading

    if not isTrain:
        def decode_and_augment(data):
            buf, label = data
            image = cv2.imdecode(buf, cv2.IMREAD_COLOR)
            return pipeline.augment(image), label

        mapped = MultiProcessMapDataZMQ(ds, parallel, decode_and_augment,
                                        buffer_size=2000, strict=True)
        return BatchData(mapped, batch_size, remainder=True, use_list=True)

    def decode(buf):
        return cv2.imdecode(buf, cv2.IMREAD_COLOR)

    # Training: shuffle within a 50000-entry buffer, decode and augment,
    # batch, then run the whole pipeline in `parallel` processes.
    flow = LocallyShuffleData(ds, 50000)
    flow = MapDataComponent(flow, decode, 0)
    flow = AugmentImageComponent(flow, pipeline, copy=False)
    if parallel < 16:
        logger.warn(
            "DataFlow may become the bottleneck when too few processes are used."
        )
    flow = BatchData(flow, batch_size, remainder=False, use_list=True)
    return MultiProcessRunnerZMQ(flow, parallel)
def get_data(split, option):
    """
    Build the augmented, batched dataflow for one split.

    Args:
        split (str): dataset split; 'train' enables training behaviour.
        option: configuration forwarded to `get_data_flow` and
            `fbresnet_augmentor`; must provide `batch_size`.

    Returns:
        The batched dataflow. Training data is prefetched with
        `cpu_count // 2` processes and batched with `remainder=False`; other
        splits are not prefetched and are batched with `remainder=True`.
    """
    training = split == 'train'
    num_workers = multiprocessing.cpu_count() // 2

    flow = get_data_flow(split, training, option)
    augmentors = fbresnet_augmentor(training, option)
    # Component 2 holds the coordinates that are transformed with the image.
    flow = AugmentImageCoordinates(flow, augmentors, coords_index=2, copy=False)
    if training:
        flow = PrefetchDataZMQ(flow, num_workers)
    return BatchData(flow, option.batch_size, remainder=not training)