def get_dataflow(path, is_train, img_path=None):
    """Build the un-batched pose dataflow on top of ``CocoPose``.

    Args:
        path: forwarded to ``CocoPose`` as its first argument.
        is_train: when truthy, build the training pipeline (random scale,
            rotation, flip, random resize and random crop); otherwise build
            the evaluation pipeline (fixed resize and centre crop).
        img_path: forwarded to ``CocoPose`` as its second argument.

    Returns:
        The mapped dataflow wrapped in ``PrefetchData``.
    """
    ds = CocoPose(path, img_path, is_train)
    if is_train:
        ds = MapData(ds, read_image_url)
        ds = MapDataComponent(ds, pose_random_scale)
        ds = MapDataComponent(ds, pose_rotation)
        ds = MapDataComponent(ds, pose_flip)
        ds = MapDataComponent(ds, pose_resize_shortestedge_random)
        ds = MapDataComponent(ds, pose_crop_random)
        ds = MapData(ds, pose_to_img)
        # augs = [
        #     imgaug.RandomApplyAug(imgaug.RandomChooseAug([
        #         imgaug.GaussianBlur(max_size=3)
        #     ]), 0.7)
        # ]
        # ds = AugmentImageComponent(ds, augs)
        ds = PrefetchData(ds, 1000, multiprocessing.cpu_count() * 4)
    else:
        ds = MultiThreadMapData(ds, nr_thread=16, map_func=read_image_url, buffer_size=1000)
        ds = MapDataComponent(ds, pose_resize_shortestedge_fixed)
        ds = MapDataComponent(ds, pose_crop_center)
        ds = MapData(ds, pose_to_img)
        # cpu_count() // 4 is 0 on machines with fewer than four cores;
        # always request at least one prefetch process.
        nr_proc = max(1, multiprocessing.cpu_count() // 4)
        ds = PrefetchData(ds, 100, nr_proc)
    return ds
def get_dataflow_batch(path, is_train, batchsize, img_path=None):
    """Return the batched, prefetched version of ``get_dataflow``.

    Logs ``img_path``, groups the dataflow into batches of ``batchsize`` and
    wraps the result in ``PrefetchData`` with 2 as the last argument; the
    second argument is 10 for training and 50 otherwise.
    """
    logger.info('dataflow img_path=%s' % img_path)
    batched = BatchData(get_dataflow(path, is_train, img_path=img_path), batchsize)
    queue_size = 10 if is_train else 50
    return PrefetchData(batched, queue_size, 2)
def get_imagenet_dataflow(datadir, name, batch_size, augmentors, parallel=None):
    """
    See explanations in the tutorial:
    http://tensorpack.readthedocs.io/en/latest/tutorial/efficient-dataflow.html

    Args:
        datadir: dataset directory, must not be None.
        name: one of 'train', 'val', 'test'.
        batch_size: batch size passed to ``BatchData``.
        augmentors: list of image augmentors.
        parallel: number of worker processes (train) or threads (otherwise).
            Defaults to ``min(40, cpu_count() // 6)``, but never less than 1.

    Returns:
        A batched dataflow. For 'train' incomplete batches are dropped
        (``remainder=False``); otherwise they are kept.
    """
    assert name in ['train', 'val', 'test']
    assert datadir is not None
    assert isinstance(augmentors, list)
    isTrain = name == 'train'
    if parallel is None:
        # Guard against 0 workers on machines with fewer than six cores.
        parallel = max(1, min(40, multiprocessing.cpu_count() // 6))
    if isTrain:
        ds = dataset.ILSVRC12(datadir, name, shuffle=True)
        ds = AugmentImageComponent(ds, augmentors, copy=False)
        if parallel < 16:
            logger.warning(
                "DataFlow may become the bottleneck when too few processes are used."
            )
        ds = PrefetchData(ds, 1000, parallel)
        ds = BatchData(ds, batch_size, remainder=False)
    else:
        ds = dataset.ILSVRC12Files(datadir, name, shuffle=False)
        aug = imgaug.AugmentorList(augmentors)

        def mapf(dp):
            fname, cls = dp
            # Fallback image, returned only if all 30 attempts fail.
            im = np.zeros((256, 256, 3), dtype=np.uint8)
            for _ in range(30):
                try:
                    # Keep the raw read in its own name so a failed attempt
                    # cannot overwrite the fallback with None.
                    loaded = cv2.imread(fname, cv2.IMREAD_COLOR)
                    im = aug.augment(loaded)
                    break
                except Exception as e:
                    # One format string with placeholders; the old call passed
                    # extra args to a message that had none.
                    logger.warning('%s file=%s', e, fname)
                    time.sleep(1)
            return im, cls

        ds = MultiThreadMapData(ds, parallel, mapf, buffer_size=2000, strict=True)
        ds = BatchData(ds, batch_size, remainder=True)
        ds = PrefetchData(ds, 100, 1)
    return ds
def get_test_valid_split_labels():
    """Cluster cropped training images and split each cluster's indices.

    Every datapoint's first component is rendered with ``image(is_gray=False)``
    and flattened; the stacked features are clustered by ``cluster_features``
    into ``n_clusters`` groups. For each cluster the first ``ratio`` fraction
    of member indices goes to the train list and the rest to the valid list,
    then both index arrays are shuffled in place.

    Returns:
        ``(train_lists, valid_lists)``, each holding one index array per
        cluster.
    """
    flow = CellImageDataManagerTrainAll(master_dir_train)
    flow = MapDataComponent(flow, random_crop_224)
    flow = PrefetchData(flow, 1000, 12)

    flattened = [
        np.asarray(dp[0].image(is_gray=False)).flatten()
        for _, dp in tqdm(enumerate(flow.get_data()))
    ]
    features = np.stack(flattened, axis=0)
    labels = cluster_features(features, n_clusters=n_clusters)

    # One (1, k) array of sample indices per cluster.
    idx_labels = [np.transpose(np.argwhere(labels == i)) for i in range(n_clusters)]

    # Split train and valid data set
    train_lists, valid_lists = [], []
    for members in idx_labels:
        cut = int(members.shape[1] * ratio)
        train_lists.extend(members[:, :cut])
        valid_lists.extend(members[:, cut:])

    for n in range(n_clusters):
        np.random.shuffle(train_lists[n])
        np.random.shuffle(valid_lists[n])
    return train_lists, valid_lists
def get_dataflow(path, is_train):
    """Build the un-batched pose dataflow on top of ``CocoPoseLMDB``.

    Training applies the random geometric pose transforms followed by a
    randomly chosen photometric augmentation (applied with probability 0.7);
    evaluation applies a fixed resize and centre crop. Both variants end with
    ``pose_to_img`` and are wrapped in ``PrefetchData`` using one process per
    CPU.
    """
    ds = CocoPoseLMDB(path, is_train)  # read data from lmdb
    if is_train:
        component_steps = (
            pose_random_scale,
            pose_rotation,
            pose_flip,
            pose_resize_shortestedge_random,
            pose_crop_random,
        )
    else:
        component_steps = (pose_resize_shortestedge_fixed, pose_crop_center)

    for step in component_steps:
        ds = MapDataComponent(ds, step)
    ds = MapData(ds, pose_to_img)

    if is_train:
        photometric = imgaug.RandomChooseAug([
            imgaug.BrightnessScale((0.6, 1.4), clip=False),
            imgaug.Contrast((0.7, 1.4), clip=False),
            imgaug.GaussianBlur(max_size=3),
        ])
        ds = AugmentImageComponent(ds, [imgaug.RandomApplyAug(photometric, 0.7)])

    return PrefetchData(ds, 1000, multiprocessing.cpu_count())
def get_downsampled_imagenet_augmented_data(subset, options, do_multiprocess=True, do_validation=False, shuffle=None):
    """Build the batched dataflow for a downsampled ImageNet variant.

    Args:
        subset: dataset split; 'train' (together with ``do_multiprocess``)
            enables training augmentation.
        options: object providing ``ds_name`` (``'imagenet<size>'``),
            ``data_dir``, ``batch_size`` and ``nr_gpu``.
        do_multiprocess: wrap the result in ``PrefetchData`` when truthy.
        do_validation: forwarded to ``DownsampledImageNet``.
        shuffle: explicit shuffle flag; defaults to the training flag.

    Returns:
        The augmented, batched (and optionally prefetched) dataflow.

    Raises:
        ValueError: if ``options.ds_name`` is not ``'imagenet'`` followed by
            a size in digits.
    """
    isTrain = subset == 'train' and do_multiprocess
    shuffle = shuffle if shuffle is not None else isTrain

    reret = re.search(r'^imagenet([0-9]*)$', options.ds_name)
    if reret is None or not reret.group(1):
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError(
            "options.ds_name must be 'imagenet' followed by the input size "
            "(e.g. 'imagenet32'), got {!r}".format(options.ds_name))
    input_size = int(reret.group(1))

    ds = DownsampledImageNet(_data_batch_dir(options.data_dir, input_size),
                             subset, shuffle, input_size, do_validation=do_validation)
    pp_mean = ds.mean_img
    paste_size = ds.input_size * 5 // 4
    crop_size = ds.input_size

    def normalize(x):
        # Subtract the dataset mean image and scale by 1/128.
        return (x - pp_mean) / 128.0

    if isTrain:
        augmentors = [
            imgaug.CenterPaste((paste_size, paste_size)),
            imgaug.RandomCrop((crop_size, crop_size)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(normalize),
        ]
    else:
        augmentors = [imgaug.MapImage(normalize)]

    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    if do_multiprocess:
        ds = PrefetchData(ds, 4, 2)
    return ds
def generate_dataflow(dataset, option):
    """Assemble an augmented, prefetched and batched dataflow from ``option``.

    Keys read from ``option``: 'number_of_cores' (``-1`` is replaced in place
    by the CPU count), 'augmentors', 'num_prefetch_for_dataset',
    'batch_size', 'remainder' and 'num_prefetch_for_batch'.
    """
    if option['number_of_cores'] == -1:
        option['number_of_cores'] = mp.cpu_count()

    flow = AugmentImageComponent(
        DataFlow(dataset, option), option['augmentors'], copy=False)

    if option['number_of_cores'] < 16:
        print('[!} Warning = DataFlow may become the bottleneck when too few processes are used.')

    flow = PrefetchData(
        flow, option['num_prefetch_for_dataset'], option['number_of_cores'])
    flow = BatchData(flow, option['batch_size'], remainder=option['remainder'])
    return PrefetchData(flow, option['num_prefetch_for_batch'], 2)
def get_remote_dataflow(port, nr_prefetch=1000, nr_thread=1):
    """Return a batched, prefetched dataflow fed by ``RemoteDataZMQ``.

    The receiver is given the fixed IPC address and a TCP address on ``port``;
    batches use ``hp.train.batch_size``.
    """
    endpoints = ('ipc:///tmp/ipc-socket', 'tcp://0.0.0.0:%d' % port)
    flow = RemoteDataZMQ(*endpoints, hwm=10000)
    flow = BatchData(flow, batch_size=hp.train.batch_size)
    return PrefetchData(flow, nr_prefetch, nr_thread)
def get_default_dataflow_batch(batchsize=32):
    """Return the default dataflow mapped, batched, normalised and prefetched.

    Datapoints go through ``data_to_segment_input``, are grouped into batches
    of ``batchsize``, have ``data_to_normalize01`` applied by
    ``MapDataComponent``, and are finally wrapped in ``PrefetchData(…, 10, 2)``.
    """
    segments = MapData(get_default_dataflow(), data_to_segment_input)
    batches = MapDataComponent(BatchData(segments, batchsize), data_to_normalize01)
    return PrefetchData(batches, 10, 2)
def preprocess_data_flow(ds, options, is_train, do_multiprocess=False):
    """Batch ``ds`` per GPU and optionally prefetch it.

    ``options.batch_size`` is halved in place until it no longer exceeds
    ``ds.size()``. The per-GPU batch size is at least 1, and incomplete
    batches are kept only when ``is_train`` is falsy.
    """
    total = ds.size()
    while total < options.batch_size:
        options.batch_size = options.batch_size // 2

    per_gpu = max(1, options.batch_size // options.nr_gpu)
    batched = BatchData(ds, per_gpu, remainder=not is_train)
    if not do_multiprocess:
        return batched
    return PrefetchData(batched, 5, 5)
def get_train_dataflow(roidb):
    """
    Tensorpack text dataflow.

    Shuffles ``roidb``, maps every entry through ``TextDataPreprocessor(cfg)``
    on ``cfg.num_threads`` threads and wraps the result in ``PrefetchData``.

    Args:
        roidb: list of records handed to ``DataFromList``.

    Returns:
        The prefetched (un-batched) dataflow.
    """
    ds = DataFromList(roidb, shuffle=True)
    preprocess = TextDataPreprocessor(cfg)
    buffer_size = cfg.num_threads * 10
    ds = MultiThreadMapData(ds, cfg.num_threads, preprocess, buffer_size=buffer_size)
    # ds = MultiProcessMapData(ds, cfg.num_workers, preprocess, buffer_size=buffer_size)
    # cpu_count() // 4 is 0 on machines with fewer than four cores;
    # always request at least one prefetch process.
    nr_proc = max(1, multiprocessing.cpu_count() // 4)
    ds = PrefetchData(ds, 100, nr_proc)
    # ds = BatchData(ds, cfg.batch_size, remainder=True)
    return ds
def get_cifar_augmented_data(subset, options, do_multiprocess=True, do_validation=False, shuffle=None):
    """Build the batched CIFAR-10 / CIFAR-100 dataflow.

    The dataset is chosen from ``options.num_classes`` together with
    ``options.ds_name``; any other combination raises ``ValueError``.
    Training data (``subset == 'train'`` and ``do_multiprocess``) gets
    paste/crop/flip augmentation, mean subtraction and cut-out; other data
    only gets mean subtraction. Batches hold
    ``options.batch_size // options.nr_gpu`` items.
    """
    isTrain = subset == 'train' and do_multiprocess
    if shuffle is None:
        shuffle = isTrain

    if options.num_classes == 10 and options.ds_name == 'cifar10':
        dataset_cls, cutout_length = dataset.Cifar10, 16
    elif options.num_classes == 100 and options.ds_name == 'cifar100':
        dataset_cls, cutout_length = dataset.Cifar100, 8
    else:
        raise ValueError(
            'Number of classes must be set to 10(default) or 100 for CIFAR')
    n_holes = 1
    ds = dataset_cls(subset, shuffle=shuffle, do_validation=do_validation)

    logger.info('{} set has n_samples: {}'.format(subset, len(ds.data)))
    pp_mean = ds.get_per_pixel_mean()

    def normalize(x):
        # Subtract the per-pixel mean and scale by 1/128.
        return (x - pp_mean) / 128.0

    if isTrain:
        logger.info('Will do cut-out with length={} n_holes={}'.format(
            cutout_length, n_holes))
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            imgaug.MapImage(normalize),
            Cutout(length=cutout_length, n_holes=n_holes),
        ]
    else:
        augmentors = [imgaug.MapImage(normalize)]

    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    return PrefetchData(ds, 3, 2) if do_multiprocess else ds
def get_data(self, train_or_test):
    """Return the batched CIFAR-10 dataflow for ``train_or_test``.

    'train' gets paste/crop/flip augmentation plus per-pixel mean subtraction
    and is prefetched; anything else only gets mean subtraction and keeps its
    incomplete final batch.
    """
    isTrain = train_or_test == 'train'
    ds = dataset.Cifar10(train_or_test, dir='.')
    pp_mean = ds.get_per_pixel_mean()

    def subtract_mean(x):
        return x - pp_mean

    if isTrain:
        augmentors = [
            imgaug.CenterPaste((40, 40)),
            imgaug.RandomCrop((32, 32)),
            imgaug.Flip(horiz=True),
            # imgaug.Brightness(20),
            # imgaug.Contrast((0.6,1.4)),
            imgaug.MapImage(subtract_mean),
        ]
    else:
        augmentors = [imgaug.MapImage(subtract_mean)]

    ds = AugmentImageComponent(ds, augmentors)
    ds = BatchData(ds, self.batch_size, remainder=not isTrain)
    return PrefetchData(ds, 3, 2) if isTrain else ds
def get_augmented_speech_commands_data(subset, options, do_multiprocess=True, shuffle=True):
    """Build the batched Speech Commands dataflow.

    Component 0 of every datapoint is passed through
    ``_pad_or_clip_to_desired_sample`` and ``_to_float``. Training data
    (``subset == 'train'`` and ``do_multiprocess``) is additionally
    time-shifted and mixed with the dataset's ``noises``. Batches hold
    ``options.batch_size // options.nr_gpu`` items.
    """
    isTrain = subset == 'train' and do_multiprocess
    if shuffle is None:
        shuffle = isTrain

    root = os.path.join(options.data_dir, 'speech_commands_v0.02')
    ds = SpeechCommandsDataFlow(root, subset, shuffle, None)
    # Read the noise clips before ``ds`` is rebound to a wrapper.
    noises = ds.noises if isTrain else None

    ds = MapDataComponent(ds, _pad_or_clip_to_desired_sample, index=0)
    ds = MapDataComponent(ds, _to_float, index=0)
    if isTrain:
        ds = MapDataComponent(ds, _time_shift, index=0)
        ds = MapData(ds, functools.partial(_add_noise, noises=noises))

    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    return PrefetchData(ds, 4, 4) if do_multiprocess else ds
def __call__(self, n_prefetch=1000, n_thread=1):
    """Return this dataflow batched by ``self.batch_size`` and prefetched."""
    batched = BatchData(self, self.batch_size)
    return PrefetchData(batched, n_prefetch, n_thread)
def __call__(self, n_prefetch=1, n_thread=1):
    """Return this dataflow in batches of one, wrapped in ``PrefetchData``."""
    single_batches = BatchData(self, 1)
    return PrefetchData(single_batches, n_prefetch, n_thread)
def dataflow(self, nr_prefetch=1000, nr_thread=1):
    """Return this dataflow batched by ``self.batch_size`` and prefetched."""
    batched = BatchData(self, self.batch_size)
    return PrefetchData(batched, nr_prefetch, nr_thread)
def get_inat_augmented_data(subset, options, lmdb_dir=None, year='2018', do_multiprocess=True, do_validation=False, is_train=None, shuffle=None, n_allow=None):
    """Build the batched iNaturalist dataflow from an LMDB file.

    Args:
        subset: dataset split used in the LMDB file name; replaced by
            'train' when reading the validation fold.
        options: object providing ``input_size`` (224 when falsy),
            ``data_dir``, ``batch_size`` and ``nr_gpu``.
        lmdb_dir: directory under ``options.data_dir`` holding the LMDB;
            when None, ``inat_lmdb`` and a fixed ``inat2018_`` file prefix
            are used (``year`` is not consulted in that case).
        year: year used in the file name when ``lmdb_dir`` is given.
        do_multiprocess: run augmentation through ``PrefetchDataZMQ``.
        do_validation: read the ``_cv_val_`` fold when not training.
        is_train: explicit training flag; defaults to
            ``subset == 'train' and do_multiprocess``.
        shuffle: explicit shuffle flag; defaults to the training flag.
        n_allow: when not None, adds ``_allow_<n_allow>`` to the file name.

    Returns:
        The decoded, augmented and batched dataflow.
    """
    input_size = options.input_size if options.input_size else 224
    isTrain = is_train if is_train is not None else (subset == 'train' and do_multiprocess)
    shuffle = shuffle if shuffle is not None else isTrain
    postfix = "" if n_allow is None else "_allow_{}".format(n_allow)

    # TODO: Parameterize the cv split to be considered
    # Currently hardcoding to 1
    cv = 1

    # When do_validation is True it will expect *cv_train and *cv_val lmdbs
    # Currently the cv_train split is always used
    if isTrain:
        postfix += '_cv_train_{}'.format(cv)
    elif do_validation:
        subset = 'train'
        postfix += '_cv_val_{}'.format(cv)

    if lmdb_dir is None:
        lmdb_path = os.path.join(
            options.data_dir, 'inat_lmdb',
            'inat2018_{}{}.lmdb'.format(subset, postfix))
    else:
        lmdb_path = os.path.join(
            options.data_dir, lmdb_dir,
            'inat{}_{}{}.lmdb'.format(year, subset, postfix))

    ds = LMDBData(lmdb_path, shuffle=False)
    if shuffle:
        ds = LocallyShuffleData(ds, 1024 * 80)  # This is 64G~80G in memory images
    ds = PrefetchData(ds, 1024 * 8, 1)  # prefetch around 8 G
    ds = LMDBDataPoint(ds)
    ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)  # BGR uint8 data

    if isTrain:
        class Resize(imgaug.ImageAugmentor):
            """
            crop 8%~100% of the original image
            See `Going Deeper with Convolutions` by Google.
            """

            def _augment(self, img, _):
                h, w = img.shape[:2]
                area = h * w
                # Try up to 10 random crops; fall back to resizing the whole
                # image when none fits inside it.
                for _ in range(10):
                    targetArea = self.rng.uniform(0.08, 1.0) * area
                    aspectR = self.rng.uniform(0.75, 1.333)
                    ww = int(np.sqrt(targetArea * aspectR))
                    hh = int(np.sqrt(targetArea / aspectR))
                    if self.rng.uniform() < 0.5:
                        ww, hh = hh, ww
                    if hh <= h and ww <= w:
                        x1 = 0 if w == ww else self.rng.randint(0, w - ww)
                        y1 = 0 if h == hh else self.rng.randint(0, h - hh)
                        out = img[y1:y1 + hh, x1:x1 + ww]
                        out = cv2.resize(out, (input_size, input_size),
                                         interpolation=cv2.INTER_CUBIC)
                        return out
                out = cv2.resize(img, (input_size, input_size),
                                 interpolation=cv2.INTER_CUBIC)
                return out

        augmentors = [
            Resize(),
            imgaug.RandomOrderAug([
                imgaug.Brightness(30, clip=False),
                imgaug.Contrast((0.8, 1.2), clip=False),
                imgaug.Saturation(0.4),
                # rgb-bgr conversion
                imgaug.Lighting(
                    0.1,
                    eigval=[0.2175, 0.0188, 0.0045][::-1],
                    eigvec=np.array(
                        [[-0.5675, 0.7192, 0.4009],
                         [-0.5808, -0.0045, -0.8140],
                         [-0.5836, -0.6948, 0.4203]],
                        dtype='float32')[::-1, ::-1]),
            ]),
            imgaug.Clip(),
            imgaug.Flip(horiz=True),
            imgaug.ToUint8(),
        ]
    else:
        augmentors = [
            imgaug.ResizeShortestEdge(256),
            imgaug.CenterCrop((input_size, input_size)),
            imgaug.ToUint8(),
        ]

    ds = AugmentImageComponent(ds, augmentors, copy=False)
    if do_multiprocess:
        ds = PrefetchDataZMQ(ds, min(24, multiprocessing.cpu_count()))
    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    return ds
print(model) model.compile(optimizer=Adam(lr=1e-4), loss=average_dice_coef_loss, metrics=[average_dice_coef]) df = MyDataFlow(train_data_dir, FLAGS.image_filename, FLAGS.label_filename, shuffle=True) df = MapDataComponent(df, process.simple_preprocess_img, index=0) df = MapDataComponent(df, process.sample_z_norm, index=0) df = MapDataComponent(df, process.simple_preprocess_mask, index=1) df = MapData(df, process.resize_whole) df = MapData(df, process.data_aug) df = PrefetchData(df, 2, 1) gen_train = gen_data(df) cb_early_stopping = EarlyStopping(monitor='loss', patience=100) cbs = list() cbs.append( ModelCheckpoint('{}/checkpoint_{}.h5'.format(FLAGS.checkpoint_dir, FLAGS.chd_hcmp), save_best_only=True, monitor='loss', period=1)) cbs.append( CSVLogger('{}/checkpoint.log'.format(FLAGS.checkpoint_dir), append=True)) #cbs.append(ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)) cbs.append(
def get_dataflow_batch(path, is_train, batchsize):
    """Return ``get_dataflow(path, is_train)`` batched and prefetched.

    Batches hold ``batchsize`` datapoints and are wrapped in
    ``PrefetchData(…, 10, 2)``.
    """
    batched = BatchData(get_dataflow(path, is_train), batchsize)
    return PrefetchData(batched, 10, 2)
# Everything from index _NUM_VALIDATION onwards is assigned to this name.
validataion_filenames = photo_filenames[_NUM_VALIDATION:]
# Map each class name to its position in class_names.
class_names_to_ids = dict(zip(class_names, range(len(class_names))))
train_dataset = my_dataset_flow(training_filenames, 'train', class_names_to_ids)
# Resize every image to 299x299 before batching.
ds = AugmentImageComponent(train_dataset, [imgaug.Resize((299, 299))])
#ds = PrefetchData(ds, 1000, multiprocessing.cpu_count())
# NOTE: English translation of the string below: "The important point is that
# once you define the data-reading part and augmentations such as rotation,
# flip and crop and hand them to PrefetchData, it spawns multiple processes
# to handle the necessary parts."
'''중요한 점은, 데이터를 읽는 부분이나 rotation, flip, crop 등의 augmentation을 정의하고 이를 PrefetchData에 넘기면 필요한 부분을 여러 프로세스로 띄워서 처리해준다는 점입니다.'''
batchsize = 256
ds = BatchData(ds, batchsize, use_list=True)
nr_prefetch = 10
nr_proc = 2
ds = PrefetchData(ds, nr_prefetch, nr_proc)
# Benchmark the assembled pipeline.
TestDataSpeed(ds).start()
j = 0
# Print the shape of the first two components of every batch.
for i in ds.get_data():
    print(np.array(i[0]).shape)
    print(np.array(i[1]).shape)
# Placeholders for a batch of 299x299x3 uint8 images and a second uint8
# tensor (presumably the labels - confirm against my_dataset_flow).
placeholder = [
    tf.placeholder(dtype=tf.uint8, shape=(None, 299, 299, 3)),
    tf.placeholder(dtype=tf.uint8, shape=(None))
]
# FIFO queue of capacity 512 using the placeholders' dtypes.
queue = tf.FIFOQueue(512, [x.dtype for x in placeholder])
thread = EnqueueThread(queue, ds, placeholder)
numberOfThreads = 1
qr = tf.train.QueueRunner(queue, [thread] * numberOfThreads)
def get_default_dataflow():
    """Return ``CellImageDataManagerTrain`` wrapped in ``PrefetchData(…, 1000, 12)``."""
    return PrefetchData(CellImageDataManagerTrain(), 1000, 12)
def get_tiny_imagenet_augmented_data(subset, options, do_multiprocess=True, is_train=None, shuffle=None):
    """Build the batched Tiny ImageNet dataflow from its LMDB file.

    The LMDB ``tiny_imagenet_<subset>.lmdb`` under
    ``options.data_dir/tiny_imagenet_lmdb`` is decoded with OpenCV. Training
    data gets a random-area crop resized to 64x64 plus colour augmentation
    and a horizontal flip; other data is resized (shortest edge 72) and
    centre-cropped to 64x64. Batches hold
    ``options.batch_size // options.nr_gpu`` items.
    """
    if is_train is not None:
        isTrain = is_train
    else:
        isTrain = subset == 'train' and do_multiprocess
    if shuffle is None:
        shuffle = isTrain

    lmdb_path = os.path.join(
        options.data_dir, 'tiny_imagenet_lmdb',
        'tiny_imagenet_{}.lmdb'.format(subset))
    # since tiny imagenet is small (200MB zipped) we can shuffle all directly.
    # we skipped the LocallyShuffleData and PrefetchData routine.
    ds = LMDBDataPoint(LMDBData(lmdb_path, shuffle=shuffle))
    ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)

    img_size = 64
    if isTrain:
        class Resize(imgaug.ImageAugmentor):
            """
            Crop 30%~100% of the original image area at a random aspect
            ratio and resize the crop to ``img_size`` x ``img_size``.
            See `Going Deeper with Convolutions` by Google.
            """

            def _augment(self, img, _):
                height, width = img.shape[:2]
                full_area = height * width
                for _attempt in range(10):
                    target_area = self.rng.uniform(0.3, 1.0) * full_area
                    aspect = self.rng.uniform(0.75, 1.333)
                    crop_w = int(np.sqrt(target_area * aspect))
                    crop_h = int(np.sqrt(target_area / aspect))
                    if self.rng.uniform() < 0.5:
                        crop_w, crop_h = crop_h, crop_w
                    if crop_h > height or crop_w > width:
                        # Crop does not fit inside the image; draw again.
                        continue
                    left = 0 if width == crop_w else self.rng.randint(0, width - crop_w)
                    top = 0 if height == crop_h else self.rng.randint(0, height - crop_h)
                    patch = img[top:top + crop_h, left:left + crop_w]
                    return cv2.resize(patch, (img_size, img_size),
                                      interpolation=cv2.INTER_CUBIC)
                # No crop fitted after 10 attempts: resize the whole image.
                return cv2.resize(img, (img_size, img_size),
                                  interpolation=cv2.INTER_CUBIC)

        augmentors = [
            Resize(),
            imgaug.RandomOrderAug([
                imgaug.Brightness(30, clip=False),
                imgaug.Contrast((0.8, 1.2), clip=False),
                imgaug.Saturation(0.4),
                # rgb-bgr conversion
                imgaug.Lighting(
                    0.1,
                    eigval=[0.2175, 0.0188, 0.0045][::-1],
                    eigvec=np.array(
                        [[-0.5675, 0.7192, 0.4009],
                         [-0.5808, -0.0045, -0.8140],
                         [-0.5836, -0.6948, 0.4203]],
                        dtype='float32')[::-1, ::-1]),
            ]),
            imgaug.Clip(),
            imgaug.Flip(horiz=True),
            imgaug.ToUint8(),
        ]
    else:
        augmentors = [
            imgaug.ResizeShortestEdge(72),
            imgaug.CenterCrop((img_size, img_size)),
            imgaug.ToUint8(),
        ]

    ds = AugmentImageComponent(ds, augmentors, copy=False)
    ds = BatchData(ds, options.batch_size // options.nr_gpu, remainder=not isTrain)
    if not do_multiprocess:
        return ds
    return PrefetchData(ds, nr_prefetch=4, nr_proc=4)
print(model)
# Compile with Adam (learning rate 1e-4), using the average Dice coefficient
# as both the loss and the reported metric.
model.compile(optimizer=Adam(lr=1e-4), loss=average_dice_coef_loss, metrics=[average_dice_coef])
# Training input pipeline: shuffled source dataflow, per-component
# preprocessing, then whole-datapoint resize and augmentation.
df = MyDataFlow(train_data_dir, FLAGS.image_filename, FLAGS.label_filename, shuffle=True)
# Component 0 is presumably the image and component 1 the mask - confirm
# against MyDataFlow.
df = MapDataComponent(df, process.simple_preprocess_img, index=0)
df = MapDataComponent(df, process.sample_z_norm, index=0)
df = MapDataComponent(df, process.simple_preprocess_mask, index=1)
df = MapData(df, process.resize_whole)
df = MapData(df, process.data_aug)
df = PrefetchData(df, 8, 4)
gen_train = gen_data(df)
# Callbacks: checkpoint the best model by training loss, and append metrics
# to a CSV log in the checkpoint directory.
cbs = list()
cbs.append(
    ModelCheckpoint('{}/checkpoint_{}_{}.h5'.format(FLAGS.checkpoint_dir, FLAGS.chd_hcmp, FLAGS.task_detail), save_best_only=True, monitor='loss', period=1))
cbs.append(
    CSVLogger('{}/checkpoint.log'.format(FLAGS.checkpoint_dir), append=True))
#cbs.append(ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001))
#cbs.append(TensorBoard(log_dir='{}'.format(FLAGS.checkpoint_dir), histogram_freq=0,