def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25, cache=50000, collate_fn=default_collate, drop_last=False, cuda=False):
    """Build a tensorpack dataflow over the ImageNet LMDB for the given mode.

    The LMDB file is resolved from the IMAGENET environment variable
    (e.g. $IMAGENET/ILSVRC-train.lmdb). Records are decoded with OpenCV,
    run through the standard fbresnet augmentors, and batched; the ready
    pipeline is exposed as ``self.ds``.

    NOTE(review): ``shuffle``, ``collate_fn`` and ``drop_last`` are accepted
    for API compatibility but are not used by this pipeline.
    """
    # Training mode turns on the random augmentation ops.
    augmentors = fbresnet_augmentor(mode == 'train')
    lmdb_path = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)
    pipeline = td.LMDBData(lmdb_path, shuffle=False)
    # Approximate shuffling within a sliding window of `cache` records.
    pipeline = td.LocallyShuffleData(pipeline, cache)
    pipeline = td.PrefetchData(pipeline, 5000, 1)
    # Deserialize raw LMDB records into datapoints.
    pipeline = td.LMDBDataPoint(pipeline)
    # Component 0 holds the encoded image bytes; decode to BGR.
    pipeline = td.MapDataComponent(pipeline, lambda raw: cv2.imdecode(raw, cv2.IMREAD_COLOR), 0)
    pipeline = td.AugmentImageComponent(pipeline, augmentors)
    # Fan out to `num_workers` processes communicating over ZMQ.
    pipeline = td.PrefetchDataZMQ(pipeline, num_workers)
    self.ds = td.BatchData(pipeline, batch_size)
    self.ds.reset_state()
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda
def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25, cache=50000, collate_fn=default_collate, remainder=False, cuda=False, transform=None):
    """Build an ImageNet LMDB dataflow that decodes with PIL and applies a
    user-supplied torchvision-style ``transform``.

    NOTE(review): ``shuffle``, ``collate_fn`` and ``cuda`` are accepted for
    API compatibility; shuffling is instead keyed off ``mode == 'train'``.
    NOTE(review): the train-only guard scope was reconstructed from a
    collapsed source line — confirm whether PrefetchData should also be
    train-only.
    """
    # Augmentation is delegated entirely to the provided transform.
    augmentors = [ImgAugTVCompose(transform)]
    lmdb_path = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)
    pipeline = td.LMDBData(lmdb_path, shuffle=False)
    if mode == 'train':
        # Approximate shuffling within a window of `cache` records.
        pipeline = td.LocallyShuffleData(pipeline, cache)
    pipeline = td.PrefetchData(pipeline, 5000, 1)
    pipeline = td.LMDBDataPoint(pipeline)
    # Decode component 0 (raw bytes) via PIL and force 3-channel RGB.
    pipeline = td.MapDataComponent(
        pipeline,
        lambda raw: np.asarray(Image.open(io.BytesIO(raw)).convert('RGB')),
        0)
    pipeline = td.AugmentImageComponent(pipeline, augmentors)
    pipeline = td.PrefetchDataZMQ(pipeline, num_workers)
    # `remainder` controls whether a final partial batch is emitted.
    self.ds = td.BatchData(pipeline, batch_size, remainder=remainder)
    self.ds.reset_state()
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    """Load an LMDB file and return ``(dataflow, dataset_size)``.

    Training adds local shuffling, a prefetch buffer, and an 8-process ZMQ
    fan-out; the flow is repeated forever in both modes.
    """
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    num_examples = flow.size()
    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
        flow = dataflow.PrefetchData(flow, num_prefetch=500, num_proc=1)
    # Project-local BatchData also resamples points to the given sizes.
    flow = BatchData(flow, batch_size, input_size, output_size)
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, num_proc=8)
    flow = dataflow.RepeatedData(flow, -1)  # infinite epochs
    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()
    flow.reset_state()
    return flow, num_examples
def lmdb_dataflow(lmdb_path, batch_size, sample_size, is_training, test_speed=False, train_perturb_list=None, valid_perturb_list=None, so3_perturb=False, use_partial=False):
    """Load an LMDB file, apply PreprocessData perturbations, and return
    ``(dataflow, dataset_size)``.

    Training adds local shuffling, prefetching, and an 8-process ZMQ
    fan-out before batching; the flow repeats forever.
    """
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    num_examples = flow.size()
    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
        flow = dataflow.PrefetchData(flow, nr_prefetch=500, nr_proc=1)
    # Project-local preprocessing: sampling plus optional perturbations.
    flow = PreprocessData(flow, sample_size, is_training,
                          train_perturb_list=train_perturb_list,
                          valid_perturb_list=valid_perturb_list,
                          so3_perturb=so3_perturb,
                          use_partial=use_partial)
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, nr_proc=8)
    # use_list=True keeps components as Python lists instead of stacking.
    flow = dataflow.BatchData(flow, batch_size, use_list=True)
    flow = dataflow.RepeatedData(flow, -1)  # infinite epochs
    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()
    flow.reset_state()
    return flow, num_examples
def data_pipe(fmri_files, confound_files, label_matrix, target_name=None, batch_size=32, data_type='train', train_percent=0.8, nr_thread=nr_thread, buffer_size=buffer_size):
    """Generator yielding ``(images, one_hot_labels)`` batches of fMRI data.

    Builds a multi-threaded tensorpack pipeline: load volumes, split into
    samples, shuffle locally, batch, then prefetch in a ZMQ subprocess.

    Fix: ``time.clock()`` was removed in Python 3.8 — replaced with
    ``time.perf_counter()`` for the elapsed-time report.
    """
    assert data_type in ['train', 'val', 'test']
    assert fmri_files is not None
    print('\n\nGenerating dataflow for %s datasets \n' % data_type)
    # Never use more buffer slots / threads than there are input files.
    buffer_size = min(len(fmri_files), buffer_size)
    nr_thread = min(len(fmri_files), nr_thread)
    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type, train_percent=train_percent)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' % (nr_thread, buffer_size))
    if target_name is None:
        target_name = np.unique(label_matrix)
    # time.clock() was removed in Python 3.8; perf_counter is the modern
    # monotonic replacement for elapsed-time measurement.
    start_time = time.perf_counter()
    ds1 = dataflow.MultiThreadMapData(
        ds0, nr_thread=nr_thread,
        map_func=lambda dp: map_load_fmri_image(dp, target_name),
        buffer_size=buffer_size, strict=True)
    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))
    ds1 = dataflow.LocallyShuffleData(ds1, buffer_size=ds1.size() * buffer_size)
    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)
    print('Time Usage of loading data in seconds: {} \n'.format(time.perf_counter() - start_time))
    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    # NOTE(review): _reset_once is a private tensorpack API; kept for
    # behavior parity — consider reset_state() if semantics allow.
    ds1._reset_once()
    for df in ds1.get_data():
        # Add a trailing channel axis and one-hot encode the labels.
        yield (np.expand_dims(df[0].astype('float32'), axis=3),
               to_categorical(df[1].astype('int32'), len(target_name)))
def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", predict_feature=False, batch_size=512, shuffle=False, num_workers=25, cache=10000, drop_last=False, cuda=False, distributed=False, visualization=False, span_mask=False, cond_mask=False, region_len=36): if dist.is_available() and distributed: rank = dist.get_rank() lmdb_file = os.path.join( corpus_path, "training_feat_part_" + str(rank) + ".lmdb") else: lmdb_file = os.path.join(corpus_path, "training_feat_all.lmdb") caption_path = os.path.join(corpus_path, "caption_train.json") print("Loading from %s" % lmdb_file) os.listdir(corpus_path) ds = td.LMDBSerializer.load(lmdb_file, shuffle=False) self.num_dataset = len(ds) self.cond_mask = cond_mask preprocess_function = BertPreprocessBatch( caption_path, tokenizer, seq_len, region_len, self.num_dataset, encoding="utf-8", predict_feature=predict_feature, span_mask=span_mask, cond_mask=cond_mask) # ds = td.LocallyShuffleData(ds, cache) ds = td.PrefetchData(ds, 5000, 1) ds = td.MapData(ds, preprocess_function) # self.ds = td.PrefetchData(ds, 1) ds = td.PrefetchDataZMQ(ds, num_workers) self.ds = td.BatchData(ds, batch_size) # self.ds = ds self.ds.reset_state() self.batch_size = batch_size self.num_workers = num_workers
def __init__(
    self,
    corpus_path,
    tokenizer,
    seq_len,
    encoding="utf-8",
    predict_feature=False,
    hard_negative=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=50000,
    drop_last=False,
    cuda=False,
    distributed=False,
    visualization=False,
):
    """Build a Conceptual Captions pretraining dataflow from hard-coded
    cluster paths (per-rank shards when distributed).

    NOTE(review): LMDB and caption paths are absolute cluster paths, not
    derived from ``corpus_path``.
    """
    if dist.is_available() and distributed:
        num_replicas = dist.get_world_size()
        rank = dist.get_rank()
        lmdb_file = "/coc/dataset/conceptual_caption/training_feat_part_" + str(rank) + ".lmdb"
    else:
        lmdb_file = "/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/training_feat_all.lmdb"
    caption_path = "/coc/pskynet2/jlu347/multi-modal-bert/data/conceptual_caption/caption_train.json"
    print("Loading from %s" % lmdb_file)
    pipeline = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(pipeline)
    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        36,  # fixed number of image regions per example
        self.num_dataset,
        encoding="utf-8",
        predict_feature=predict_feature,
    )
    # Window-shuffle, prefetch, preprocess, fan out, then batch.
    pipeline = td.LocallyShuffleData(pipeline, cache)
    pipeline = td.PrefetchData(pipeline, 5000, 1)
    pipeline = td.MapData(pipeline, preprocess_function)
    pipeline = td.PrefetchDataZMQ(pipeline, num_workers)
    self.ds = td.BatchData(pipeline, batch_size)
    self.ds.reset_state()
    self.batch_size = batch_size
    self.num_workers = num_workers
def __init__(self, datafile, batch_size, num_workers=1, nviews=12, reset=True, augment=False, filter_classes=None, filter_views=None, polarmode='cartesian', shuffle=True, filter_ids=None, label_to0idx=False, rgb=False, force_res=0, autocrop=False, keep_aspect_ratio=False):
    """Build one batched dataflow per LMDB file in ``datafile``.

    ``datafile`` may be a single path or a list; the resulting list of
    pipelines is stored in ``self.ds``.
    """
    self.filter_classes = filter_classes
    self.filter_views = filter_views
    self.filter_ids = filter_ids
    self.polarmode = polarmode
    self.label_to0idx = label_to0idx
    self.rgb = rgb
    self.force_res = force_res
    self.autocrop = autocrop
    self.keep_aspect_ratio = keep_aspect_ratio
    # Normalize to a list so one or many LMDBs are handled uniformly.
    if not isinstance(datafile, list):
        datafile = [datafile]
    flows = []
    for path in datafile:
        flow = df.LMDBSerializer.load(path, shuffle=shuffle)
        if shuffle:
            flow = df.LocallyShuffleData(flow, 100)
        flow = df.PrefetchData(flow, 20, 1)
        flow = df.MapData(flow, self.load)
        if augment:
            flow = df.MapDataComponent(flow, LMDBMultiView._augment, 0)
        if not filter_classes and not filter_ids and num_workers > 1:
            # warning: skipping this is slower when filtering datasets
            # but epoch counting will be wrong otherwise
            flow = df.PrefetchDataZMQ(flow, num_workers)
        flow = df.BatchData(flow, batch_size)
        if reset:
            flow.reset_state()
        flows.append(flow)
    self.ds = flows
def __init__(
    self,
    annotations_path,
    features_path,
    tokenizer,
    bert_model,
    seq_len,
    batch_size=512,
    num_workers=25,
    cache=10000,
    local_rank=-1,
    objective=0,
    num_locs=5,
    add_global_imgfeat=None,
):
    """Build a caption-pretraining dataflow from LMDB feature files under
    ``features_path`` (per-rank shards when distributed).
    """
    # Distributed runs read a per-rank shard; otherwise the combined file.
    if dist.is_available() and local_rank != -1:
        rank = dist.get_rank()
        lmdb_file = os.path.join(
            features_path, "training_feat_part_" + str(rank) + ".lmdb")
    else:
        lmdb_file = os.path.join(features_path, "training_feat_all.lmdb")
    print("Loading from %s" % lmdb_file)
    pipeline = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(pipeline)
    # Window-shuffle before preprocessing.
    pipeline = td.LocallyShuffleData(pipeline, cache)
    caption_path = os.path.join(annotations_path, "caption_train.json")
    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,  # fixed number of image regions per example
        self.num_dataset,
        objective=objective,
        num_locs=num_locs,
    )
    pipeline = td.PrefetchData(pipeline, 5000, 1)
    pipeline = td.MapData(pipeline, preprocess_function)
    pipeline = td.PrefetchDataZMQ(pipeline, num_workers)
    self.ds = td.BatchData(pipeline, batch_size)
    self.ds.reset_state()
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.add_global_imgfeat = add_global_imgfeat
    self.num_locs = num_locs
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    """Load an LMDB file and return ``(dataflow, dataset_size)`` batches.

    Training adds window shuffling, a prefetch buffer, and an 8-process
    ZMQ fan-out; the flow repeats forever in both modes.
    """
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    num_examples = flow.size()
    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
        flow = dataflow.PrefetchData(flow, nr_prefetch=500, nr_proc=1)
    # Project-local BatchData also resamples points to the given sizes.
    flow = BatchData(flow, batch_size, input_size, output_size)
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, nr_proc=8)
    flow = dataflow.RepeatedData(flow, -1)  # infinite epochs
    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()
    flow.reset_state()
    return flow, num_examples
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False):
    """Load raw LMDB records, preprocess, and return ``(dataflow, size)``.

    Uses the LMDBData + LMDBDataPoint pair (records are deserialized after
    the optional shuffle/prefetch stage); the flow repeats forever.
    """
    flow = dataflow.LMDBData(lmdb_path, shuffle=False)
    num_examples = flow.size()
    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
        flow = dataflow.PrefetchData(flow, nr_prefetch=500, nr_proc=1)
    # Deserialize raw LMDB records into datapoints.
    flow = dataflow.LMDBDataPoint(flow)
    flow = PreprocessData(flow, input_size, output_size)
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, nr_proc=8)
    # use_list=True keeps components as Python lists instead of stacking.
    flow = dataflow.BatchData(flow, batch_size, use_list=True)
    flow = dataflow.RepeatedData(flow, -1)  # infinite epochs
    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()
    flow.reset_state()
    return flow, num_examples
def init_dataflow(ctfstar, batch_size):
    """Create a dataflow that reads and preprocesses micrographs in parallel.

    One partitioned generator is built per batch element; each is
    normalized, prefetched in its own process, randomly mixed, and batched.
    Returns ``(dataflow, shape)``.
    """
    normalizer = df.imgaug.AugmentorList([df.imgaug.MeanVarianceNormalize()])
    # One generator per element in a batch.
    partitions, shape = MicrosGenerator.create_partition(ctfstar, batch_size)
    mapped = []
    for part in partitions:
        mapped.append(df.MapData(
            part,
            lambda dp: [normalizer.augment(preprocess_micro(dp[0], dp[1], psize, bn)),
                        np.array(dp[0])]))
    # Prefetch each generator in a separate process, 4 images buffered each.
    mapped = [df.PrefetchData(flow, nr_prefetch=4, nr_proc=1) for flow in mapped]
    # Merge all partition flows in random order, then batch.
    mixed = df.RandomMixData(mapped)
    batched = df.BatchData(mixed, batch_size)
    batched.reset_state()
    return batched, shape
def __init__(self, config, dataset_mode):
    """Set the path for Data.

    Loads ``<data_folder><dataset_mode>.lmdb`` and, in training mode,
    wraps it with local shuffling and a prefetch buffer.

    NOTE(review): guard scope reconstructed from a collapsed source line —
    reset_state() is applied unconditionally here.
    """
    self.data_folder = config.data_folder
    self.num_input_points = config.num_input_points
    self.num_gt_points = config.num_gt_points
    self.dataset_mode = dataset_mode
    lmdb_path = self.data_folder + self.dataset_mode + '.lmdb'
    print(lmdb_path)
    self.df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    if config.mode == "train":
        self.df = dataflow.LocallyShuffleData(self.df, buffer_size=2000)
        self.df = dataflow.PrefetchData(self.df, nr_prefetch=500, nr_proc=1)
    self.df.reset_state()
def __init__(self, split, batch_size, set_size):
    """Build an infinite batched generator over the ModelNet40 LMDB.

    ``set_size`` is divided 1/8 vs 7/8 between the two point-set
    components passed to the project-local BatchData.
    """
    # Pick the train or test LMDB for this split.
    if split == 'train':
        lmdb_path = f'{data_path}/ModelNet40_train_1024_middle.lmdb'
    else:
        lmdb_path = f'{data_path}/ModelNet40_test_1024_middle.lmdb'
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    self.size = flow.size()
    self.num_batches = self.size // batch_size
    if split == 'train':
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
        flow = dataflow.PrefetchData(flow, num_prefetch=500, num_proc=1)
    flow = BatchData(flow, batch_size, set_size // 8, set_size - set_size // 8)
    if split == 'train':
        flow = dataflow.PrefetchDataZMQ(flow, num_proc=8)
    flow = dataflow.RepeatedData(flow, -1)  # infinite epochs
    flow.reset_state()
    self.generator = flow.get_data()
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training, test_speed=False, filter_rate=0):
    """Load an LMDB file and return ``(dataflow, dataset_size)``, appending
    a uniform random value to each datapoint.

    NOTE(review): ``filter_rate`` is accepted but not used inside this
    function; the appended random component is presumably consumed
    downstream — confirm against callers.
    """
    flow = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    # Append one random float per datapoint (copying the original fields).
    flow = dataflow.MapData(flow, lambda dp: list(dp) + [random.random()])
    num_examples = flow.size()
    print(num_examples)
    if is_training:
        flow = dataflow.LocallyShuffleData(flow, buffer_size=2000)
        flow = dataflow.PrefetchData(flow, nr_prefetch=500, nr_proc=1)
    # Project-local BatchData also resamples points to the given sizes.
    flow = BatchData(flow, batch_size, input_size, output_size)
    if is_training:
        flow = dataflow.PrefetchDataZMQ(flow, nr_proc=8)
    flow = dataflow.RepeatedData(flow, -1)  # infinite epochs
    if test_speed:
        dataflow.TestDataSpeed(flow, size=1000).start()
    flow.reset_state()
    return flow, num_examples
import tensorpack.dataflow as df

if __name__ == '__main__':
    # MNIST training set with a randomized augmentation stack, streamed
    # over ZMQ to a consumer listening on localhost:2222.
    pipeline = df.dataset.Mnist('train')
    augmentation_stack = [
        # Random resize / rotation / noise, each applied with a probability.
        df.imgaug.RandomApplyAug(
            df.imgaug.RandomResize((0.8, 1.2), (0.8, 1.2)), 0.3),
        df.imgaug.RandomApplyAug(df.imgaug.RotationAndCropValid(15), 0.5),
        df.imgaug.RandomApplyAug(
            df.imgaug.SaltPepperNoise(white_prob=0.01, black_prob=0.01), 0.25),
        # Normalize back to 28x28, pad to 32x32, then random-crop to 28x28.
        df.imgaug.Resize((28, 28)),
        df.imgaug.CenterPaste((32, 32)),
        df.imgaug.RandomCrop((28, 28)),
        # Add the trailing channel axis expected downstream.
        df.imgaug.MapImage(lambda img: img.reshape(28, 28, 1)),
    ]
    pipeline = df.AugmentImageComponent(pipeline, augmentation_stack)
    pipeline = df.BatchData(pipeline, batch_size=32, remainder=False)
    pipeline = df.PrefetchData(pipeline, nr_prefetch=12, nr_proc=2)
    pipeline = df.PrintData(pipeline)
    df.send_dataflow_zmq(pipeline, 'tcp://localhost:2222')
def __init__(
    self,
    corpus_path,
    tokenizer,
    bert_model,
    seq_len,
    encoding="utf-8",
    visual_target=0,
    hard_negative=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=10000,
    drop_last=False,
    cuda=False,
    local_rank=-1,
    objective=0,
    visualization=False,
):
    """Build a pretraining dataflow over GQA region features.

    Non-distributed runs read the fixed GQA feature LMDB under
    ``corpus_path``; distributed runs read per-rank shards.
    """
    TRAIN_DATASET_SIZE = 3119449
    if dist.is_available() and local_rank != -1:
        num_replicas = dist.get_world_size()
        rank = dist.get_rank()
        lmdb_file = os.path.join(
            corpus_path, "training_feat_part_" + str(rank) + ".lmdb")
    else:
        lmdb_file = os.path.join(corpus_path, "gqa_resnext152_faster_rcnn_genome.lmdb")
    print("Loading from %s" % lmdb_file)
    pipeline = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(pipeline)
    # Window-shuffle before preprocessing.
    pipeline = td.LocallyShuffleData(pipeline, cache)
    caption_path = os.path.join(corpus_path, "caption_train.json")
    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,  # fixed number of image regions per example
        self.num_dataset,
        encoding="utf-8",
        visual_target=visual_target,
        objective=objective,
    )
    pipeline = td.PrefetchData(pipeline, 5000, 1)
    pipeline = td.MapData(pipeline, preprocess_function)
    pipeline = td.PrefetchDataZMQ(pipeline, num_workers)
    self.ds = td.BatchData(pipeline, batch_size)
    self.ds.reset_state()
    self.batch_size = batch_size
    self.num_workers = num_workers
def data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix, target_name=None, flag_cnn='3d', block_dura=1, hrf_delay=0, batch_size=32, data_type='train', nr_thread=4, buffer_size=10, dataselect_percent=1.0, seed=814, verbose=0):
    """Generator yielding ``(blocks, one_hot_labels)`` batches of fMRI data
    for a 2D or 3D CNN.

    Fix: ``time.clock()`` was removed in Python 3.8 — replaced with
    ``time.perf_counter()`` for the elapsed-time report. (The severed
    print string from the collapsed source is also restored here.)
    """
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None
    isTrain = data_type == 'train'
    isVal = data_type == 'val'
    isTest = data_type == 'test'
    # Never use more buffer slots / threads than there are input files.
    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))
    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type, seed=seed)
    if target_name is None:
        target_name = np.unique(label_matrix)
    # time.clock() was removed in Python 3.8; perf_counter is the modern
    # monotonic replacement for elapsed-time measurement.
    start_time = time.perf_counter()
    # Load blocks of volumes in parallel; the map function depends on the
    # CNN dimensionality.
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura, hrf_delay=hrf_delay),
            buffer_size=buffer_size, strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura, hrf_delay=hrf_delay),
            buffer_size=buffer_size, strict=True)
    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1, subject_num=len(fmri_files),
                        batch_size=batch_size,
                        dataselect_percent=dataselect_percent)
    dataflowSize = ds1.size()
    if isTrain:
        if verbose:
            print('%d #Trials/Samples per subject with %d channels in tc'
                  % (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        # Shuffle over a window of whole subjects' worth of trials.
        ds1 = dataflow.LocallyShuffleData(
            ds1, buffer_size=Trial_Num * buffer_size,
            shuffle_interval=Trial_Num * buffer_size // 2)
    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)
    if verbose:
        print('\n\nGenerating dataflow for %s datasets \n' % data_type)
        print('dataflowSize is ' + str(ds0.size()))
        print('Loading data using %d threads with %d buffer_size ... \n' % (nr_thread, buffer_size))
        print('prefetch dataflowSize is ' + str(dataflowSize))
        print('Time Usage of loading data in seconds: {} \n'.format(
            time.perf_counter() - start_time))
    # Training fans out to nr_thread ZMQ processes; eval uses a single one.
    if isTrain:
        ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=nr_thread)
    else:
        ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    ds1.reset_state()
    for df in ds1.get_data():
        # one_hot uses len(target_name) + 1 classes (reserved background/0).
        yield (df[0].astype('float32'),
               one_hot(df[1], len(target_name) + 1).astype('uint8'))
def data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix, target_name=None, flag_cnn='3d', block_dura=1, batch_size=32, data_type='train', nr_thread=nr_thread, buffer_size=buffer_size):
    """Build and return a batched tensorpack dataflow of fMRI blocks for a
    2D or 3D CNN (the caller iterates / resets the returned flow).

    Fix: ``time.clock()`` was removed in Python 3.8 — replaced with
    ``time.perf_counter()``. The dead triple-quoted statement at the end
    (a commented-out yield loop) was removed.
    """
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None
    isTrain = data_type == 'train'
    isVal = data_type == 'val'
    print('\n\nGenerating dataflow for %s datasets \n' % data_type)
    # Never use more buffer slots / threads than there are input files.
    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))
    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' % (nr_thread, buffer_size))
    if target_name is None:
        target_name = np.unique(label_matrix)
    # time.clock() was removed in Python 3.8; perf_counter is the modern
    # monotonic replacement for elapsed-time measurement.
    start_time = time.perf_counter()
    # Load blocks of volumes in parallel; the map function depends on the
    # CNN dimensionality.
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size, strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size, strict=True)
    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))
    if isTrain:
        print('%d #Trials/Samples per subject with %d channels in tc'
              % (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        # Shuffle over a window of whole subjects' worth of trials.
        ds1 = dataflow.LocallyShuffleData(
            ds1, buffer_size=Trial_Num * buffer_size,
            shuffle_interval=Trial_Num * buffer_size)
    ds1 = dataflow.BatchData(ds1, batch_size=batch_size, remainder=True)
    print('Time Usage of loading data in seconds: {} \n'.format(
        time.perf_counter() - start_time))
    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    # NOTE(review): the flow is returned without reset_state(); the caller
    # is expected to reset it before iterating.
    return ds1