def dataflow(centroids, num_reference=3, num_process=16, shuffle=True):
    ds = Kinetics('/data/public/rw/datasets/videos/kinetics',
                  num_frames=num_reference + 1,
                  skips=[0, 4, 4, 8][:num_reference + 1],
                  shuffle=shuffle)
    ds = df.MapDataComponent(ds, ImageProcess.resize(small_axis=256), index=1)
    ds = df.MapDataComponent(ds, ImageProcess.crop(shape=(256, 256)), index=1)
    # ds = df.MapDataComponent(ds, lambda images: [cv2.resize(image, (256, 256)) for image in images], index=1)

    # split into (references, copy of references, target, copy of target)
    ds = df.MapData(ds, lambda dp: [
        dp[1][:num_reference],
        copy.deepcopy(dp[1][:num_reference]),
        dp[1][num_reference:],
        copy.deepcopy(dp[1][num_reference:]),
    ])

    # for images (ref, target): grayscale (256, 256, 1) inputs
    for idx in [0, 2]:
        ds = df.MapDataComponent(ds, lambda images: [
            cv2.cvtColor(image, cv2.COLOR_BGR2GRAY).reshape(256, 256, 1)
            for image in images
        ], index=idx)

    # for labels (ref, target): 32x32 'ab' channels quantized to centroid indices
    for idx in [1, 3]:
        ds = df.MapDataComponent(ds, lambda images: [
            cv2.resize(image, (32, 32)) for image in images
        ], index=idx)
        ds = df.MapDataComponent(ds, lambda images: [
            cv2.cvtColor(np.float32(image / 255.0), cv2.COLOR_BGR2Lab)[:, :, 1:]
            for image in images
        ], index=idx)
        ds = df.MapDataComponent(ds, lambda images: [
            np.array([
                np.argmin(np.linalg.norm(centroids - v, axis=1))
                for v in image.reshape((-1, 2))
            ]).reshape((32, 32, 1))
            for image in images
        ], index=idx)

    # stack for tensor
    ds = df.MapData(ds, lambda dp: [
        np.stack(dp[0] + dp[2], axis=0),
        np.stack(dp[1] + dp[3], axis=0),
    ])
    ds = df.MapData(ds, tuple)  # for tensorflow.data.dataset

    ds = df.MultiProcessPrefetchData(ds, nr_prefetch=256, nr_proc=num_process)
    ds = df.PrefetchDataZMQ(ds, nr_proc=1)
    return ds

def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training,
                  test_speed=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, num_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size

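# Usage sketch (added; not from the original source): consuming the
# (dataflow, size) pair returned above. The path, batch size, and point
# counts below are illustrative placeholders.
train_df, num_train = lmdb_dataflow('data/train.lmdb', batch_size=32,
                                    input_size=2048, output_size=16384,
                                    is_training=True)
steps_per_epoch = num_train // 32
gen = train_df.get_data()  # infinite generator, courtesy of RepeatedData(df, -1)
for step in range(steps_per_epoch):
    batch = next(gen)  # one batch, shaped by the custom BatchData above
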
def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25,
             cache=50000, collate_fn=default_collate, drop_last=False,
             cuda=False):
    # enumerate standard imagenet augmentors
    imagenet_augmentors = fbresnet_augmentor(mode == 'train')

    # load the lmdb if we can find it
    lmdb_loc = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)
    ds = td.LMDBData(lmdb_loc, shuffle=False)
    ds = td.LocallyShuffleData(ds, cache)
    ds = td.PrefetchData(ds, 5000, 1)
    ds = td.LMDBDataPoint(ds)
    ds = td.MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
    ds = td.AugmentImageComponent(ds, imagenet_augmentors)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda

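# Hypothetical companion methods (added; not in the original snippet). A class
# wrapping a tensorpack BatchData this way typically iterates over
# ds.get_data() and reports its length in batches:
def __iter__(self):
    for batch in self.ds.get_data():
        yield batch  # [images, labels] as stacked numpy arrays

def __len__(self):
    return self.ds.size()  # number of batches per epoch
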
def __init__(self, mode, batch_size=256, shuffle=False, num_workers=25,
             cache=50000, collate_fn=default_collate, remainder=False,
             cuda=False, transform=None):
    # standard imagenet augmentors replaced by a torchvision transform wrapper
    # imagenet_augmentors = fbresnet_augmentor(mode == 'train')
    imagenet_augmentors = [ImgAugTVCompose(transform)]

    # load the lmdb if we can find it
    lmdb_loc = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)
    ds = td.LMDBData(lmdb_loc, shuffle=False)
    if mode == 'train':
        ds = td.LocallyShuffleData(ds, cache)
    ds = td.PrefetchData(ds, 5000, 1)
    ds = td.LMDBDataPoint(ds)
    # decode with PIL instead of cv2:
    # ds = td.MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
    ds = td.MapDataComponent(
        ds, lambda x: np.asarray(Image.open(io.BytesIO(x)).convert('RGB')), 0)
    ds = td.AugmentImageComponent(ds, imagenet_augmentors)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size, remainder=remainder)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.cuda = cuda

def lmdb_dataflow(lmdb_path, batch_size, sample_size, is_training,
                  test_speed=False, train_perturb_list=None,
                  valid_perturb_list=None, so3_perturb=False,
                  use_partial=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = PreprocessData(df, sample_size, is_training,
                        train_perturb_list=train_perturb_list,
                        valid_perturb_list=valid_perturb_list,
                        so3_perturb=so3_perturb, use_partial=use_partial)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size

def data_pipe(fmri_files, confound_files, label_matrix, target_name=None,
              batch_size=32, data_type='train', train_percent=0.8,
              nr_thread=nr_thread, buffer_size=buffer_size):
    assert data_type in ['train', 'val', 'test']
    assert fmri_files is not None

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    buffer_size = min(len(fmri_files), buffer_size)
    nr_thread = min(len(fmri_files), nr_thread)

    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type, train_percent=train_percent)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' %
          (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)

    ####running the model
    start_time = time.time()  # time.clock() was removed in Python 3.8
    ds1 = dataflow.MultiThreadMapData(
        ds0, nr_thread=nr_thread,
        map_func=lambda dp: map_load_fmri_image(dp, target_name),
        buffer_size=buffer_size, strict=True)
    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    ds1 = dataflow.LocallyShuffleData(ds1, buffer_size=ds1.size() * buffer_size)
    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)
    print('Time Usage of loading data in seconds: {} \n'.format(
        time.time() - start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    ds1._reset_once()
    # ds1.reset_state()

    for df in ds1.get_data():
        yield (np.expand_dims(df[0].astype('float32'), axis=3),
               to_categorical(df[1].astype('int32'), len(target_name)))

def __init__(
    self,
    corpus_path,
    tokenizer,
    seq_len,
    encoding='utf-8',
    predict_feature=False,
    hard_negative=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=50000,
    drop_last=False,
    cuda=False,
    distributed=False,
    visualization=False,
):
    if dist.is_available() and distributed:
        # num_replicas = dist.get_world_size()
        rank = dist.get_rank()
        lmdb_file = ('/mnt3/xuesheng/features_lmdb/CC/training_feat_part_' +
                     str(rank) + '.lmdb')
        # fallback location:
        # lmdb_file = "/srv/share/datasets/conceptual_caption/training_feat_part_" + str(rank) + ".lmdb"
    else:
        lmdb_file = '/mnt3/xuesheng/features_lmdb/CC/training_feat_part_0.lmdb'
    caption_path = '/mnt3/xuesheng/features_lmdb/CC/caption_train.json'

    print('Loading from %s' % lmdb_file)
    ds = td.LMDBSerializer.load(lmdb_file, shuffle=True)
    self.num_dataset = len(ds)

    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        seq_len,
        36,
        self.num_dataset,
        encoding='utf-8',
        predict_feature=predict_feature,
    )

    # ds = td.LocallyShuffleData(ds, cache)
    # ds = td.PrefetchData(ds, 5000, 1)
    ds = td.MapData(ds, preprocess_function)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers

def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8",
             predict_feature=False, batch_size=512, shuffle=False,
             num_workers=25, cache=10000, drop_last=False, cuda=False,
             distributed=False, visualization=False, span_mask=False,
             cond_mask=False, region_len=36):
    if dist.is_available() and distributed:
        rank = dist.get_rank()
        lmdb_file = os.path.join(corpus_path,
                                 "training_feat_part_" + str(rank) + ".lmdb")
    else:
        lmdb_file = os.path.join(corpus_path, "training_feat_all.lmdb")
    caption_path = os.path.join(corpus_path, "caption_train.json")

    print("Loading from %s" % lmdb_file)
    os.listdir(corpus_path)
    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)
    self.cond_mask = cond_mask

    preprocess_function = BertPreprocessBatch(
        caption_path, tokenizer, seq_len, region_len, self.num_dataset,
        encoding="utf-8", predict_feature=predict_feature,
        span_mask=span_mask, cond_mask=cond_mask)

    # ds = td.LocallyShuffleData(ds, cache)
    ds = td.PrefetchData(ds, 5000, 1)
    ds = td.MapData(ds, preprocess_function)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers

def __init__(self, datafile, batch_size, num_workers=1, nviews=12, reset=True,
             augment=False, filter_classes=None, filter_views=None,
             polarmode='cartesian', shuffle=True, filter_ids=None,
             label_to0idx=False, rgb=False, force_res=0, autocrop=False,
             keep_aspect_ratio=False):
    self.filter_classes = filter_classes
    self.filter_views = filter_views
    self.filter_ids = filter_ids
    self.polarmode = polarmode
    self.label_to0idx = label_to0idx
    self.rgb = rgb
    self.force_res = force_res
    self.autocrop = autocrop
    self.keep_aspect_ratio = keep_aspect_ratio

    if not isinstance(datafile, list):
        datafile = [datafile]

    ds = []
    for d in datafile:
        ds.append(df.LMDBSerializer.load(d, shuffle=shuffle))
        if shuffle:
            ds[-1] = df.LocallyShuffleData(ds[-1], 100)
        ds[-1] = df.PrefetchData(ds[-1], 20, 1)
        ds[-1] = df.MapData(ds[-1], self.load)
        if augment:
            ds[-1] = df.MapDataComponent(ds[-1], LMDBMultiView._augment, 0)
        if not filter_classes and not filter_ids and num_workers > 1:
            # warning: skipping this is slower when filtering datasets
            # but epoch counting will be wrong otherwise
            ds[-1] = df.PrefetchDataZMQ(ds[-1], num_workers)
        ds[-1] = df.BatchData(ds[-1], batch_size)
        if reset:
            ds[-1].reset_state()
    self.ds = ds

def __init__(self, num_split):
    lmdb_file = "/srv/share/vgoswami8/conceptual_captions/training_feat_all.lmdb"
    caption_path = "/srv/share/vgoswami8/conceptual_captions/caption_train.json"

    print("Loading from %s" % lmdb_file)
    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = int(len(ds) / num_split) + 1
    ds = td.PrefetchDataZMQ(ds, nr_proc=1)
    ds = td.FixedSizeData(ds, self.num_dataset, keep_state=True)
    self.ds = ds
    self.ds.reset_state()

def get_dataflows(config):
    """Construct and initialize dataflows based on config."""
    df = ExpertDataflow(config)
    df = tp_dataflow.PrefetchDataZMQ(df, nr_proc=16)
    df = tp_dataflow.BatchData(df, config['batch_size'], remainder=False)
    # initialize random number generator in child processes to unique values
    df.reset_state()
    return df

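# Usage sketch (added; illustrative only). get_dataflows() already calls
# reset_state(), which re-seeds the RNG in each worker process forked by
# PrefetchDataZMQ, so the returned flow can be iterated directly. The config
# dict is hypothetical; ExpertDataflow will expect its own keys.
config = {'batch_size': 64}
flow = get_dataflows(config)
for batch in flow.get_data():
    pass  # each element is a batch of config['batch_size'] datapoints
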
def __init__(
    self,
    annotations_path,
    features_path,
    tokenizer,
    bert_model,
    seq_len,
    batch_size=512,
    num_workers=25,
    cache=10000,
    local_rank=-1,
    objective=0,
    num_locs=5,
    add_global_imgfeat=None,
):
    if dist.is_available() and local_rank != -1:
        rank = dist.get_rank()
        lmdb_file = os.path.join(features_path,
                                 "training_feat_part_" + str(rank) + ".lmdb")
    else:
        lmdb_file = os.path.join(features_path, "training_feat_all.lmdb")

    print("Loading from %s" % lmdb_file)
    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)
    ds = td.LocallyShuffleData(ds, cache)

    caption_path = os.path.join(annotations_path, "caption_train.json")
    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,
        self.num_dataset,
        objective=objective,
        num_locs=num_locs,
    )

    ds = td.PrefetchData(ds, 5000, 1)
    ds = td.MapData(ds, preprocess_function)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.add_global_imgfeat = add_global_imgfeat
    self.num_locs = num_locs

def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training,
                  test_speed=False):
    """Load an LMDB file and generate batches."""
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)  # single prefetch process
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size

def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training,
                  test_speed=False):
    df = dataflow.LMDBData(lmdb_path, shuffle=False)
    size = df.size()
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = dataflow.LMDBDataPoint(df)
    df = PreprocessData(df, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size

def __init__(self, split, batch_size, set_size):
    if split == 'train':
        lmdb_path = f'{data_path}/ModelNet40_train_1024_middle.lmdb'
    else:
        lmdb_path = f'{data_path}/ModelNet40_test_1024_middle.lmdb'
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    self.size = df.size()
    self.num_batches = self.size // batch_size

    if split == 'train':
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, num_prefetch=500, num_proc=1)
    df = BatchData(df, batch_size, set_size // 8, set_size - set_size // 8)
    if split == 'train':
        df = dataflow.PrefetchDataZMQ(df, num_proc=8)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    self.generator = df.get_data()

def lmdb_dataflow(lmdb_path, batch_size, num_points, shuffle, task,
                  render=False):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    size = df.size()
    if render:
        df = VirtualRenderData(df)
    if num_points is not None:
        df = ResampleData(df, num_points, task)
    if shuffle:
        df = dataflow.LocallyShuffleData(df, 1000)
        df = dataflow.PrefetchDataZMQ(df, 8)
    df = dataflow.BatchData(df, batch_size, use_list=True)
    df = dataflow.RepeatedData(df, -1)
    df.reset_state()
    return df, size

def process_dataset(args):
    sample_num = args.m
    split = args.dataset
    args.is_test = split == 'test'
    for cat_name in args.c:
        synset_id = synset_ids[cat_name]
        print('Processing', synset_id, cat_name)
        data_dir = os.path.join('../data', split + '_data', synset_id)
        label_dir = os.path.join('../data', split + '_label', synset_id)
        file_names = [
            os.path.splitext(entry)[0]
            for entry in os.listdir(data_dir) if entry.endswith('.pts')
        ]
        df = graph_df(file_names, data_dir, label_dir, args)
        output_path = os.path.join(
            '../data/lmdb', '%s_%d_%s.lmdb' % (cat_name, sample_num, split))
        if os.path.exists(output_path):
            os.system('rm -f %s' % output_path)
        df = dataflow.PrefetchDataZMQ(df, nr_proc=1)
        dftools.dump_dataflow_to_lmdb(df, output_path, write_frequency=args.w)

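# Note (added): dump_dataflow_to_lmdb belongs to the older tensorpack API.
# On recent tensorpack releases the equivalent call, with the same semantics,
# is LMDBSerializer.save:
from tensorpack.dataflow import LMDBSerializer
LMDBSerializer.save(df, output_path, write_frequency=args.w)
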
def lmdb_dataflow(lmdb_path, batch_size, input_size, output_size, is_training,
                  test_speed=False, filter_rate=0):
    df = dataflow.LMDBSerializer.load(lmdb_path, shuffle=False)
    # append a random number to each datapoint
    # (presumably compared against filter_rate downstream)
    df = dataflow.MapData(df, lambda dp: [item for item in dp] + [random.random()])
    size = df.size()
    print(size)
    if is_training:
        df = dataflow.LocallyShuffleData(df, buffer_size=2000)
        df = dataflow.PrefetchData(df, nr_prefetch=500, nr_proc=1)
    df = BatchData(df, batch_size, input_size, output_size)
    if is_training:
        df = dataflow.PrefetchDataZMQ(df, nr_proc=8)
    df = dataflow.RepeatedData(df, -1)
    if test_speed:
        dataflow.TestDataSpeed(df, size=1000).start()
    df.reset_state()
    return df, size

        ]
    elif train_or_valid in ['valid', 'validation']:
        synsets = [
            line.strip() for line in
            open(data_path + '/imagenet_2012_validation_synset_labels.txt').readlines()
        ]
        self.datapoints = [
            [base_path + 'Data/CLS-LOC/val/ILSVRC2012_val_%08d.JPEG' % (i + 1),
             int(self.maps['synset2idx'][synset])]
            for i, synset in enumerate(synsets)
        ]
    else:
        raise ValueError(
            'train_or_valid=%s is an invalid argument; must be "train" or "valid"'
            % train_or_valid)


if __name__ == '__main__':
    import argparse
    import tensorpack.dataflow as df

    parser = argparse.ArgumentParser(
        description='Imagenet Dataset on Kakao Example')
    parser.add_argument('--service-code', type=str, required=True,
                        help='licence key')
    parser.add_argument('--name', type=str, default='train',
                        help='train or valid')
    args = parser.parse_args()

    ds = ILSVRC12(args.service_code, args.name).parallel(num_threads=32)
    if args.name in ['train', 'training']:
        ds = df.PrefetchDataZMQ(ds, nr_proc=2)
    df.TestDataSpeed(ds, size=5000).start()

def __init__(
    self,
    corpus_path,
    tokenizer,
    bert_model,
    seq_len,
    encoding="utf-8",
    visual_target=0,
    hard_negative=False,
    batch_size=512,
    shuffle=False,
    num_workers=25,
    cache=10000,
    drop_last=False,
    cuda=False,
    local_rank=-1,
    objective=0,
    visualization=False,
):
    TRAIN_DATASET_SIZE = 3119449

    if dist.is_available() and local_rank != -1:
        num_replicas = dist.get_world_size()
        rank = dist.get_rank()
        lmdb_file = os.path.join(corpus_path,
                                 "training_feat_part_" + str(rank) + ".lmdb")
    else:
        lmdb_file = os.path.join(corpus_path,
                                 "gqa_resnext152_faster_rcnn_genome.lmdb")
        # lmdb_file = os.path.join(corpus_path, "validation_feat_all.lmdb")

    print("Loading from %s" % lmdb_file)
    ds = td.LMDBSerializer.load(lmdb_file, shuffle=False)
    self.num_dataset = len(ds)
    ds = td.LocallyShuffleData(ds, cache)

    caption_path = os.path.join(corpus_path, "caption_train.json")
    # caption_path = os.path.join(corpus_path, "caption_val.json")
    preprocess_function = BertPreprocessBatch(
        caption_path,
        tokenizer,
        bert_model,
        seq_len,
        36,
        self.num_dataset,
        encoding="utf-8",
        visual_target=visual_target,
        objective=objective,
    )

    ds = td.PrefetchData(ds, 5000, 1)
    ds = td.MapData(ds, preprocess_function)
    ds = td.PrefetchDataZMQ(ds, num_workers)
    self.ds = td.BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers

def dataflow(centroids, num_refs=3, num_process=16, shuffle=False):
    """
    Compute graph to retrieve 3 reference and 1 target frames from Kinetics.

    Downsamples grayscale frames to 256x256 and colorized frames to 32x32
    feature maps in Lab colorspace, then clusters the colors of the colorized
    frames. Returned tensors are of shape (num_refs + 1, 256, 256, 1) and
    (num_refs + 1, 32, 32, 1) respectively. Instead of the colorized output
    itself, the index of the nearest cluster centroid is returned.

    :return: (grayscale input, cluster indices for colorized output)
    """
    config = Config.get_instance()
    kinetics_dirpath = config['data_dir']['kinetics']

    # get frame and 3 prior reference frames with certain number of skips
    data = Kinetics(kinetics_dirpath, num_frames=num_refs + 1,
                    skips=[0, 4, 4, 8][:num_refs + 1], shuffle=shuffle)

    # downsample frames to 256x256
    data = df.MapDataComponent(data, ImageProcessor.resize(small_axis=256), index=1)
    data = df.MapDataComponent(data, ImageProcessor.crop(shape=(256, 256)), index=1)

    # split frames into 3 references and 1 target frame;
    # create deep copies of each at odd indices
    data = df.MapData(data, lambda dp: [
        dp[1][:num_refs],
        copy.deepcopy(dp[1][:num_refs]),
        dp[1][num_refs:],
        copy.deepcopy(dp[1][num_refs:]),
    ])

    # decolorize first set of reference and target frames as (256, 256, 1)
    for idx in [0, 2]:
        data = df.MapDataComponent(data, lambda images: [
            np.int32(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)).reshape(256, 256, 1)
            for image in images
        ], index=idx)

    for idx in [1, 3]:
        # downsample to 32x32 feature map
        data = df.MapDataComponent(data, lambda images: [
            cv2.resize(image, (32, 32)) for image in images
        ], index=idx)
        # discard grayscale L space, keep only 'ab' from Lab color space;
        # scale from 0-255 to 0-1 for clustering in next step
        data = df.MapDataComponent(data, lambda images: [
            cv2.cvtColor(np.float32(image / 255.0), cv2.COLOR_BGR2Lab)[:, :, 1:]
            for image in images
        ], index=idx)
        # find nearest color cluster index for every pixel in ref and target
        data = df.MapDataComponent(data, lambda images: [
            get_cluster_labels(image, centroids) for image in images
        ], index=idx)

    # combine ref and target frames into (num_refs + 1, dim, dim, 1) tensors
    # for the grayscale and colorized feature maps respectively;
    # generates [input tensor, output tensor]
    data = df.MapData(data, lambda dp: [
        np.stack(dp[0] + dp[2], axis=0),
        np.stack(dp[1] + dp[3], axis=0),
    ])

    # important for tensorflow.data.dataset
    # does not do what it is supposed to do
    data = df.MapData(data, tuple)

    # prefetch 256 datapoints
    data = df.MultiProcessPrefetchData(data, nr_prefetch=256, nr_proc=num_process)
    data = df.PrefetchDataZMQ(data, nr_proc=1)
    return data

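# Sketch (added; not from the original source): feeding the dataflow into
# tf.data. The MapData(..., tuple) step above is intended to hand
# Dataset.from_generator the tuples it expects. Shapes follow the docstring;
# the dtypes here are assumptions (int32 grayscale, int64 cluster indices).
import tensorflow as tf

def make_tf_dataset(data, num_refs=3):
    data.reset_state()
    return tf.data.Dataset.from_generator(
        data.get_data,
        output_types=(tf.int32, tf.int64),
        output_shapes=((num_refs + 1, 256, 256, 1),
                       (num_refs + 1, 32, 32, 1)))
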
def data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix,
                          target_name=None, flag_cnn='3d', block_dura=1,
                          hrf_delay=0, batch_size=32, data_type='train',
                          nr_thread=4, buffer_size=10,
                          dataselect_percent=1.0, seed=814, verbose=0):
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None

    isTrain = data_type == 'train'
    isVal = data_type == 'val'
    isTest = data_type == 'test'

    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))

    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type, seed=seed)
    if target_name is None:
        target_name = np.unique(label_matrix)
    ##Subject_Num, Trial_Num = np.array(label_matrix).shape

    ####running the model
    start_time = time.time()  # time.clock() was removed in Python 3.8
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura, hrf_delay=hrf_delay),
            buffer_size=buffer_size, strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura, hrf_delay=hrf_delay),
            buffer_size=buffer_size, strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1, subject_num=len(fmri_files),
                        batch_size=batch_size,
                        dataselect_percent=dataselect_percent)
    dataflowSize = ds1.size()

    if isTrain:
        if verbose:
            print('%d #Trials/Samples per subject with %d channels in tc' %
                  (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        ds1 = dataflow.LocallyShuffleData(
            ds1, buffer_size=Trial_Num * buffer_size,
            shuffle_interval=Trial_Num * buffer_size // 2)

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size)

    if verbose:
        print('\n\nGenerating dataflow for %s datasets \n' % data_type)
        print('dataflowSize is ' + str(ds0.size()))
        print('Loading data using %d threads with %d buffer_size ... \n' %
              (nr_thread, buffer_size))
        print('prefetch dataflowSize is ' + str(dataflowSize))
        print('Time Usage of loading data in seconds: {} \n'.format(
            time.time() - start_time))

    if isTrain:
        ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=nr_thread)
    else:
        ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    ds1.reset_state()

    for df in ds1.get_data():
        yield (df[0].astype('float32'),
               one_hot(df[1], len(target_name) + 1).astype('uint8'))
###end of tensorpack: multithread
##############################################################

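# Usage sketch (added; illustrative only): data_pipe_3dcnn_block is itself a
# generator yielding (x, y) batches, so it can feed Keras directly. 'model'
# is assumed to be a compiled keras.Model; the arguments are placeholders.
train_gen = data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix,
                                  flag_cnn='3d', block_dura=4,
                                  batch_size=32, data_type='train')
model.fit_generator(train_gen, steps_per_epoch=100, epochs=10)
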
def data_pipe_3dcnn_block(fmri_files, confound_files, label_matrix,
                          target_name=None, flag_cnn='3d', block_dura=1,
                          batch_size=32, data_type='train',
                          nr_thread=nr_thread, buffer_size=buffer_size):
    assert data_type in ['train', 'val', 'test']
    assert flag_cnn in ['3d', '2d']
    assert fmri_files is not None

    isTrain = data_type == 'train'
    isVal = data_type == 'val'

    print('\n\nGenerating dataflow for %s datasets \n' % data_type)

    buffer_size = int(min(len(fmri_files), buffer_size))
    nr_thread = int(min(len(fmri_files), nr_thread))

    ds0 = gen_fmri_file(fmri_files, confound_files, label_matrix,
                        data_type=data_type)
    print('dataflowSize is ' + str(ds0.size()))
    print('Loading data using %d threads with %d buffer_size ... \n' %
          (nr_thread, buffer_size))

    if target_name is None:
        target_name = np.unique(label_matrix)
    ##Subject_Num, Trial_Num = np.array(label_matrix).shape

    ####running the model
    start_time = time.time()  # time.clock() was removed in Python 3.8
    if flag_cnn == '2d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size, strict=True)
    elif flag_cnn == '3d':
        ds1 = dataflow.MultiThreadMapData(
            ds0, nr_thread=nr_thread,
            map_func=lambda dp: map_load_fmri_image_3d_block(
                dp, target_name, block_dura=block_dura),
            buffer_size=buffer_size, strict=True)

    ds1 = dataflow.PrefetchData(ds1, buffer_size, 1)
    ds1 = split_samples(ds1)
    print('prefetch dataflowSize is ' + str(ds1.size()))

    if isTrain:
        print('%d #Trials/Samples per subject with %d channels in tc' %
              (ds1.Trial_Num, ds1.Block_dura))
        Trial_Num = ds1.Trial_Num
        # ds1 = dataflow.LocallyShuffleData(ds1, buffer_size=ds1.size() * buffer_size)
        ds1 = dataflow.LocallyShuffleData(
            ds1, buffer_size=Trial_Num * buffer_size,
            shuffle_interval=Trial_Num * buffer_size)

    ds1 = dataflow.BatchData(ds1, batch_size=batch_size, remainder=True)
    print('Time Usage of loading data in seconds: {} \n'.format(
        time.time() - start_time))

    ds1 = dataflow.PrefetchDataZMQ(ds1, nr_proc=1)
    # ds1._reset_once()
    # ds1.reset_state()
    '''
    for df in ds1.get_data():
        if flag_cnn == '2d':
            yield (df[0].astype('float32'),
                   to_categorical(df[1].astype('int32'), len(target_name)))
        elif flag_cnn == '3d':
            yield (df[0].astype('float32'),
                   to_categorical(df[1].astype('int32'), len(target_name)))
    '''
    return ds1

logging.basicConfig(level=logging.INFO,
                    format='[%(asctime)s %(levelname)s] %(message)s',
                    filename=args.log_filename)

if args.augment:
    augmentors = fbresnet_augmentor(isTrain=True)
else:
    augmentors = [
        df.imgaug.Resize((128, 128)),
    ]

ds = dataflow.dataset.ILSVRC12(args.service_code, 'train',
                               shuffle=True).parallel(num_threads=args.threads)
ds = df.AugmentImageComponent(ds, augmentors, copy=False)
ds = df.PrefetchDataZMQ(ds, nr_proc=args.process)

if args.view:
    ds = dataflow.utils.image.Viewer(ds, lambda x: x[1] == 4, 'label-4',
                                     prob=1.0, pos=(0, (128 + 64) * 0))
    ds = dataflow.utils.image.Viewer(ds, lambda x: x[1] == 16, 'label-16',
                                     prob=1.0, pos=(0, (128 + 64) * 1))
    ds = dataflow.utils.image.Viewer(ds, lambda x: x[1] == 32, 'label-32',
                                     prob=1.0,