def __init__(self, name, dataset, training=True, batch_size=1, shuffle=False, sampler=None,
             batch_sampler=None, num_workers=0, epoch_interval=1, collate_fn=None,
             stack_dim=0, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None):
    super().__init__()
    # Build the dataflow pipeline: repeat forever, prefetch in worker processes, then batch.
    ds = df.RepeatedData(dataset, -1)
    ds = df.MultiProcessRunnerZMQ(ds, num_proc=num_workers, hwm=300)
    # ds = df.MultiThreadRunner(lambda: ds, num_prefetch=1024, num_thread=num_workers)
    ds = df.BatchData(ds, batch_size)
    self.ds = ds
    self.name = name
    self.training = training
    self.epoch_interval = epoch_interval
    self.stack_dim = stack_dim
    self.batches_per_epoch = len(dataset) // batch_size
def __init__(self, imagenet_dir, mode, transform, batch_size, shuffle=False, num_workers=4,
             cache=50000, drop_last=False):
    if drop_last:
        raise NotImplementedError("drop_last not implemented")
    assert mode in ['train', 'val'], mode

    # open the lmdb file
    lmdb_loc = os.path.join(imagenet_dir, 'ILSVRC-%s.lmdb' % mode)
    ds = td.LMDBData(lmdb_loc, shuffle=False)
    if shuffle:
        ds = td.LocallyShuffleData(ds, cache)

    def f(x):
        img, label = td.LMDBSerializer._deserialize_lmdb(x)
        # img, label = x
        img = Image.open(BytesIO(img.tobytes())).convert('RGB')
        img = transform(img)
        return img, label

    # ds = td.MultiProcessMapDataZMQ(ds, num_proc=num_workers, map_func=f)
    ds = td.MultiThreadMapData(ds, num_thread=num_workers, map_func=f)
    # ds = td.MapData(ds, f)
    self.ds = td.BatchData(ds, batch_size, use_list=True, remainder=False)

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.ds.reset_state()
    self.ds_iter = iter(self.ds)
    self.N = self.ds.size()
    self.i = 0
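# Hedged usage sketch for the LMDB-backed loader above. The class name `ImageNetLMDBLoader`
# and the torchvision transform are assumptions for illustration only; the loader is driven
# through the `ds_iter` / `N` attributes the constructor sets up.
#
#   from torchvision import transforms
#   transform = transforms.Compose([transforms.RandomResizedCrop(224), transforms.ToTensor()])
#   loader = ImageNetLMDBLoader('/data/imagenet-lmdb', mode='train', transform=transform,
#                               batch_size=64, shuffle=True, num_workers=8)
#   for _ in range(loader.N):                 # N is the number of batches (size of the BatchData flow)
#       imgs, labels = next(loader.ds_iter)   # lists of length batch_size (use_list=True)
#       ...                                   # feed the batch to a training step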
def evaluate_on(model, datasource):
    images, gt = next(df.BatchData(datasource, datasource.size()).get_data())
    gt = np.squeeze(gt)
    out = model.predict(images, verbose=True)
    _, dices, verbose = compute_dice_metric(preds=out, labels=gt)
    return verbose, dices
def load_BSDS500():
    data_path = "../dataset/BSDS500"
    BSDS500 = df.dataset.BSDS500("train", data_dir=data_path, shuffle=True)
    BSDS500.reset_state()
    batches = df.BatchData(BSDS500, 8).get_data()
    images, labels = next(batches)
    image_utils.plot_semantics_data(images, labels, save_name="BSDS500_1.png")
def load_Cifar10():
    data_path = "../dataset/Cifar10"
    Cifar10 = df.dataset.Cifar10("train", dir=data_path)
    Cifar10.reset_state()
    class_names = Cifar10.get_label_names()
    Cifar10_m = df.MapData(Cifar10, lambda dp: rot90(dp))
    batches = df.BatchData(Cifar10_m, 9).get_data()
    images, labels = next(batches)
    image_utils.plot_classification_data(images, labels, class_names=class_names,
                                         save_name="cifar10.png")
def create_paired_parallel_dataflow_via_numpy(tf_dataset_1,
                                              tf_dataset_2,
                                              batch_size,
                                              augmentations,
                                              x_only=False,
                                              num_proc=cpu_count(),
                                              test_flow=True):
    X_1, y_1 = [], []
    X_2, y_2 = [], []
    # Materialize the datasets as numpy arrays: this is memory intensive for large datasets!
    for data in tf_dataset_1:
        X_1.append(data[0].numpy())
        y_1.append(data[1].numpy())
    for data in tf_dataset_2:
        X_2.append(data[0].numpy())
        y_2.append(data[1].numpy())
    numpy_dataset_1 = list(zip(np.array(X_1), np.array(y_1)))
    numpy_dataset_2 = list(zip(np.array(X_2), np.array(y_2)))
    # Create a dataflow
    dataflow_1 = D.DataFromList(numpy_dataset_1)
    dataflow_2 = D.DataFromList(numpy_dataset_2)
    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])
    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat data only once; we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))
    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow, num_proc=num_proc,
                                     map_func=daug, strict=True)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    return dataflow
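# Hedged usage sketch for the paired flow above: the two tf.data datasets, the empty
# augmentation list and the training step are placeholders; only the call signature and
# the 4-component datapoints (x1, y1, x2, y2) follow from the function itself.
#
#   paired_flow = create_paired_parallel_dataflow_via_numpy(
#       tf_dataset_1=view_a_ds, tf_dataset_2=view_b_ds,
#       batch_size=32, augmentations=[], num_proc=4, test_flow=False)
#   paired_flow.reset_state()                  # required before iterating a tensorpack dataflow
#   for x1, y1, x2, y2 in paired_flow:         # one pass == one epoch (RepeatedData(..., 1))
#       train_step(x1, y1, x2, y2)             # hypothetical training step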
def read_data(files=None, batch_size=1, window=2, random_rotation=False, repeat=False,
              shuffle_buffer=None, num_workers=1, cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # caching only makes sense if the data is finite
    if cache_data:
        if repeat:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation:
            raise Exception("random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)
    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)
    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
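# Minimal usage sketch for read_data(): the file pattern is hypothetical, and
# PhysicsSimDataFlow defines the actual datapoint layout; only the dataflow plumbing
# (repeat / shuffle / batch) shown above is relied upon.
#
#   import glob
#   train_df = read_data(files=sorted(glob.glob('data/train/*.msgpack')),  # hypothetical path
#                        batch_size=4, window=2, random_rotation=True,
#                        repeat=True, shuffle_buffer=512, num_workers=2)
#   for batch in train_df:        # with repeat=True this iterates indefinitely
#       ...                       # batch components are lists (BatchData(..., use_list=True))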
def _wrap_flow(self, dataset: RNGDataFlow) -> RNGDataFlow:
    dataset = D.MultiProcessMapData(
        dataset,
        num_proc=12,
        map_func=lambda x: self._read_and_aug(x, self.augmentor),
        buffer_size=self.config['batch_size'] * 3,
        strict=True,
    )
    if not self.debug:
        if self.train:
            dataset = D.RepeatedData(dataset, num=-1)
            # dataset = D.LocallyShuffleData(dataset, 2000)
    dataset = D.BatchData(dataset, self.config['batch_size'])
    dataset.reset_state()
    return dataset
def create_parallel_dataflow_via_numpy(tf_dataset,
                                       batch_size,
                                       augmentations=(),
                                       gpu_augmentations=(),
                                       x_only=False,
                                       num_proc=cpu_count(),
                                       test_flow=True):
    X, y = [], []
    # Materialize the dataset as a numpy array: this is memory intensive for large datasets!
    for data in tf_dataset:
        X.append(data[0].numpy())
        y.append(data[1].numpy())
    numpy_dataset = list(zip(np.array(X), np.array(y)))
    # Create a dataflow
    dataflow = D.DataFromList(numpy_dataset)
    # Select some indices in the data
    if x_only:
        dataflow = D.SelectComponent(dataflow, [0])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat data only once; we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1])
    else:
        # Note the trailing comma: the map function must return a datapoint
        # (tuple/list of components), not a bare array.
        daug = lambda x: (compose_augmentations(x[0], augmentations),)
    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow, num_proc=num_proc,
                                     map_func=daug, strict=True)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: compose_augmentations(x, gpu_augmentations)
    # Map the function onto the first component
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    return dataflow
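# Hedged usage sketch for the single-dataset flow above: a tiny synthetic tf.data dataset
# stands in for real data, and compose_augmentations is assumed to act as the identity
# when given empty augmentation tuples.
#
#   import numpy as np
#   import tensorflow as tf
#   toy = tf.data.Dataset.from_tensor_slices(
#       (np.random.rand(8, 32, 32, 3).astype('float32'), np.arange(8, dtype='int64')))
#   flow = create_parallel_dataflow_via_numpy(toy, batch_size=4, test_flow=False)
#   flow.reset_state()
#   for x, y in flow:             # x: augmented image batch, y: label batch
#       print(x.shape, y.shape)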
def create_direct_dataflow(
    tf_dataset,
    batch_size,
    augmentations=(),
    gpu_augmentations=(),
    label_augmentations=(),
    num_proc=cpu_count(),
    test_flow=True,
):
    # Create a dataflow
    dataflow = D.DataFromGenerator(tf_dataset)
    # Map the tensors to numpy arrays
    dataflow = D.MapData(dataflow, func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Batch the data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat the data only once; we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    daug = lambda x: compose_augmentations(
        (compose_augmentations(x[0], augmentations), x[1]), label_augmentations)
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: compose_augmentations(x, gpu_augmentations)
    # Map the function onto the first component
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow, size=128).start()
    else:
        # Reset state manually
        dataflow.reset_state()
    return dataflow
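# Usage sketch for create_direct_dataflow(): shows the "custom loop over epochs" that the
# RepeatedData(..., 1) comment refers to. The tf.data dataset, n_epochs and the training
# step are placeholders, not the project's real configuration.
#
#   flow = create_direct_dataflow(train_tf_ds, batch_size=128, test_flow=False)
#   for epoch in range(n_epochs):             # the flow yields exactly one epoch per pass
#       for x, y in flow:
#           train_step(x, y)                  # hypothetical training step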
def create_paired_direct_dataflow(tf_dataset_1,
                                  tf_dataset_2,
                                  batch_size,
                                  augmentations,
                                  x_only=False,
                                  num_proc=cpu_count(),
                                  test_flow=True,
                                  cache_dir1='',
                                  cache_dir2='',
                                  shuffle=True,
                                  shuffle_buffer=1000):
    # Cache the datasets first
    tf_dataset_1 = tf_dataset_1.cache(cache_dir1).prefetch(tf.data.experimental.AUTOTUNE)
    tf_dataset_2 = tf_dataset_2.cache(cache_dir2).prefetch(tf.data.experimental.AUTOTUNE)
    try:
        # Unbatch them
        tf_dataset_1 = tf_dataset_1.unbatch()
        tf_dataset_2 = tf_dataset_2.unbatch()
    except ValueError:
        pass
    if shuffle:
        # Shuffle the data
        tf_dataset_1 = tf_dataset_1.shuffle(shuffle_buffer, seed=1)
        tf_dataset_2 = tf_dataset_2.shuffle(shuffle_buffer, seed=2)
    # Run through the datasets once to populate the caches; otherwise the dataflow below won't work
    for _ in tf_dataset_1.batch(batch_size):
        print('.', end='')
    for _ in tf_dataset_2.batch(batch_size):
        print('.', end='')
    # Create a dataflow
    dataflow_1 = D.DataFromGenerator(tf_dataset_1)
    dataflow_2 = D.DataFromGenerator(tf_dataset_2)
    # Map the tensors to numpy arrays
    dataflow_1 = D.MapData(dataflow_1, func=lambda x: (x[0].numpy(), x[1].numpy()))
    dataflow_2 = D.MapData(dataflow_2, func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])
    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size, remainder=True)
    # Repeat data only once; we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    else:
        # Reset state manually
        dataflow.reset_state()
    return dataflow
def batch_predict(img_dir, out_dir, config, segmodel=None, image_size=None, image_extension='.png'):
    """
    Generates segmentation result visualizations for all images in a given folder.

    :param img_dir: directory where the source images are located
    :param out_dir: path to save output segmentations; visualizations are saved in out_dir/viz
    :param config: configuration providing NUM_CLASSES (number of classes the model predicts),
                   MODEL_SAVE_DIR, NAME and IS_RGB
    :param segmodel: segmentation model; loaded from config.MODEL_SAVE_DIR if None
    :param image_size: images are resized to image_size before prediction
    :param image_extension: file extension of the source images
    """
    N_classes = config.NUM_CLASSES
    if segmodel is None:
        segmodel = load_tfkeras_model(config.MODEL_SAVE_DIR, file_name_prefix=config.NAME,
                                      model=None, custom_objects={})
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    test_data = DirectoryImagesTest(img_dir, image_extension)
    test_ds = SegmentationData(data=test_data.data, loadLabels=False, shuffle=False,
                               isRGB=config.IS_RGB)
    resizer = [df.imgaug.Resize(image_size, interp=cv2.INTER_NEAREST)] if image_size else []
    test_ds = df.AugmentImageComponent(test_ds, augmentors=resizer)
    test_ds = df.MapDataComponent(test_ds, lambda x: x / 255.0, index=0)
    test_ds = df.MapDataComponent(test_ds, lambda x: np.expand_dims(x, -1), index=0)
    test_ds = df.BatchData(test_ds, batch_size=np.min([16, test_ds.size()]))
    batch_iter = test_ds.get_data()

    vizdir = os.path.join(out_dir, 'viz')
    if not os.path.exists(vizdir):
        os.mkdir(vizdir)

    image_name = lambda x: os.path.basename(x).split('.')[0]
    for batch in batch_iter:
        images, file_names = batch[0], batch[2]
        out = segmodel.predict(images, batch_size=4)
        images = images * 255
        out_labelmap = np.argmax(out, axis=3).astype(np.uint8)
        for l, f in zip(out_labelmap, file_names):
            cv2.imwrite(os.path.join(out_dir, image_name(f) + '.png'), l)
        viz, _ = visualize_labels_overlay_labelmap(np.argmax(out, axis=3), images, N_classes,
                                                   stack_images=False)
        for vim, f in zip(viz, file_names):
            cv2.imwrite(os.path.join(vizdir, 'v' + f), vim)
    print('Done')
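# Hedged usage sketch for batch_predict(): the paths and the config object are placeholders;
# `config` only needs the attributes the function reads (NUM_CLASSES, MODEL_SAVE_DIR, NAME, IS_RGB).
#
#   cfg = MyConfig()                                   # hypothetical config class
#   batch_predict(img_dir='samples/test_images',       # hypothetical paths
#                 out_dir='samples/predictions',
#                 config=cfg,
#                 image_size=(256, 256),
#                 image_extension='.png')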