def test_binarize():
    x = [
        0.49671415,
        -0.1382643,
        0.64768854,
        1.52302986,
        -0.23415337,
        -0.23413696,
        1.57921282,
        0.76743473,
    ]
    x = np.asarray(x, dtype="float64")
    expected = np.array([True, False, True, True, False, False, True, True])
    result = volume.binarize(x)
    assert_array_equal(expected, result)
    assert result.dtype == tf.float64
    result = volume.binarize(x.astype(np.float32))
    assert_array_equal(expected, result)
    assert result.dtype == tf.float32

    x = np.asarray([-2, 0, 2, 0, 2, -2, -1, 1], dtype=np.int32)
    expected = np.array([False, False, True, False, True, False, False, True])
    result = volume.binarize(x)
    assert_array_equal(expected, result)
    assert result.dtype == tf.int32
    result = volume.binarize(x.astype(np.int64))
    assert_array_equal(expected, result)
    assert result.dtype == tf.int64
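# Hypothetical sketch (not necessarily the library's implementation): the
# assertions above imply that `volume.binarize` thresholds at zero and
# preserves the input dtype. Something along these lines would satisfy them.
import tensorflow as tf


def _binarize_tf_sketch(x):
    """Map values greater than zero to one and all other values to zero,
    keeping the input dtype."""
    x = tf.convert_to_tensor(x)
    return tf.cast(x > 0, dtype=x.dtype)


# Example: _binarize_tf_sketch([-1.0, 0.0, 2.0]) -> [0.0, 0.0, 1.0] (float32)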
def test_binarize():
    data = [0, 1, 2, 3, 4]
    assert_array_equal(binarize(data), [0, 1, 1, 1, 1])
    assert_array_equal(binarize(data, threshold=1), [0, 0, 1, 1, 1])
    assert_array_equal(binarize(data, threshold=3), [0, 0, 0, 0, 1])
    assert_array_equal(binarize(data, threshold=1, upper=4), [0, 0, 4, 4, 4])
    assert_array_equal(binarize(data, threshold=3, upper=9, lower=8), [8, 8, 8, 8, 9])

    data = np.arange(100)
    data = binarize(data, upper=4, lower=1)
    assert_array_equal(np.unique(data), (1, 4))

    # Check that the input array is not modified in place.
    data = np.arange(100)
    data_c = binarize(data)
    assert_array_equal(np.unique(data_c), (0, 1))
    assert not np.array_equal(np.unique(data), (0, 1))
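# Hypothetical sketch (not necessarily the implementation under test): the
# assertions above describe a `binarize` that returns a new array in which
# values strictly greater than `threshold` become `upper` and all other
# values become `lower`, leaving the input unmodified.
import numpy as np


def _binarize_threshold_sketch(data, threshold=0, upper=1, lower=0):
    """Threshold `data` into two values without modifying the input."""
    data = np.asarray(data)
    return np.where(data > threshold, upper, lower)


# Example: _binarize_threshold_sketch([0, 1, 2, 3, 4], threshold=1, upper=4)
# -> array([0, 0, 4, 4, 4])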
def get_dataset(
    file_pattern,
    n_classes,
    batch_size,
    volume_shape,
    scalar_label=False,
    block_shape=None,
    n_epochs=None,
    mapping=None,
    augment=False,
    shuffle_buffer_size=None,
    num_parallel_calls=AUTOTUNE,
):
    """Return `tf.data.Dataset` that preprocesses data for training or prediction.

    Labels are preprocessed for binary or multiclass segmentation according to
    `n_classes`.

    Parameters
    ----------
    file_pattern: str, expression that can be globbed to get TFRecords files
        for this dataset. For example 'data/training_*.tfrecords'.
    n_classes: int, number of classes to segment. Values of 1 and 2 indicate
        binary segmentation (foreground vs background), and values greater
        than 2 indicate multiclass segmentation.
    batch_size: int, number of elements per batch.
    volume_shape: tuple of length 3, the shape of every volume in the TFRecords
        files. Every volume must have the same shape.
    scalar_label: boolean, if `True`, labels are scalars.
    block_shape: tuple of length 3, the shape of the non-overlapping sub-volumes
        to take from the full volumes. If None, do not separate the full volumes
        into sub-volumes. Separating into non-overlapping sub-volumes is useful
        (sometimes even necessary) to overcome memory limitations depending on
        the number of model parameters.
    n_epochs: int, number of epochs for the dataset to repeat. If None, the
        dataset will be repeated indefinitely.
    mapping: dict, mapping to replace label values. Values equal to a key in
        the mapping are replaced with the corresponding values in the mapping.
        Values not in `mapping.keys()` are replaced with zeros.
    augment: boolean, if `True`, apply random rigid transformations to the
        features and labels. The rigid transformations are applied to the full
        volumes.
    shuffle_buffer_size: int, buffer of full volumes to shuffle. If this is not
        None, then the list of files found by 'file_pattern' is also shuffled
        at every iteration.
    num_parallel_calls: int, number of parallel calls to make for data loading
        and processing.

    Returns
    -------
    `tf.data.Dataset` of features and labels. If block_shape is not None, the
    shape of features is `(batch_size, *block_shape, 1)` and the shape of
    labels is `(batch_size, *block_shape, n_classes)`. If block_shape is None,
    then the shape of features is `(batch_size, *volume_shape, 1)` and the
    shape of labels is `(batch_size, *volume_shape, n_classes)`. If
    `scalar_label` is `True`, the shape of labels is always `(batch_size,)`.
    """
    files = glob.glob(file_pattern)
    if not files:
        raise ValueError("no files found for pattern '{}'".format(file_pattern))

    # Create dataset of all TFRecord files. After this point, the dataset will
    # have two values per iteration: (feature, label).
    shuffle = bool(shuffle_buffer_size)
    compressed = _is_gzipped(files[0])
    dataset = tfrecord_dataset(
        file_pattern=file_pattern,
        volume_shape=volume_shape,
        shuffle=shuffle,
        scalar_label=scalar_label,
        compressed=compressed,
        num_parallel_calls=num_parallel_calls,
    )

    # Standard-score the features.
    dataset = dataset.map(lambda x, y: (standardize(x), y))

    # Separate into blocks, if requested.
    if block_shape is not None:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y: (to_blocks(x, block_shape), to_blocks(y, block_shape)),
                num_parallel_calls=num_parallel_calls,
            )
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()
        if scalar_label:

            def _f(x, y):
                x = to_blocks(x, block_shape)
                n_blocks = x.shape[0]
                y = tf.repeat(y, n_blocks)
                return (x, y)

            dataset = dataset.map(_f, num_parallel_calls=num_parallel_calls)
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()

    # Augment examples if requested.
    if augment:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1,)) > 0.5,
                    true_fn=lambda: apply_random_transform(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )
        else:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1,)) > 0.5,
                    true_fn=lambda: apply_random_transform_scalar_labels(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )

    # Binarize or replace labels according to mapping.
    if not scalar_label:
        if n_classes < 1:
            raise ValueError("n_classes must be > 0.")
        elif n_classes == 1:
            dataset = dataset.map(lambda x, y: (x, tf.expand_dims(binarize(y), -1)))
        elif n_classes == 2:
            dataset = dataset.map(lambda x, y: (x, tf.one_hot(binarize(y), n_classes)))
        elif n_classes > 2:
            if mapping is not None:
                dataset = dataset.map(lambda x, y: (x, replace(y, mapping=mapping)))
            dataset = dataset.map(lambda x, y: (x, tf.one_hot(y, n_classes)))

    # Add grayscale channel to features.
    # TODO: in the future, multi-channel features should be supported.
    dataset = dataset.map(lambda x, y: (tf.expand_dims(x, -1), y))

    # Prefetch data to overlap data production with data consumption. The
    # TensorFlow documentation suggests prefetching `batch_size` elements.
    dataset = dataset.prefetch(buffer_size=batch_size)

    # Batch the dataset, so each iteration gives `batch_size` elements. We drop
    # the remainder so that when training on multiple GPUs, the batch will
    # always be evenly divisible by the number of GPUs. Otherwise, the last
    # batch might have fewer than `batch_size` elements and will cause errors.
    if batch_size is not None:
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

    # Optionally shuffle. We also optionally shuffle the list of files.
    # The TensorFlow documentation recommends shuffling and then repeating.
    if shuffle_buffer_size:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat the dataset for n_epochs. If n_epochs is None, then repeat
    # indefinitely. If n_epochs is 1, then the dataset will only be iterated
    # through once.
    dataset = dataset.repeat(n_epochs)

    return dataset
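# Hypothetical usage sketch for `get_dataset`. The file pattern, volume shape,
# and block shape below are illustrative assumptions, not values taken from
# this repository; actual TFRecords files must exist for the call to succeed.
#
#     dataset = get_dataset(
#         file_pattern="data/training_*.tfrecords",
#         n_classes=1,
#         batch_size=2,
#         volume_shape=(256, 256, 256),
#         block_shape=(128, 128, 128),
#         augment=True,
#         shuffle_buffer_size=10,
#         n_epochs=None,
#     )
#     # Each iteration would yield features of shape (2, 128, 128, 128, 1) and,
#     # because `n_classes == 1`, labels of shape (2, 128, 128, 128, 1).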