def test_replace():
    data = np.arange(5)
    mapping = {0: 10, 1: 20, 2: 30, 3: 40, 4: 30}
    output = volume.replace(data, mapping)
    assert_array_equal(output, [10, 20, 30, 40, 30])

    # Test that overlapping keys and values gives correct result.
    data = np.arange(5)
    mapping = {0: 1, 1: 2, 2: 3, 3: 4}
    output = volume.replace(data, mapping)
    assert_array_equal(output, [1, 2, 3, 4, 4])

    data = np.arange(8).reshape(2, 2, 2)
    mapping = {0: 100, 100: 10, 10: 5, 3: 5}
    outputs = volume.replace(data, mapping, zero=False)
    expected = data.copy()
    expected[0, 0, 0] = 100
    expected[0, 1, 1] = 5
    assert_array_equal(outputs, expected)

    # Zero values not in mapping values.
    outputs = volume.replace(data, mapping, zero=True)
    expected = np.zeros_like(data)
    expected[0, 0, 0] = 100
    expected[0, 1, 1] = 5
    expected[1, 0, 1] = 5
    assert_array_equal(outputs, expected)
def test_replace():
    data = np.arange(5)
    mapping = {
        0: 10,
        1: 20,
        2: 30,
        3: 40,
        4: 30,
    }
    data = replace(data, mapping)
    assert_array_equal(data, [10, 20, 30, 40, 30])

    # Test that overlapping keys and values gives correct result.
    data = np.arange(5)
    mapping = {
        0: 1,
        1: 2,
        2: 3,
        3: 4,
    }
    data = replace(data, mapping)
    assert_array_equal(data, [1, 2, 3, 4, 4])
def get_dataset(
    file_pattern,
    n_classes,
    batch_size,
    volume_shape,
    scalar_label=False,
    block_shape=None,
    n_epochs=None,
    mapping=None,
    augment=False,
    shuffle_buffer_size=None,
    num_parallel_calls=AUTOTUNE,
):
    """Return `tf.data.Dataset` that preprocesses data for training or prediction.

    Labels are preprocessed for binary or multiclass segmentation according
    to `n_classes`.

    Parameters
    ----------
    file_pattern: str, expression that can be globbed to get TFRecords files
        for this dataset. For example, 'data/training_*.tfrecords'.
    n_classes: int, number of classes to segment. Values of 1 and 2 indicate
        binary segmentation (foreground vs background), and values greater
        than 2 indicate multiclass segmentation.
    batch_size: int, number of elements per batch.
    volume_shape: tuple of length 3, the shape of every volume in the
        TFRecords files. Every volume must have the same shape.
    scalar_label: boolean, if `True`, labels are scalars.
    block_shape: tuple of length 3, the shape of the non-overlapping
        sub-volumes to take from the full volumes. If None, do not separate
        the full volumes into sub-volumes. Separating into non-overlapping
        sub-volumes is useful (sometimes even necessary) to overcome memory
        limitations depending on the number of model parameters.
    n_epochs: int, number of epochs for the dataset to repeat. If None, the
        dataset will be repeated indefinitely.
    mapping: dict, mapping to replace label values. Values equal to a key in
        the mapping are replaced with the corresponding values in the
        mapping. Values not in `mapping.keys()` are replaced with zeros.
    augment: boolean, if true, apply random rigid transformations to the
        features and labels. The rigid transformations are applied to the
        full volumes.
    shuffle_buffer_size: int, buffer of full volumes to shuffle. If this is
        not None, then the list of files found by `file_pattern` is also
        shuffled at every iteration.
    num_parallel_calls: int, number of parallel calls to make for data
        loading and processing.

    Returns
    -------
    `tf.data.Dataset` of features and labels. If `block_shape` is not None,
    the shape of features is `(batch_size, *block_shape, 1)` and the shape of
    labels is `(batch_size, *block_shape, n_classes)`. If `block_shape` is
    None, then the shape of features is `(batch_size, *volume_shape, 1)` and
    the shape of labels is `(batch_size, *volume_shape, n_classes)`. If
    `scalar_label` is `True`, the shape of labels is always `(batch_size,)`.
    """
    files = glob.glob(file_pattern)
    if not files:
        raise ValueError("no files found for pattern '{}'".format(file_pattern))

    # Create dataset of all TFRecord files. After this point, the dataset will
    # have two values per iteration: (feature, label).
    shuffle = bool(shuffle_buffer_size)
    compressed = _is_gzipped(files[0])
    dataset = tfrecord_dataset(
        file_pattern=file_pattern,
        volume_shape=volume_shape,
        shuffle=shuffle,
        scalar_label=scalar_label,
        compressed=compressed,
        num_parallel_calls=num_parallel_calls,
    )

    # Standard-score the features.
    dataset = dataset.map(lambda x, y: (standardize(x), y))

    # Separate into blocks, if requested.
    if block_shape is not None:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y: (to_blocks(x, block_shape), to_blocks(y, block_shape)),
                num_parallel_calls=num_parallel_calls,
            )
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()

        if scalar_label:

            def _f(x, y):
                x = to_blocks(x, block_shape)
                n_blocks = x.shape[0]
                y = tf.repeat(y, n_blocks)
                return (x, y)

            dataset = dataset.map(_f, num_parallel_calls=num_parallel_calls)
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()

    # Augment examples if requested.
    if augment:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1,)) > 0.5,
                    true_fn=lambda: apply_random_transform(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )
        else:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1,)) > 0.5,
                    true_fn=lambda: apply_random_transform_scalar_labels(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )

    # Binarize or replace labels according to mapping.
    if not scalar_label:
        if n_classes < 1:
            raise ValueError("n_classes must be > 0.")
        elif n_classes == 1:
            dataset = dataset.map(lambda x, y: (x, tf.expand_dims(binarize(y), -1)))
        elif n_classes == 2:
            dataset = dataset.map(lambda x, y: (x, tf.one_hot(binarize(y), n_classes)))
        elif n_classes > 2:
            if mapping is not None:
                dataset = dataset.map(lambda x, y: (x, replace(y, mapping=mapping)))
            dataset = dataset.map(lambda x, y: (x, tf.one_hot(y, n_classes)))

    # Add grayscale channel to features.
    # TODO: in the future, multi-channel features should be supported.
    dataset = dataset.map(lambda x, y: (tf.expand_dims(x, -1), y))

    # Prefetch data to overlap data production with data consumption. The
    # TensorFlow documentation suggests prefetching `batch_size` elements.
    dataset = dataset.prefetch(buffer_size=batch_size)

    # Batch the dataset, so each iteration gives `batch_size` elements. We drop
    # the remainder so that when training on multiple GPUs, the batch will
    # always be evenly divisible by the number of GPUs. Otherwise, the last
    # batch might have fewer than `batch_size` elements and will cause errors.
    if batch_size is not None:
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

    # Optionally shuffle. We also optionally shuffle the list of files. The
    # TensorFlow documentation recommends shuffling and then repeating.
    if shuffle_buffer_size:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat the dataset for n_epochs. If n_epochs is None, then repeat
    # indefinitely. If n_epochs is 1, then the dataset will only be iterated
    # through once.
    dataset = dataset.repeat(n_epochs)

    return dataset
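# Illustrative usage sketch (not part of the original module): one way the
# `get_dataset` function above might be called for multiclass segmentation.
# The file pattern, shapes, and label mapping below are assumed values for
# demonstration only.
def _example_get_dataset():
    dataset = get_dataset(
        file_pattern="data/training_shard-*.tfrecords",  # assumed TFRecords layout
        n_classes=3,
        batch_size=2,
        volume_shape=(256, 256, 256),
        block_shape=(128, 128, 128),
        n_epochs=None,  # repeat indefinitely
        mapping={10: 1, 50: 2},  # hypothetical relabeling; unmapped values become 0
        augment=True,
        shuffle_buffer_size=10,
    )
    # With these arguments, each iteration yields features of shape
    # (2, 128, 128, 128, 1) and one-hot labels of shape (2, 128, 128, 128, 3).
    return dataset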
def validate_from_filepath(
    filepath,
    predictor,
    block_shape,
    n_classes,
    mapping_y,
    return_variance=False,
    return_entropy=False,
    return_array_from_images=False,
    n_samples=1,
    normalizer=normalize_zero_one,
    batch_size=4,
    dtype=DT_X,
):
    """Compute the Dice score for a prediction compared to a ground truth image.

    Args:
        filepath: tuple, tuple of paths to an existing neuroimaging volume
            (index 0) and its ground truth (index 1).
        predictor: TensorFlow Predictor object, predictor from a previously
            trained model.
        block_shape: tuple of len 3, shape of blocks on which to predict.
        n_classes: int, number of classes the model is trained to output.
        mapping_y: path-like, path to the CSV mapping file, per the
            command-line argument.
        return_variance: Boolean. If True, return the running population
            variance along with the mean. Note that if `n_samples` is less
            than or equal to 1, the variance is not returned; None is
            returned instead.
        return_entropy: Boolean. If True, return the running entropy along
            with the mean.
        return_array_from_images: Boolean. If True and the given input is an
            image, filepath, or filepaths, return arrays of
            [mean, variance, entropy] instead of images of them. If the input
            is an array, an array is returned regardless of this flag.
        n_samples: int, number of samples to draw. If 1, only the single
            prediction value is returned.
        normalizer: callable, function that accepts an ndarray and returns an
            ndarray. Called before separating the volume into blocks.
        batch_size: int, number of sub-volumes per batch for prediction.
        dtype: str or dtype object, dtype of features.

    Returns:
        Tuple of the prediction outputs (`nibabel.spatialimages.SpatialImage`
        objects or arrays of mean, variance (optional), and entropy
        (optional)) and the Dice score against the ground truth.
    """
    if not Path(filepath[0]).is_file():
        raise FileNotFoundError("could not find file {}".format(filepath[0]))

    img = nib.load(filepath[0])
    y = read_volume(filepath[1], dtype=np.int32)

    outputs = _predict(
        inputs=img,
        predictor=predictor,
        block_shape=block_shape,
        return_variance=return_variance,
        return_entropy=return_entropy,
        return_array_from_images=return_array_from_images,
        n_samples=n_samples,
        normalizer=normalizer,
        batch_size=batch_size,
    )
    prediction_image = outputs[0].get_data()
    y = replace(y, read_mapping(mapping_y))
    dice = get_dice_for_images(prediction_image, y, n_classes)
    return outputs, dice
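# Illustrative usage sketch (not part of the original module): one way
# `validate_from_filepath` above might be called, given an already-loaded
# TensorFlow Predictor. The file names and mapping CSV are hypothetical.
def _example_validate(predictor):
    outputs, dice = validate_from_filepath(
        filepath=("sub-01_T1w.nii.gz", "sub-01_aseg.nii.gz"),  # assumed volume and ground truth
        predictor=predictor,
        block_shape=(128, 128, 128),
        n_classes=2,
        mapping_y="mapping.csv",  # assumed CSV of original -> new label values
        n_samples=1,
        batch_size=4,
    )
    # `dice` compares the predicted labels to the relabeled ground truth.
    print("Dice:", dice)
    return outputs, dice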