Example #1
import numpy as np
import tensorflow as tf
from numpy.testing import assert_array_equal

from nobrainer import volume  # assumed source of binarize; the listing omits imports


def test_binarize():
    x = [
        0.49671415,
        -0.1382643,
        0.64768854,
        1.52302986,
        -0.23415337,
        -0.23413696,
        1.57921282,
        0.76743473,
    ]
    x = np.asarray(x, dtype="float64")
    expected = np.array([True, False, True, True, False, False, True, True])
    result = volume.binarize(x)
    assert_array_equal(expected, result)
    assert result.dtype == tf.float64
    result = volume.binarize(x.astype(np.float32))
    assert_array_equal(expected, result)
    assert result.dtype == tf.float32

    x = np.asarray([-2, 0, 2, 0, 2, -2, -1, 1], dtype=np.int32)
    expected = np.array([False, False, True, False, True, False, False, True])
    result = volume.binarize(x)
    assert_array_equal(expected, result)
    assert result.dtype == tf.int32
    result = volume.binarize(x.astype(np.int64))
    assert_array_equal(expected, result)
    assert result.dtype == tf.int64
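
The assertions above pin down the contract of `volume.binarize`: values strictly greater than zero map to 1, everything else maps to 0, and the output keeps the input's dtype. A minimal sketch consistent with those assertions (an illustration, not necessarily the library's actual implementation):

    import tensorflow as tf

    def binarize_sketch(x):
        # Threshold at zero, then cast back so the result keeps x's dtype,
        # matching the dtype assertions in the test above.
        x = tf.convert_to_tensor(x)
        return tf.cast(x > 0, x.dtype)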
Example #2
import numpy as np
from numpy.testing import assert_array_equal

# NOTE: `binarize` is the function under test; its defining module is not
# shown in this listing, so no import is given here.


def test_binarize():
    data = [0, 1, 2, 3, 4]
    assert_array_equal(binarize(data), [0, 1, 1, 1, 1])
    assert_array_equal(binarize(data, threshold=1), [0, 0, 1, 1, 1])
    assert_array_equal(binarize(data, threshold=3), [0, 0, 0, 0, 1])
    assert_array_equal(binarize(data, threshold=1, upper=4), [0, 0, 4, 4, 4])
    assert_array_equal(binarize(data, threshold=3, upper=9, lower=8),
                       [8, 8, 8, 8, 9])

    data = np.arange(100)
    data = binarize(data, upper=4, lower=1)
    assert_array_equal(np.unique(data), (1, 4))

    data = np.arange(100)
    data_c = binarize(data)
    assert_array_equal(np.unique(data_c), (0, 1))
    assert not np.array_equal(np.unique(data), (0, 1))
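
Taken together, these assertions imply that `binarize` maps values strictly greater than `threshold` (default 0) to `upper` (default 1) and all other values to `lower` (default 0), returning a new array rather than modifying its input. A minimal NumPy sketch with that contract (hypothetical; the real implementation is not shown here):

    import numpy as np

    def binarize_sketch(data, threshold=0, upper=1, lower=0):
        # np.where allocates a new array, so the input is left untouched,
        # matching the copy semantics asserted at the end of the test.
        data = np.asarray(data)
        return np.where(data > threshold, upper, lower)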
Example #3
import glob

import tensorflow as tf

# NOTE: the helpers used below (tfrecord_dataset, standardize, to_blocks,
# binarize, replace, apply_random_transform,
# apply_random_transform_scalar_labels, _is_gzipped) and the AUTOTUNE default
# are defined elsewhere in the same module; this listing shows only
# get_dataset itself.


def get_dataset(
    file_pattern,
    n_classes,
    batch_size,
    volume_shape,
    scalar_label=False,
    block_shape=None,
    n_epochs=None,
    mapping=None,
    augment=False,
    shuffle_buffer_size=None,
    num_parallel_calls=AUTOTUNE,
):
    """Return `tf.data.Dataset` that preprocesses data for training or prediction.

    Labels are preprocessed for binary or multiclass segmentation according to
    `n_classes`.

    Parameters
    ----------
    file_pattern: str, expression that can be globbed to get TFRecords files
        for this dataset. For example 'data/training_*.tfrecords'.
    n_classes: int, number of classes to segment. Values of 1 and 2 indicate
        binary segmentation (foreground vs background), and values greater than
        2 indicate multiclass segmentation.
    batch_size: int, number of elements per batch.
    volume_shape: tuple of length 3, the shape of every volume in the TFRecords
        files. Every volume must have the same shape.
    scalar_label: boolean, if `True`, labels are scalars.
    block_shape: tuple of length 3, the shape of the non-overlapping sub-volumes
        to take from the full volumes. If None, do not separate the full volumes
        into sub-volumes. Separating into non-overlapping sub-volumes is useful
        (sometimes even necessary) to overcome memory limitations depending on
        the number of model parameters.
    n_epochs: int, number of epochs for the dataset to repeat. If None, the
        dataset will be repeated indefinitely.
    mapping: dict, mapping to replace label values. Values equal to a key in
        the mapping are replaced with the corresponding values in the mapping.
        Values not in `mapping.keys()` are replaced with zeros.
    augment: boolean, if `True`, apply random rigid transformations to the
        features and labels. The rigid transformations are applied to the full
        volumes.
    shuffle_buffer_size: int, buffer of full volumes to shuffle. If this is not
        None, then the list of files found by 'file_pattern' is also shuffled
        at every iteration.
    num_parallel_calls: int, number of parallel calls to make for data loading
        and processing.

    Returns
    -------
    `tf.data.Dataset` of features and labels. If block_shape is not None, the
    shape of features is `(batch_size, *block_shape, 1)` and the shape of labels
    is `(batch_size, *block_shape, n_classes)`. If block_shape is None, then
    the shape of features is `(batch_size, *volume_shape, 1)` and the shape of
    labels is `(batch_size, *volume_shape, n_classes)`. If `scalar_label` is `True`,
    the shape of labels is always `(batch_size,)`.
    """

    files = glob.glob(file_pattern)
    if not files:
        raise ValueError(
            "no files found for pattern '{}'".format(file_pattern))

    # Create dataset of all TFRecord files. After this point, the dataset will
    # have two values per iteration: (feature, label).
    shuffle = bool(shuffle_buffer_size)
    compressed = _is_gzipped(files[0])
    dataset = tfrecord_dataset(
        file_pattern=file_pattern,
        volume_shape=volume_shape,
        shuffle=shuffle,
        scalar_label=scalar_label,
        compressed=compressed,
        num_parallel_calls=num_parallel_calls,
    )

    # Standard-score the features.
    dataset = dataset.map(lambda x, y: (standardize(x), y))

    # Separate into blocks, if requested.
    if block_shape is not None:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y:
                (to_blocks(x, block_shape), to_blocks(y, block_shape)),
                num_parallel_calls=num_parallel_calls,
            )
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()
        if scalar_label:

            def _f(x, y):
                x = to_blocks(x, block_shape)
                n_blocks = x.shape[0]
                y = tf.repeat(y, n_blocks)
                return (x, y)

            dataset = dataset.map(_f, num_parallel_calls=num_parallel_calls)
            # This step is necessary because separating into blocks adds a dimension.
            dataset = dataset.unbatch()

    # Augment examples if requested.
    if augment:
        if not scalar_label:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1, )) > 0.5,
                    true_fn=lambda: apply_random_transform(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )
        else:
            dataset = dataset.map(
                lambda x, y: tf.cond(
                    tf.random.uniform((1, )) > 0.5,
                    true_fn=lambda: apply_random_transform_scalar_labels(x, y),
                    false_fn=lambda: (x, y),
                ),
                num_parallel_calls=num_parallel_calls,
            )

    # Binarize or replace labels according to mapping.
    if not scalar_label:
        if n_classes < 1:
            raise ValueError("n_classes must be > 0.")
        elif n_classes == 1:
            dataset = dataset.map(lambda x, y:
                                  (x, tf.expand_dims(binarize(y), -1)))
        elif n_classes == 2:
            dataset = dataset.map(lambda x, y:
                                  (x, tf.one_hot(binarize(y), n_classes)))
        elif n_classes > 2:
            if mapping is not None:
                dataset = dataset.map(lambda x, y:
                                      (x, replace(y, mapping=mapping)))
            dataset = dataset.map(lambda x, y: (x, tf.one_hot(y, n_classes)))

    # Add grayscale channel to features.
    # TODO: in the future, multi-channel features should be supported.
    dataset = dataset.map(lambda x, y: (tf.expand_dims(x, -1), y))

    # Prefetch data to overlap data production with data consumption. The
    # TensorFlow documentation suggests prefetching `batch_size` elements.
    dataset = dataset.prefetch(buffer_size=batch_size)

    # Batch the dataset, so each iteration gives `batch_size` elements. We drop
    # the remainder so that when training on multiple GPUs, the batch will
    # always be evenly divisible by the number of GPUs. Otherwise, the last
    # batch might have fewer than `batch_size` elements and will cause errors.
    if batch_size is not None:
        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

    # Optionally shuffle. We also optionally shuffle the list of files.
    # The TensorFlow documentation recommends shuffling and then repeating.
    if shuffle_buffer_size:
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    # Repeat the dataset for n_epochs. If n_epochs is None, then repeat
    # indefinitely. If n_epochs is 1, then the dataset will only be iterated
    # through once.
    dataset = dataset.repeat(n_epochs)

    return dataset
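
For context, a usage sketch of `get_dataset` for binary segmentation; the glob pattern below comes from the docstring's example, and the shapes and hyperparameters are illustrative placeholders:

    # Hypothetical values; point the pattern at your own TFRecords.
    dataset = get_dataset(
        file_pattern="data/training_*.tfrecords",
        n_classes=2,                  # binary segmentation -> one-hot labels
        batch_size=2,
        volume_shape=(256, 256, 256),
        block_shape=(64, 64, 64),     # split volumes into non-overlapping blocks
        shuffle_buffer_size=10,
        n_epochs=1,
    )
    # Per the docstring, features are (2, 64, 64, 64, 1) and labels are
    # (2, 64, 64, 64, 2) for each batch.
    features, labels = next(iter(dataset))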