Exemplo n.º 1
0
def main():
    '''
    Copies each partition of an .h5 dataset into its own .npy memmap file.

    The output files are written to the same directory as the input file,
    and are named <input basename>_<partition name>.npy.
    '''
    args = parse_args()
    h5_datasets = load_h5_dataset(args.input)

    def get_output_filepath(input_name, partition_name):
        '''
        Returns the .npy output path for one partition of <input_name>.

        Asserts that <input_name> has a .h5 extension.
        '''
        directory, input_filename = os.path.split(input_name)
        stem, suffix = os.path.splitext(input_filename)
        assert_equal(suffix, '.h5')

        output_filename = "{}_{}{}".format(stem, partition_name, '.npy')
        return os.path.join(directory, output_filename)

    # h5_datasets is iterated in lockstep with its own _fields, which supply
    # the partition names.
    for dataset, partition_name in safe_izip(h5_datasets, h5_datasets._fields):
        output_filepath = get_output_filepath(args.input, partition_name)
        memmap = make_memmap_file(output_filepath,
                                  dataset.num_examples(),
                                  dataset.names,
                                  dataset.formats)

        # Look up all output tensors first, then copy tensor-by-tensor.
        out_tensors = [memmap[tensor_name] for tensor_name in dataset.names]
        for source, destination in safe_izip(dataset.tensors, out_tensors):
            assert_equal(destination.shape, source.shape)
            destination[...] = source

        print("Wrote {}".format(output_filepath))
Exemplo n.º 2
0
def write_memmaps(full_dataset, args, rng):
    '''
    Splits full_dataset into partitions, saving each as a .npy memmap file.

    Parameters
    ----------
    full_dataset:
      The dataset to split. Its .tensors, .names, and .formats are read.

    args:
      Parsed command-line arguments. Its .ratio, .partition_names,
      .output_dir, and .input are read.

    rng:
      Passed through to get_partition_masks(), along with args.ratio.
    '''

    masks = get_partition_masks(full_dataset, args.ratio, rng)

    # All partition files share the input file's extension-less name.
    input_filename = os.path.split(args.input)[1]
    input_stem = os.path.splitext(input_filename)[0]

    for partition_name, mask in safe_izip(args.partition_names, masks):
        partition_filename = "{}_split_{}_{}.npy".format(input_stem,
                                                         args.ratio,
                                                         partition_name)
        partition_path = os.path.join(args.output_dir, partition_filename)

        # One row per nonzero entry of this partition's mask.
        memmap = make_memmap_file(partition_path,
                                  numpy.count_nonzero(mask),
                                  full_dataset.names,
                                  full_dataset.formats)

        for source, tensor_name, tensor_format in safe_izip(
                full_dataset.tensors,
                full_dataset.names,
                full_dataset.formats):
            destination = memmap[tensor_name]
            rows = get_batch_slice(tensor_format, mask)
            destination[...] = source[rows]
Exemplo n.º 3
0
    def allocate_memmap_file(args):
        '''
        Creates a properly shaped & typed .npy memmap file.

        The first frame of each input is transformed to find the cell format
        (which must be the same for all inputs). Then the frames of every
        input are counted, to compute the number of examples to allocate.

        Parameters
        ----------
        args:
          Parsed command-line arguments. Its .inputs, .subsamples, .scales,
          .rotations, and .grids are read here (one entry of each per input
          file). It is also passed to get_output_filepath().

        Returns
        -------
        rval: numpy.memmap
          A recordarray memmap.
        '''
        for input_path in args.inputs:
            assert_true(os.path.isfile(input_path))

        output_filepath = get_output_filepath(args)
        assert_equal(os.path.splitext(output_filepath)[1], '.npy')

        fmt = None

        # The number of cells yielded by a single frame of each input. This
        # is kept per-input, since each input has its own grid argument.
        cells_per_frame_of_inputs = []

        # Get format of each file, make sure they're all the same.
        for (input_path,
             subsample,
             scale,
             rotation,
             grid) in safe_izip(args.inputs,
                                args.subsamples,
                                args.scales,
                                args.rotations,
                                args.grids):
            iterator = FrameIterator(input_path)
            # Builtin next() rather than the Python 2-only .next() method.
            frame = next(iterator)
            cell_batch = transform_image(frame, scale, rotation, grid)
            cells_per_frame_of_inputs.append(cell_batch.shape[0])
            this_fmt = DenseFormat(axes=('b', '0', '1', 'c'),
                                   shape=((-1, ) + cell_batch.shape[1:]),
                                   dtype=cell_batch.dtype)
            if fmt is None:
                fmt = this_fmt
            else:
                assert_equal(this_fmt,
                             fmt,
                             "All video files (after their respective grid, "
                             "scale, and rotation transforms, must yield the "
                             "same image format")

        num_cells = 0

        # Counts and sets num_cells. Each input is paired with its own
        # subsample and cells-per-frame values, rather than with whatever
        # values the loop above happened to leave behind for the last input.
        for (input_path,
             subsample,
             cells_per_frame) in safe_izip(args.inputs,
                                           args.subsamples,
                                           cells_per_frame_of_inputs):
            for frame_number, frame in enumerate(FrameIterator(input_path)):
                if frame_number % subsample == 0:
                    num_cells += cells_per_frame

        return make_memmap_file(path=output_filepath,
                                num_examples=num_cells,
                                tensor_names=['images'],
                                tensor_formats=[fmt])
def main():
    '''
    Prepends a "blank" example to a set of images and labels, and saves the
    result as a single memmap file at args.output.

    The blank example is an all-zeros image, whose label is
    [max category + 1, 0, -1, ..., -1], where "max category" is the largest
    value in the first column of the input labels.
    '''

    args = parse_args()

    images = load_npy_file(args.images)
    labels = load_npy_file(args.labels)

    num_examples = labels.shape[0]

    # Without this check, a mismatch would only surface while copying into
    # the (already allocated) output file, or not at all if images has just
    # one row, which would be silently broadcast to all output rows.
    assert_equal(images.shape[0],
                 num_examples,
                 "The images and labels files contain different numbers of "
                 "examples.")

    images_fmt = DenseFormat(axes=('b', '0', '1', 'c'),
                             shape=((-1, ) + images.shape[1:]),
                             dtype=images.dtype)

    labels_fmt = DenseFormat(axes=('b', 'f'),
                             shape=(-1, labels.shape[1]),
                             dtype=labels.dtype)

    # Validate the formats before allocating the output file, so that bad
    # inputs don't leave a freshly allocated file behind.
    assert_equal(images_fmt.axes.index('b'), 0)
    assert_equal(labels_fmt.axes, ('b', 'f'))
    # The blank label contains -1's, so the labels' dtype must be signed.
    assert_is_subdtype(labels_fmt.dtype, numpy.signedinteger)

    print("Allocating output file.")

    # One extra row, for the blank example at index 0.
    output_memmap = make_memmap_file(args.output,
                                     num_examples + 1,
                                     ['images', 'labels'],
                                     [images_fmt, labels_fmt])

    print("Copying {} images and labels.".format(num_examples))

    max_category = labels[:, 0].max()

    blank_label = numpy.empty(labels.shape[1], dtype=labels.dtype)
    blank_label[0] = max_category + 1
    blank_label[1] = 0
    blank_label[2:] = -1

    output_memmap['images'][0, ...] = 0
    output_memmap['labels'][0, ...] = blank_label

    output_memmap['images'][1:, ...] = images
    output_memmap['labels'][1:, ...] = labels

    print("Wrote output to {}".format(args.output))
Exemplo n.º 5
0
def main():
    '''
    Entry point of this script.

    Splits the memmap dataset at args.input into partitions, and saves each
    partition to its own .npy file in args.output_dir.

    If args.foreground is set, the dataset's first example must be a "blank"
    example: an all-zeros image, labeled [max category + 1, 0, -1, ... -1].
    It is split off before partitioning, then prepended to every partition.
    '''

    args = parse_args()

    full_dataset = MemmapDataset(args.input)

    rng = numpy.random.RandomState(args.seed)

    # Foreground datasets have labels (with 2 axes); others are images-only.
    if args.foreground:
        assert_equal(full_dataset.names, ('images', 'labels'))
        assert_equal(len(full_dataset.formats[1].axes), 2)
    else:
        assert_equal(full_dataset.names, ('images',))

    # Every tensor must have the batch axis first.
    for tensor_format in full_dataset.formats:
        assert_equal(tensor_format.axes[0], 'b')

    assert_equal(len(full_dataset.formats[0].axes), 4)

    if not args.foreground:
        blank_image = numpy.zeros_like(full_dataset.tensors[0][0:1, ...])
    else:
        # Expect (and assert) that the first example is a "blank" example.
        # Slicing with 0:1 keeps the batch axis.
        blank_image = full_dataset.tensors[0][0:1, ...]
        blank_label = full_dataset.tensors[1][0:1, ...]

        # Split off the blank example from full_dataset.
        non_blank_tensors = [tensor[1:, ...]
                             for tensor in full_dataset.tensors]
        full_dataset = Dataset(tensors=non_blank_tensors,
                               names=full_dataset.names,
                               formats=full_dataset.formats)

        # The blank image must be all zeros.
        assert_true(numpy.all(blank_image == 0))

        # The blank label must be [max class + 1, 0, -1, ... -1].
        max_category = full_dataset.tensors[1][:, 0].max()
        assert_equal(blank_label[0, 0], max_category + 1)
        assert_equal(blank_label[0, 1], 0)
        assert_true(numpy.all(blank_label[0, 2:] == -1))

        # No remaining label may start with the blank label's first two
        # entries.
        remaining_ids = full_dataset.tensors[1][:, :2]
        blank_id = blank_label[:, :2]
        assert_false((remaining_ids == blank_id).all(axis=1).any())

    # Either way, full_dataset must now contain no blank images.
    all_images = full_dataset.tensors[0]
    assert_false((all_images == blank_image).all(axis=(1, 2, 3)).any())

    partition_masks = get_partition_masks(full_dataset, args.ratio, rng)

    # All partition files share the input file's extension-less name.
    input_stem = os.path.splitext(os.path.split(args.input)[1])[0]

    for partition_name, partition_mask in safe_izip(args.partition_names,
                                                    partition_masks):
        partition_filename = "{}_split_{}_{}.npy".format(input_stem,
                                                         args.ratio,
                                                         partition_name)
        partition_path = os.path.join(args.output_dir, partition_filename)

        # Foreground partitions get one extra row, for the blank example.
        num_examples = numpy.count_nonzero(partition_mask)
        if args.foreground:
            num_examples += 1

        memmap = make_memmap_file(partition_path,
                                  num_examples,
                                  full_dataset.names,
                                  full_dataset.formats)

        if not args.foreground:
            for source, tensor_name, tensor_format in safe_izip(
                    full_dataset.tensors,
                    full_dataset.names,
                    full_dataset.formats):
                memmap[tensor_name][...] = source[partition_mask, ...]
        else:
            # Row 0 holds the blank example; the partition's rows follow.
            for source, tensor_name, tensor_format, blank in safe_izip(
                    full_dataset.tensors,
                    full_dataset.names,
                    full_dataset.formats,
                    [blank_image, blank_label]):
                destination = memmap[tensor_name]
                destination[0:1, ...] = blank
                destination[1:, ...] = source[partition_mask, ...]

        print("Wrote '{}' partition to {}".format(partition_name,
                                                  partition_path))
def main():
    '''
    Entry point of this script.

    Passes every image file in args.input_dir through transform_image()
    (which presumably cuts it into a batch of grid cells -- confirm against
    its definition), and writes the resulting batches to a single .npy
    memmap file. Unless args.no_shuffle is set, the memmap's rows are then
    shuffled in-place, using a fixed random seed.
    '''

    args = parse_args()

    def get_image_filepaths(directory):
        '''
        Returns the paths of all files directly inside <directory>, sorted by
        file name.
        '''
        # os.listdir() returns names in arbitrary order. Sorting them makes
        # the order of the output (and therefore the result of the
        # fixed-seed shuffle below) reproducible across runs and machines.
        contents = [os.path.join(directory, f)
                    for f in sorted(os.listdir(directory))]
        return [c for c in contents if os.path.isfile(c)]

    image_filepaths = get_image_filepaths(args.input_dir)
    assert_greater(len(image_filepaths), 0)

    image_shape = None
    cell_format = None

    # First pass: get the cell format from the first image, and check that
    # all other images have the same shape as the first.
    for image_filepath in image_filepaths:
        image = imread(image_filepath)

        if image_shape is None:
            image_shape = tuple(image.shape)
            cell_batch_shape = transform_image(image,
                                               args.scale,
                                               args.grid).shape
            cell_format = DenseFormat(axes=['b', '0', '1', 'c'],
                                      shape=(-1, ) + cell_batch_shape[1:],
                                      dtype=image.dtype)
        else:
            assert_equal(tuple(image.shape),
                         image_shape,
                         "found different-shaped images in the image list.")

    cells_per_image = numpy.prod(args.grid)
    num_cells = len(image_filepaths) * cells_per_image

    memmap = make_memmap_file(path=get_output_filepath(args),
                              num_examples=num_cells,
                              tensor_names=['images'],
                              tensor_formats=[cell_format])

    # Second pass: copy each image's cells into its own block of rows.
    for image_number, image_filepath in enumerate(image_filepaths):
        rows_slice = slice(image_number * cells_per_image,
                           (image_number + 1) * cells_per_image)

        image = imread(image_filepath)
        memmap['images'][rows_slice, ...] = transform_image(image,
                                                            args.scale,
                                                            args.grid)
        print("copied from image {} of {}".format(image_number + 1,
                                                  len(image_filepaths)))

    if not args.no_shuffle:
        print("shuffling {} images in-place on disk. This could take some "
              "time if the output lives on a spinning disk, rather than "
              "an SSD or RAM disk".format(memmap.shape[0]))

        rng = numpy.random.RandomState(425399)
        rng.shuffle(memmap)  # shuffles in-place along just the first dimension

        print("... done shuffling.")

    print("Wrote {} images with shape {} to {}.".format(memmap.shape[0],
                                                        cell_format.shape[1:],
                                                        memmap.filename))