def main():
    '''
    Entry point of this script.

    Loads the HDF5 dataset named by args.input and copies each of its
    partitions into a separate .npy memmap file placed next to the input.
    '''
    args = parse_args()
    h5_datasets = load_h5_dataset(args.input)

    def get_output_filepath(input_name, partition_name):
        '''
        Returns "<dir>/<input basename>_<partition_name>.npy".

        Asserts that input_name ends in '.h5'.
        '''
        parent_dir, file_name = os.path.split(input_name)
        stem, suffix = os.path.splitext(file_name)
        assert_equal(suffix, '.h5')
        npy_name = "{}_{}{}".format(stem, partition_name, '.npy')
        return os.path.join(parent_dir, npy_name)

    # h5_datasets exposes `_fields`, so it is presumably a namedtuple whose
    # field names double as partition names.
    for dataset, partition_name in safe_izip(h5_datasets,
                                             h5_datasets._fields):
        destination = get_output_filepath(args.input, partition_name)
        memmap = make_memmap_file(destination,
                                  dataset.num_examples(),
                                  dataset.names,
                                  dataset.formats)

        targets = [memmap[name] for name in dataset.names]

        # Copy every tensor of this partition into the matching memmap field.
        for source, target in safe_izip(dataset.tensors, targets):
            assert_equal(target.shape, source.shape)
            target[...] = source

        print("Wrote {}".format(destination))
def write_memmaps(full_dataset, args, rng):
    '''
    Splits full_dataset into partitions and writes each one to a memmap file.

    Parameters
    ----------
    full_dataset:
      Dataset providing .tensors, .names and .formats.

    args:
      Parsed command-line arguments. Reads .ratio, .partition_names,
      .output_dir and .input.

    rng:
      Random number generator, passed through to get_partition_masks().
    '''

    def get_partition_path(output_dir, input_path, partition_name):
        '''
        Returns "<output_dir>/<input stem>_split_<ratio>_<partition>.npy".
        '''
        stem, _ = os.path.splitext(os.path.basename(input_path))
        file_name = "{}_split_{}_{}.npy".format(stem,
                                                args.ratio,
                                                partition_name)
        return os.path.join(output_dir, file_name)

    masks = get_partition_masks(full_dataset, args.ratio, rng)

    for partition_name, mask in safe_izip(args.partition_names, masks):
        destination = get_partition_path(args.output_dir,
                                         args.input,
                                         partition_name)

        # One output row per nonzero entry of the mask.
        memmap = make_memmap_file(destination,
                                  numpy.count_nonzero(mask),
                                  full_dataset.names,
                                  full_dataset.formats)

        for source, name, fmt in safe_izip(full_dataset.tensors,
                                           full_dataset.names,
                                           full_dataset.formats):
            target = memmap[name]
            # get_batch_slice() builds the index used to select this
            # partition's examples from a tensor laid out as `fmt`.
            selection = get_batch_slice(fmt, mask)
            target[...] = source[selection]
def allocate_memmap_file(args):
    '''
    Creates a properly shaped & typed memmap file for the output cells.

    Inspects the first frame of every input video to determine the common
    cell format, counts how many cells all inputs will yield after
    subsampling, and allocates a memmap with a single 'images' tensor of
    that size.

    Parameters
    ----------
    args:
      Parsed command-line arguments. Reads the parallel sequences .inputs,
      .subsamples, .scales, .rotations and .grids; args is also passed to
      get_output_filepath().

    Returns
    -------
    rval: numpy.memmap
      A recordarray memmap (the return value of make_memmap_file()).
    '''
    for input_path in args.inputs:
        assert_true(os.path.isfile(input_path))

    output_filepath = get_output_filepath(args)
    assert_equal(os.path.splitext(output_filepath)[1], '.npy')

    fmt = None

    # Number of cells produced per frame, one entry per input. Inputs may use
    # different grids, so this must be tracked per input.
    cells_per_frame_by_input = []

    # Get format of each file, make sure they're all the same.
    for (input_path,
         _subsample,
         scale,
         rotation,
         grid) in safe_izip(args.inputs,
                            args.subsamples,
                            args.scales,
                            args.rotations,
                            args.grids):
        iterator = FrameIterator(input_path)
        frame = iterator.next()
        cell_batch = transform_image(frame, scale, rotation, grid)
        cells_per_frame_by_input.append(cell_batch.shape[0])

        this_fmt = DenseFormat(axes=('b', '0', '1', 'c'),
                               shape=((-1, ) + cell_batch.shape[1:]),
                               dtype=cell_batch.dtype)
        if fmt is None:
            fmt = this_fmt
        else:
            assert_equal(this_fmt,
                         fmt,
                         "All video files (after their respective grid, "
                         "scale, and rotation transforms, must yield the "
                         "same image format")

    num_cells = 0

    # Counts and sets num_cells. Each input is paired with its OWN subsample
    # rate and cells-per-frame; relying on the variables left over from the
    # loop above would apply the last input's values to every input.
    for (input_path,
         subsample,
         cells_per_frame) in safe_izip(args.inputs,
                                       args.subsamples,
                                       cells_per_frame_by_input):
        for frame_number, _frame in enumerate(FrameIterator(input_path)):
            if frame_number % subsample == 0:
                num_cells += cells_per_frame

    return make_memmap_file(path=output_filepath,
                            num_examples=num_cells,
                            tensor_names=['images'],
                            tensor_formats=[fmt])
def main():
    '''
    Entry point of this script.

    Loads the images and labels arrays named by args.images / args.labels
    and writes them to a memmap file at args.output, preceded by one "blank"
    example: an all-zero image whose label is
    [max category + 1, 0, -1, ..., -1].
    '''
    args = parse_args()

    images = load_npy_file(args.images)
    labels = load_npy_file(args.labels)

    # Without this check, a single-image input would be silently broadcast
    # across all label rows by the bulk assignment below.
    assert_equal(images.shape[0], labels.shape[0])

    num_examples = labels.shape[0]

    images_fmt = DenseFormat(axes=('b', '0', '1', 'c'),
                             shape=((-1, ) + images.shape[1:]),
                             dtype=images.dtype)

    labels_fmt = DenseFormat(axes=('b', 'f'),
                             shape=(-1, labels.shape[1]),
                             dtype=labels.dtype)

    # Validate the inputs BEFORE allocating the output, so that a failed
    # check does not leave a useless file behind.
    assert_equal(images_fmt.axes.index('b'), 0)
    assert_equal(labels_fmt.axes, ('b', 'f'))
    # The blank label uses -1 as a filler value, which needs a signed dtype.
    assert_is_subdtype(labels_fmt.dtype, numpy.signedinteger)

    print("Allocating output file.")

    # One extra row for the blank example stored at index 0.
    output_memmap = make_memmap_file(args.output,
                                     num_examples + 1,
                                     ['images', 'labels'],
                                     [images_fmt, labels_fmt])

    print("Copying {} images and labels.".format(num_examples))

    # The blank example gets a category one past the largest existing one.
    max_category = labels[:, 0].max()
    blank_label = numpy.empty(labels.shape[1], dtype=labels.dtype)
    blank_label[0] = max_category + 1
    blank_label[1] = 0
    blank_label[2:] = -1

    output_memmap['images'][0, ...] = 0
    output_memmap['labels'][0, ...] = blank_label

    output_memmap['images'][1:, ...] = images
    output_memmap['labels'][1:, ...] = labels

    print("Wrote output to {}".format(args.output))
def main():
    '''
    Entry point of this script.

    Loads the memmap dataset named by args.input, checks its layout, splits
    it into partitions according to args.ratio, and writes each partition to
    its own .npy memmap file under args.output_dir. For foreground datasets,
    the leading "blank" example is split off first and re-inserted as the
    first example of every partition.
    '''
    args = parse_args()

    full_dataset = MemmapDataset(args.input)
    rng = numpy.random.RandomState(args.seed)

    # Foreground datasets carry labels; background ones are images only.
    expected_names = ('images', 'labels') if args.foreground else ('images',)
    assert_equal(full_dataset.names, expected_names)
    if args.foreground:
        assert_equal(len(full_dataset.formats[1].axes), 2)

    # Every tensor must be batch-major.
    for tensor_format in full_dataset.formats:
        assert_equal(tensor_format.axes[0], 'b')

    assert_equal(len(full_dataset.formats[0].axes), 4)

    # If this is a foreground dataset, expect (and assert) that the first
    # example is a "blank" image.
    if args.foreground:
        original_tensors = full_dataset.tensors
        blank_image = original_tensors[0][0:1, ...]
        blank_label = original_tensors[1][0:1, ...]

        # Split off the blank example from full_dataset.
        remaining_tensors = [tensor[1:, ...] for tensor in original_tensors]
        full_dataset = Dataset(tensors=remaining_tensors,
                               names=full_dataset.names,
                               formats=full_dataset.formats)

        # The blank image must be all zeros.
        assert_true(numpy.all(blank_image == 0))

        # The blank label must be [max class + 1, 0, -1, ... -1].
        max_category = full_dataset.tensors[1][:, 0].max()
        assert_equal(blank_label[0, 0], max_category + 1)
        assert_equal(blank_label[0, 1], 0)
        assert_true(numpy.all(blank_label[0, 2:] == -1))

        # No remaining example may carry the blank-object label.
        id_labels = full_dataset.tensors[1][:, :2]
        blank_id = blank_label[:, :2]
        assert_false((id_labels == blank_id).all(axis=1).any())
    else:
        blank_image = numpy.zeros_like(full_dataset.tensors[0][0:1, ...])

    # Check that full_dataset has no blank images.
    images = full_dataset.tensors[0]
    assert_false((images == blank_image).all(axis=(1, 2, 3)).any())

    def partition_path_for(partition_name):
        ''' Returns the output .npy path for the named partition. '''
        stem = os.path.splitext(os.path.split(args.input)[1])[0]
        file_name = "{}_split_{}_{}.npy".format(stem,
                                                args.ratio,
                                                partition_name)
        return os.path.join(args.output_dir, file_name)

    masks = get_partition_masks(full_dataset, args.ratio, rng)

    for partition_name, mask in safe_izip(args.partition_names, masks):
        partition_path = partition_path_for(partition_name)

        # Foreground partitions get one extra row for the blank example.
        num_examples = numpy.count_nonzero(mask)
        if args.foreground:
            num_examples += 1

        memmap = make_memmap_file(partition_path,
                                  num_examples,
                                  full_dataset.names,
                                  full_dataset.formats)

        if args.foreground:
            for (source,
                 name,
                 _fmt,
                 blank) in safe_izip(full_dataset.tensors,
                                     full_dataset.names,
                                     full_dataset.formats,
                                     [blank_image, blank_label]):
                target = memmap[name]
                target[0:1, ...] = blank
                target[1:, ...] = source[mask, ...]
        else:
            for source, name, _fmt in safe_izip(full_dataset.tensors,
                                                full_dataset.names,
                                                full_dataset.formats):
                memmap[name][...] = source[mask, ...]

        print("Wrote '{}' partition to {}".format(partition_name,
                                                  partition_path))
def main():
    '''
    Entry point of this script.

    Reads every image file in args.input_dir, cuts each one into cells with
    transform_image(), and writes all cells to a memmap file with a single
    'images' tensor. Unless args.no_shuffle is set, the cells are then
    shuffled in-place on disk with a fixed seed.
    '''
    args = parse_args()

    def get_image_filepaths(directory):
        '''
        Returns the paths of all regular files in <directory>, sorted by name.
        '''
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # output (and the fixed-seed shuffle below) reproducible.
        contents = [os.path.join(directory, f)
                    for f in sorted(os.listdir(directory))]
        return [c for c in contents if os.path.isfile(c)]

    image_filepaths = get_image_filepaths(args.input_dir)
    assert_greater(len(image_filepaths), 0)

    image_shape = None
    cell_format = None

    # Derive the cell format from the first image, and check that every other
    # image has the same shape.
    for image_filepath in image_filepaths:
        image = imread(image_filepath)
        if image_shape is None:
            image_shape = numpy.asarray(image.shape)
            cell_batch_shape = transform_image(image,
                                               args.scale,
                                               args.grid).shape
            # NOTE(review): the dtype is taken from the raw image, not from
            # the transformed cells -- confirm transform_image() preserves
            # the dtype.
            cell_format = DenseFormat(axes=['b', '0', '1', 'c'],
                                      shape=(-1, ) + cell_batch_shape[1:],
                                      dtype=image.dtype)
        else:
            assert_equal(tuple(image.shape),
                         tuple(image_shape),
                         "found different-shaped images in the image list.")

    cells_per_image = numpy.prod(args.grid)
    num_cells = len(image_filepaths) * cells_per_image

    memmap = make_memmap_file(path=get_output_filepath(args),
                              num_examples=num_cells,
                              tensor_names=['images'],
                              tensor_formats=[cell_format])

    for image_number, image_filepath in enumerate(image_filepaths):
        # Each image fills a contiguous run of cells_per_image rows.
        rows_slice = slice(image_number * cells_per_image,
                           (image_number + 1) * cells_per_image)
        image = imread(image_filepath)
        memmap['images'][rows_slice, ...] = transform_image(image,
                                                            args.scale,
                                                            args.grid)
        print("copied from image {} of {}".format(image_number + 1,
                                                  len(image_filepaths)))

    if not args.no_shuffle:
        print("shuffling {} images in-place on disk. This could take some "
              "time if the output lives on a spinning disk, rather than "
              "an SSD or RAM disk".format(memmap.shape[0]))
        rng = numpy.random.RandomState(425399)
        rng.shuffle(memmap)  # shuffles in-place along just the first dimension
        print("... done shuffling.")

    print("Wrote {} images with shape {} to {}.".format(memmap.shape[0],
                                                        cell_format.shape[1:],
                                                        memmap.filename))