def start_refreshing_shared_dataset(self, shared_dataset_index, offset=None,
                                    dataset_index=None, transform=True,
                                    wait=False):
    if offset is None or dataset_index is None:
        if self.make_dataset_offset is None:
            raise ValueError(
                "Data loader wasn't given offset & which dataset to refresh, "
                "but can't make offsets itself.")
        dataset_index, offset = self.make_dataset_offset(self.datasets)
    message = "DataLoader decided to load dataset #{0} at offset {1}".format(
        dataset_index, offset)
    logger.debug(message)
    input_slice, output_slice = get_slices_from_dataset_offset(
        offset, self.input_shape, self.output_shape)
    dataset_metadata = dict(real=dataset_index, shared=shared_dataset_index,
                            offset=offset)

    def pool_callback(return_value):
        return self.ready_shared_datasets.append(dataset_metadata)

    kwargs_for_refresh = dict(
        index_of_shared=shared_dataset_index,
        index_of_which_dataset=dataset_index,
        input_slice=input_slice,
        output_slice=output_slice,
        transform=transform,
        make_dataset_offset=self.make_dataset_offset,
    )
    async_result = self.pool.apply_async(
        func=execute_function,
        kwds=dict(function=update_shared_dataset,
                  function_kwargs=kwargs_for_refresh),
        callback=pool_callback,
    )
    if wait:
        final_result = async_result.get()
        if final_result is not None:
            print(final_result)  # probably an error
    return shared_dataset_index, async_result
def reopen_libdvid_voxelsaccessor_dataset(dataset):
    opened_dataset = dict(dataset)
    for key in dataset:
        dataset_value = dataset[key]
        if type(dataset_value) is VoxelsAccessor:
            hostname = dataset_value.hostname
            uuid = dataset_value.uuid
            data_name = dataset_value.data_name
            new_voxels_accessor = VoxelsAccessor(hostname, uuid, data_name)
            opened_dataset[key] = new_voxels_accessor
            logger.debug('opened {} at {} from {}'.format(data_name, uuid, hostname))
    yield opened_dataset
def reopen_h5py_dataset(dataset):
    opened_dataset = dict(dataset)
    for key in dataset:
        dataset_value = dataset[key]
        if type(dataset_value) is h5py.Dataset:
            h5_file_path = dataset_value.file.filename
            h5_dataset_key = dataset_value.name
            array_view = get_array_view_of_hdf5_dataset(h5_file_path, h5_dataset_key)
            opened_dataset[key] = array_view
            logger.debug('opened {} in {}'.format(h5_dataset_key, h5_file_path))
    yield opened_dataset
    for key in opened_dataset:
        if type(opened_dataset[key]) is h5py.Dataset:
            logger.debug('closing {} in {}'.format(opened_dataset[key].name,
                                                   opened_dataset[key].file.filename))
            opened_dataset[key].file.close()
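# --- Illustrative note (assumption) -------------------------------------------
# reopen_h5py_dataset and reopen_libdvid_voxelsaccessor_dataset yield exactly
# once, which is the shape contextlib.contextmanager expects; update_shared_dataset
# below consumes them through a reopen_dataset() helper in a `with` statement.
# A minimal sketch of such a wrapper, assuming this dispatch logic (it is not
# shown in this file), might look like:
#
#   import contextlib
#
#   @contextlib.contextmanager
#   def reopen_dataset(dataset):
#       # Choose a reopen strategy from the value types present in the dataset.
#       if any(type(value) is h5py.Dataset for value in dataset.values()):
#           for opened in reopen_h5py_dataset(dataset):
#               yield opened
#       else:
#           for opened in reopen_libdvid_voxelsaccessor_dataset(dataset):
#               yield opened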
def start_refreshing_shared_dataset(self, shared_dataset_index, offset=None,
                                    dataset_index=None, transform=True,
                                    wait=False):
    if offset is None or dataset_index is None:
        if self.make_dataset_offset is None:
            raise ValueError(
                "Data loader wasn't given offset & which dataset to refresh, "
                "but can't make offsets itself.")
        dataset_index, offset = self.make_dataset_offset(self.datasets)
    message = "DataLoader decided to load dataset #{0} at offset {1}".format(
        dataset_index, offset)
    logger.debug(message)
    input_slice, output_slice = get_slices_from_dataset_offset(
        offset, self.input_shape, self.output_shape)
    dataset_metadata = dict(real=dataset_index, shared=shared_dataset_index,
                            offset=offset, transform=transform)
    self.refreshes_in_progress.append(dataset_metadata)

    def pool_callback(return_value):
        # Move this shared slot from "in progress" to "ready" once the worker
        # finishes refreshing it.
        self.refreshes_in_progress.remove(dataset_metadata)
        return self.ready_shared_datasets.append(dataset_metadata)

    kwargs_for_refresh = dict(
        index_of_shared=shared_dataset_index,
        index_of_which_dataset=dataset_index,
        input_slice=input_slice,
        output_slice=output_slice,
        transform=transform,
        make_dataset_offset=self.make_dataset_offset,
    )
    async_result = self.pool.apply_async(
        func=execute_function,
        kwds=dict(
            function=update_shared_dataset,
            function_kwargs=kwargs_for_refresh,
        ),
        callback=pool_callback)
    if wait:
        final_result = async_result.get()
        if final_result is not None:
            print(final_result)  # probably an error
    return shared_dataset_index, async_result
def __init__(self, size, datasets, input_shape, output_shape=None,
             n_workers=1, dataset_offset_func=None):
    self.size = size
    self.datasets = datasets
    self.input_shape = input_shape
    self.outputs_are_ignored = output_shape is None
    self.output_shape = output_shape or (0, 0, 0)
    self.make_dataset_offset = dataset_offset_func
    self._list = list()
    self.shapes = {
        "data": (1,) + self.input_shape,
        "components": (1,) + self.output_shape,
        "label": (3,) + self.output_shape,
        "mask": (1,) + self.output_shape,
    }
    self.dtypes = {
        "data": np.float32,
        "components": np.int32,
        "label": np.int32,
        "mask": np.uint8,
    }
    self.keys_to_ignore = []
    if self.outputs_are_ignored:
        self.keys_to_ignore = ["label", "components", "mask"]
    for output_key in self.keys_to_ignore:
        self.dtypes.pop(output_key)
        self.shapes.pop(output_key)
    sizes = dict()
    for key, shape in self.shapes.iteritems():
        sizes[key] = reduce(mul, shape)
    logger.debug("sizes: {}".format(sizes))
    # Allocate one flat, lock-free shared-memory array per key for each of the
    # `size` shared dataset slots, so worker processes can fill them in place.
    self.shared_datasets = []
    for n in range(size):
        shared_dataset = dict()
        for key in self.dtypes:
            array_size = sizes[key]
            dtype = self.dtypes[key]
            ctype = type(np.ctypeslib.as_ctypes(dtype(0)))
            message = "creating {key}'s multiprocessing.Array with ctype {c} " \
                      "and size {s}".format(key=key, c=ctype, s=array_size)
            logger.debug(message)
            shared_dataset[key] = multiprocessing.Array(ctype, array_size,
                                                        lock=False)
        self.shared_datasets.append(shared_dataset)
    self.pool = multiprocessing.Pool(
        processes=n_workers,
        initializer=self._initialize_pool,
        initargs=(),
        maxtasksperchild=1000,
    )
    self.ready_shared_datasets = []
    return
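# --- Usage sketch (illustrative only) ------------------------------------------
# A hypothetical example of constructing the loader and refreshing shared slot 0.
# The class name DataLoader is inferred from the log messages, and the HDF5 file,
# dataset keys, shapes, and my_offset_func below are made up for illustration;
# only the constructor and start_refreshing_shared_dataset signatures above are
# assumed.
#
#   import h5py
#   f = h5py.File("training_volume.h5", "r")                # hypothetical file
#   datasets = [dict(name="volume0",
#                    data=f["volume/data"],                  # hypothetical keys
#                    components=f["volume/components"],
#                    nhood=malis.mknhood3d())]
#   loader = DataLoader(size=2, datasets=datasets,
#                       input_shape=(80, 80, 80), output_shape=(60, 60, 60),
#                       n_workers=2, dataset_offset_func=my_offset_func)
#   # my_offset_func(datasets) must return (dataset_index, offset).
#   loader.start_refreshing_shared_dataset(0, offset=(0, 0, 0),
#                                          dataset_index=0, wait=True)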
def simple_augment_minibatch(dataset_numpy):
    message = "before simple aug {}... \t{: <25}{}\t{: <25}{}\t{: <25}{}" \
        .format((0, 0, 0, 0),
                dataset_numpy["data"].shape, dataset_numpy["data"].mean(),
                dataset_numpy["components"].shape,
                dataset_numpy["components"].mean(),
                dataset_numpy["mask"].shape, dataset_numpy["mask"].mean())
    logger.debug(message)
    reflectx, reflecty, reflectz, swapxy = np.random.randint(low=0, high=2, size=4)
    dataset_numpy = reflect_and_swap_dataset(dataset_numpy, reflectx, reflecty,
                                             reflectz, swapxy)
    message = "after simple aug {}... \t{: <25}{}\t{: <25}{}\t{: <25}{}" \
        .format((reflectx, reflecty, reflectz, swapxy),
                dataset_numpy["data"].shape, dataset_numpy["data"].mean(),
                dataset_numpy["components"].shape,
                dataset_numpy["components"].mean(),
                dataset_numpy["mask"].shape, dataset_numpy["mask"].mean())
    logger.debug(message)
    return dataset_numpy
def _initialize_shared_memory_arrays(self):
    shared_datasets = []
    sizes = dict()
    for key, shape in self.shapes.iteritems():
        sizes[key] = reduce(mul, shape)
    logger.debug("sizes: {}".format(sizes))
    for n in range(self.size):
        shared_dataset = dict()
        for key in self.dtypes:
            size = sizes[key]
            dtype = self.dtypes[key]
            ctype = type(np.ctypeslib.as_ctypes(dtype(0)))
            message = "creating {key}'s multiprocessing.Array with ctype {c} " \
                      "and size {s}".format(key=key, c=ctype, s=size)
            logger.debug(message)
            shared_dataset[key] = multiprocessing.Array(ctype, size, lock=False)
        shared_datasets.append(shared_dataset)
    return shared_datasets
def update_shared_dataset(index_of_shared, index_of_which_dataset, input_slice,
                          output_slice, transform=True, make_dataset_offset=None):
    start_time = time.time()
    # shared_datasets, datasets and dtypes are module-level globals installed in
    # the worker processes by the pool initializer.
    shared_dataset = shared_datasets[index_of_shared]
    dataset_is_ready = False
    while not dataset_is_ready:
        original_dataset = datasets[index_of_which_dataset]
        with reopen_dataset(original_dataset) as opened_dataset:
            dataset_numpy = get_numpy_dataset(opened_dataset, input_slice,
                                              output_slice, transform)
        if "mask" in dataset_numpy:
            mask_threshold = float(original_dataset.get("mask_threshold", 0))
            mask_fraction_of_this_batch = np.mean(dataset_numpy["mask"])
            if mask_fraction_of_this_batch <= mask_threshold:
                if make_dataset_offset is not None:
                    index_of_which_dataset, offset = make_dataset_offset(datasets)
                    input_shape = tuple([s.stop - s.start for s in input_slice])
                    if output_slice is not None:
                        output_shape = tuple([s.stop - s.start for s in output_slice])
                    else:
                        output_shape = None
                    input_slice, output_slice = get_slices_from_dataset_offset(
                        offset, input_shape, output_shape)
                    message = "Skipping source dataset #{0} at output_slice {1} with mask {2}".format(
                        index_of_which_dataset, output_slice,
                        "%06.4f" % mask_fraction_of_this_batch)
                    logger.debug(message)
                else:
                    return ("DataLoader worker encountered a 100% masked "
                            "datachunk, but doesn't know how to replace it.")
            else:
                message = "Using dataset #{0} at output_slice {1} with mask {2}".format(
                    index_of_which_dataset, output_slice,
                    "%06.4f" % mask_fraction_of_this_batch)
                logger.debug(message)
                dataset_is_ready = True
        else:
            dataset_is_ready = True
    for key in shared_dataset:
        source_array = dataset_numpy[key].astype(dtypes[key])
        target_mp_array = shared_dataset[key]
        message = "storing dataset_numpy['{key}'] ({dt}, {shape})".format(
            key=key, dt=source_array.dtype, shape=source_array.shape)
        logger.debug(message)
        target_mp_array[:] = source_array.flatten()
    message = "Refreshing DataLoader dataset #{0} took {1}".format(
        index_of_shared, "%05.2fs" % (time.time() - start_time))
    logger.debug(message)
    return
def update_shared_dataset(index_of_shared, index_of_which_dataset, input_slice,
                          output_slice, transform=True, make_dataset_offset=None):
    start_time = time.time()
    shared_dataset = shared_datasets[index_of_shared]
    dataset_is_ready = False
    while not dataset_is_ready:
        original_dataset = datasets[index_of_which_dataset]
        with reopen_dataset(original_dataset) as opened_dataset:
            dataset_numpy = get_numpy_dataset(opened_dataset, input_slice,
                                              output_slice, transform)
        if dataset_numpy is None:
            # get_numpy_dataset returns None when the chunk's mask fraction is
            # below the dataset's mask_threshold, so pick a new offset and retry.
            message = "Skipping dataset #{0} at output_slice {1}".format(
                index_of_which_dataset, output_slice)
            logger.debug(message)
            if make_dataset_offset is not None:
                index_of_which_dataset, offset = make_dataset_offset(datasets)
                input_shape = tuple([s.stop - s.start for s in input_slice])
                if output_slice is not None:
                    output_shape = tuple([s.stop - s.start for s in output_slice])
                else:
                    output_shape = None
                input_slice, output_slice = get_slices_from_dataset_offset(
                    offset, input_shape, output_shape)
            else:
                return ("DataLoader worker encountered a 100% masked "
                        "datachunk, but doesn't know how to replace it.")
        else:
            message = "Using dataset #{0} at output_slice {1}".format(
                index_of_which_dataset, output_slice)
            logger.debug(message)
            dataset_is_ready = True
    for key in shared_dataset:
        source_array = dataset_numpy[key].astype(dtypes[key])
        target_mp_array = shared_dataset[key]
        message = "storing dataset_numpy['{key}'] ({dt}, {shape})".format(
            key=key, dt=source_array.dtype, shape=source_array.shape)
        logger.debug(message)
        target_mp_array[:] = source_array.flatten()
    message = "Refreshing DataLoader dataset #{0} took {1}".format(
        index_of_shared, "%05.2fs" % (time.time() - start_time))
    logger.debug(message)
    return
def get_numpy_dataset(original_dataset, input_slice, output_slice, transform):
    dataset_numpy = dict()
    n_spatial_dimensions = len(input_slice)
    image_slices = [slice(0, l) for l in original_dataset['data'].shape]
    image_slices[-n_spatial_dimensions:] = input_slice
    logger.debug("image_slices: {}".format(image_slices))
    source_image = get_zero_padded_array_slice(original_dataset['data'], image_slices)
    image = np.array(source_image, dtype=np.float32)
    image_scaling_factor = original_dataset.get('image_scaling_factor', None)
    if image_scaling_factor is None and source_image.dtype.kind in ('i', 'u'):
        # integer, signed or unsigned
        image_scaling_factor = 0.5 ** 8
        message = "Data reader is scaling your image data by a factor of 1/256 " \
                  "because it's an integer data type and no scaling factor was " \
                  "provided. If you don't like this default behavior, then provide " \
                  "a dataset['image_scaling_factor'] key-value pair in your " \
                  "training dataset."
        warnings.warn(message)
    if image_scaling_factor is not None:
        if image_scaling_factor == 1.0:
            # congratulations, you have successfully prevented data scaling
            pass
        else:
            logger.debug("Scaling image by {isf}".format(isf=image_scaling_factor))
            image = np.multiply(image, image_scaling_factor)
    if transform:
        if 'transform' in original_dataset:
            lo, hi = original_dataset['transform']['scale']
            image = 0.5 + (image - 0.5) * np.random.uniform(low=lo, high=hi)
            lo, hi = original_dataset['transform']['shift']
            image = image + np.random.uniform(low=lo, high=hi)
        else:
            logger.debug("source data doesn't have 'transform' attribute.")
    if image.ndim == n_spatial_dimensions:
        new_shape = (1,) + image.shape
        image = image.reshape(new_shape)
    dataset_numpy['data'] = image
    # load outputs if desired
    if output_slice is not None:
        component_erosion_steps = original_dataset.get('component_erosion_steps', 0)
        dilation_amount = 1 + component_erosion_steps
        dilated_output_slices = tuple(
            slice(s.start - dilation_amount, s.stop + dilation_amount, s.step)
            for s in output_slice)
        components, affinities, mask = get_outputs(original_dataset,
                                                   dilated_output_slices)
        de_dilation_slices = (Ellipsis,) + tuple(
            slice(dilation_amount, -dilation_amount) for _ in output_slice)
        dataset_numpy['components'] = components[de_dilation_slices]
        dataset_numpy['label'] = affinities[de_dilation_slices]
        dataset_numpy['mask'] = mask[de_dilation_slices]
    return dataset_numpy
def get_outputs(original_dataset, output_slice):
    output_shape = tuple([slice_.stop - slice_.start for slice_ in output_slice])
    n_spatial_dimensions = len(output_slice)
    components_shape = (1,) + output_shape
    mask_shape = (1,) + output_shape
    affinities_shape = (n_spatial_dimensions,) + output_shape
    component_slices = [slice(0, l) for l in original_dataset['components'].shape]
    component_slices[-n_spatial_dimensions:] = output_slice
    logger.debug("component_slices: {}".format(component_slices))
    components_array = get_zero_padded_array_slice(original_dataset['components'],
                                                   component_slices)
    source_class = type(original_dataset['components'])
    components_are_from_dvid = source_class in dvid_classes
    exclude_strings = original_dataset.get('body_names_to_exclude', [])
    if exclude_strings and components_are_from_dvid:
        dvid_uuid = original_dataset['components'].uuid
        components_to_keep = get_good_components(dvid_uuid, exclude_strings)
        logger.debug("components before: {}".format(list(np.unique(components_array))))
        components_array = replace_array_except_whitelist(components_array, 0,
                                                          components_to_keep)
        logger.debug("components after: {}".format(list(np.unique(components_array))))
    minimum_component_size = original_dataset.get('minimum_component_size', 0)
    if minimum_component_size > 0:
        components_array = replace_infrequent_values(components_array,
                                                     minimum_component_size, 0)
    component_erosion_steps = original_dataset.get('component_erosion_steps', 0)
    if component_erosion_steps > 0:
        components_array = erode_value_blobs(components_array,
                                             steps=component_erosion_steps,
                                             values_to_ignore=(0,))
    components_for_malis = components_array.reshape(output_shape)
    affinities_from_components = malis.seg_to_affgraph(components_for_malis,
                                                       original_dataset['nhood'])
    components_array, _ = malis.connected_components_affgraph(
        affinities_from_components, original_dataset['nhood'])
    components_array = shift_up_component_values(components_array)
    components_array = components_array.reshape(components_shape)
    if 'label' in original_dataset:
        label_shape = original_dataset['label'].shape
        label_slices = [slice(0, l) for l in label_shape]
        label_slices[-n_spatial_dimensions:] = output_slice
        affinities_array = get_zero_padded_array_slice(original_dataset['label'],
                                                       label_slices)
    else:
        # compute affinities from components
        logger.debug("Computing affinity labels from components because "
                     "'label' wasn't provided in data source.")
        affinities_array = affinities_from_components
    assert affinities_array.shape == affinities_shape, \
        "affinities_array.shape is {actual} but should be {desired}".format(
            actual=str(affinities_array.shape), desired=str(affinities_shape))
    if 'mask' in original_dataset:
        mask_array = get_zero_padded_array_slice(original_dataset['mask'],
                                                 output_slice)
    else:
        if components_are_from_dvid:
            # infer mask values: 1 if component is nonzero, 0 otherwise
            mask_array = np.not_equal(components_array, 0)
            logger.debug("No mask provided. Setting to 1 where components != 0.")
        else:
            # assume no masking
            mask_array = np.ones_like(components_array, dtype=np.uint8)
            logger.debug("No mask provided. Setting to 1 where outputs exist.")
    mask_dilation_steps = original_dataset.get('mask_dilation_steps', 0)
    if mask_dilation_steps > 0:
        mask_array = ndimage.binary_dilation(mask_array,
                                             iterations=mask_dilation_steps)
    mask_array = mask_array.astype(np.uint8)
    mask_array = mask_array.reshape(mask_shape)
    return components_array, affinities_array, mask_array
def get_numpy_dataset(original_dataset, input_slice, output_slice, transform):
    dataset_numpy = dict()
    dataset_numpy["name"] = "{}_at_input_{}_and_output_{}".format(
        original_dataset.get("name", "Untitled"), input_slice, output_slice)
    n_spatial_dimensions = len(input_slice)
    image_slices = [slice(0, l) for l in original_dataset['data'].shape]
    image_slices[-n_spatial_dimensions:] = input_slice
    logger.debug("image_slices: {}".format(image_slices))
    image_is_zero_padded = original_dataset.get("image_is_zero_padded", False)
    if image_is_zero_padded:
        source_image = original_dataset["data"][image_slices]
    else:
        source_image = get_zero_padded_array_slice(original_dataset['data'],
                                                   image_slices)
    image = np.array(source_image, dtype=np.float32)
    image_scaling_factor = original_dataset.get('image_scaling_factor', None)
    if image_scaling_factor is None and source_image.dtype.kind in ('i', 'u'):
        # integer, signed or unsigned
        image_scaling_factor = 0.5 ** 8
        message = "Data reader is scaling your image data by a factor of 1/256 " \
                  "because it's an integer data type and no scaling factor was " \
                  "provided. If you don't like this default behavior, then provide " \
                  "a dataset['image_scaling_factor'] key-value pair in your " \
                  "training dataset."
        warnings.warn(message)
    if image_scaling_factor is not None:
        if image_scaling_factor == 1.0:
            # congratulations, you have successfully prevented data scaling
            pass
        else:
            logger.debug("Scaling image by {isf}".format(isf=image_scaling_factor))
            image = np.multiply(image, image_scaling_factor)
    if transform:
        if 'transform' in original_dataset:
            lo, hi = original_dataset['transform']['scale']
            image = 0.5 + (image - 0.5) * np.random.uniform(low=lo, high=hi)
            lo, hi = original_dataset['transform']['shift']
            image = image + np.random.uniform(low=lo, high=hi)
        else:
            logger.debug("source data doesn't have 'transform' attribute.")
    if image.ndim == n_spatial_dimensions:
        new_shape = (1,) + image.shape
        image = image.reshape(new_shape)
    # load outputs if desired
    if output_slice is not None:
        component_erosion_steps = original_dataset.get('component_erosion_steps', 0)
        dilation_amount = 1 + component_erosion_steps
        dilated_output_slices = tuple(
            slice(s.start - dilation_amount, s.stop + dilation_amount, s.step)
            for s in output_slice)
        de_dilation_slices = (Ellipsis,) + tuple(
            slice(dilation_amount, -dilation_amount) for _ in output_slice)
        components, affinities, mask = get_outputs(original_dataset,
                                                   dilated_output_slices)
        mask_threshold = float(original_dataset.get('mask_threshold', 0))
        mask_fraction_of_this_batch = np.mean(mask[de_dilation_slices])
        good_enough = mask_fraction_of_this_batch > mask_threshold
        if not good_enough:
            return None
        simple_augment = original_dataset.get("simple_augment", False)
        if simple_augment:
            dataset_to_augment = dict(name=dataset_numpy["name"],
                                      data=image,
                                      components=components,
                                      mask=mask,
                                      nhood=original_dataset['nhood'])
            augmented_dilated_dataset = simple_augment_minibatch(dataset_to_augment)
            components = augmented_dilated_dataset["components"]
            affinities = augmented_dilated_dataset["label"]
            mask = augmented_dilated_dataset["mask"]
            image = augmented_dilated_dataset["data"]
        dataset_numpy['components'] = components[de_dilation_slices]
        dataset_numpy['label'] = affinities[de_dilation_slices]
        dataset_numpy['mask'] = mask[de_dilation_slices]
        dataset_numpy['nhood'] = original_dataset['nhood']
    dataset_numpy['data'] = image
    return dataset_numpy
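# --- Usage sketch (illustrative only) ------------------------------------------
# A minimal sketch of calling get_numpy_dataset directly on an in-memory dataset.
# The array shapes, label values, and the assumption that malis.mknhood3d()
# provides a suitable 'nhood' are made up for illustration.
#
#   raw = np.random.randint(0, 256, size=(40, 40, 40)).astype(np.uint8)
#   seg = np.zeros((40, 40, 40), dtype=np.int32)
#   seg[5:20, 5:20, 5:20] = 1
#   seg[22:38, 22:38, 22:38] = 2
#   source = dict(name="toy", data=raw, components=seg,
#                 nhood=malis.mknhood3d(), image_scaling_factor=1.0 / 255)
#   input_slice = tuple(slice(0, 40) for _ in range(3))
#   output_slice = tuple(slice(8, 32) for _ in range(3))
#   minibatch = get_numpy_dataset(source, input_slice, output_slice,
#                                 transform=False)
#   # minibatch["data"] has shape (1, 40, 40, 40); get_numpy_dataset returns
#   # None when the chunk's mask fraction does not exceed mask_threshold.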