Example No. 1
    def start_refreshing_shared_dataset(
        self, shared_dataset_index, offset=None, dataset_index=None, transform=True, wait=False
    ):
        if offset is None or dataset_index is None:
            if self.make_dataset_offset is None:
                raise ValueError(
                    "Data loader wasn't given offset & which dataset to refresh, " "but can't make offsets itself."
                )
            dataset_index, offset = self.make_dataset_offset(self.datasets)
            message = "DataLoader decided to load dataset #{0} at offset {1}".format(dataset_index, offset)
            logger.debug(message)
        input_slice, output_slice = get_slices_from_dataset_offset(offset, self.input_shape, self.output_shape)
        dataset_metadata = dict(real=dataset_index, shared=shared_dataset_index, offset=offset)

        def pool_callback(return_value):
            # mark this shared slot as refreshed and ready for consumption
            self.ready_shared_datasets.append(dataset_metadata)

        kwargs_for_refresh = dict(
            index_of_shared=shared_dataset_index,
            index_of_which_dataset=dataset_index,
            input_slice=input_slice,
            output_slice=output_slice,
            transform=transform,
            make_dataset_offset=self.make_dataset_offset,
        )
        async_result = self.pool.apply_async(
            func=execute_function,
            kwds=dict(function=update_shared_dataset, function_kwargs=kwargs_for_refresh),
            callback=pool_callback,
        )
        if wait:
            final_result = async_result.get()
            if final_result is not None:
                print(final_result)  # probably an error
        return shared_dataset_index, async_result
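
This example submits execute_function to the worker pool and later prints final_result as "probably an error". The wrapper itself is not shown in these examples; the following is a hedged sketch of one common pattern it could follow, returning a formatted traceback instead of raising so that failures inside pool workers are not silently swallowed.

import traceback

def execute_function(function, function_kwargs=None):
    # Hypothetical wrapper, not the original implementation: run the requested
    # function and, on failure, return the traceback text so the parent process
    # can print it after async_result.get().
    function_kwargs = function_kwargs or dict()
    try:
        return function(**function_kwargs)
    except Exception:
        return traceback.format_exc()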
Example No. 2
def reopen_libdvid_voxelsaccessor_dataset(dataset):
    opened_dataset = dict(dataset)
    for key in dataset:
        dataset_value = dataset[key]
        if type(dataset_value) is VoxelsAccessor:
            hostname = dataset_value.hostname
            uuid = dataset_value.uuid
            data_name = dataset_value.data_name
            new_voxels_accessor = VoxelsAccessor(hostname, uuid, data_name)
            opened_dataset[key] = new_voxels_accessor
            logger.debug('opened {} at {} from {}'.format(data_name, uuid, hostname))
    yield opened_dataset
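
This function yields the reopened dict and is consumed later via "with reopen_dataset(original_dataset) as opened_dataset:" (Examples No. 10 and 11), which suggests it is wrapped as a context manager. A minimal usage sketch follows, assuming contextlib.contextmanager is applied and that dataset is a placeholder dict that may contain VoxelsAccessor values:

from contextlib import contextmanager

# Assumption: the generator above is exposed as a context manager.
reopen_as_context_manager = contextmanager(reopen_libdvid_voxelsaccessor_dataset)

with reopen_as_context_manager(dataset) as opened_dataset:
    # inside the block, every VoxelsAccessor value has been reconnected,
    # so it is safe to slice from this (worker) process
    pass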
Example No. 3
def reopen_libdvid_voxelsaccessor_dataset(dataset):
    opened_dataset = dict(dataset)
    for key in dataset:
        dataset_value = dataset[key]
        if type(dataset_value) is VoxelsAccessor:
            hostname = dataset_value.hostname
            uuid = dataset_value.uuid
            data_name = dataset_value.data_name
            new_voxels_accessor = VoxelsAccessor(hostname, uuid, data_name)
            opened_dataset[key] = new_voxels_accessor
            logger.debug('opened {} at {} from {}'.format(
                data_name, uuid, hostname))
    yield opened_dataset
Example No. 4
def reopen_h5py_dataset(dataset):
    opened_dataset = dict(dataset)
    for key in dataset:
        dataset_value = dataset[key]
        if type(dataset_value) is h5py.Dataset:
            h5_file_path = dataset_value.file.filename
            h5_dataset_key = dataset_value.name
            array_view = get_array_view_of_hdf5_dataset(h5_file_path, h5_dataset_key)
            opened_dataset[key] = array_view
            logger.debug('opened {} in {}'.format(h5_dataset_key, h5_file_path))
    yield opened_dataset
    for key in opened_dataset:
        if type(opened_dataset[key]) is h5py.Dataset:
            logger.debug('closing {} in {}'.format(opened_dataset[key].name, opened_dataset[key].file.filename))
            opened_dataset[key].file.close()
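
get_array_view_of_hdf5_dataset is not shown in these examples. Given that the teardown after the yield closes opened_dataset[key].file, a plausible sketch (an assumption, not the original code) is simply to reopen the file and return the named h5py.Dataset so each process holds its own handle:

import h5py

def get_array_view_of_hdf5_dataset(h5_file_path, h5_dataset_key):
    # Hypothetical helper: reopen the HDF5 file read-only and hand back the
    # dataset object; reopen_h5py_dataset later closes dataset.file.
    h5_file = h5py.File(h5_file_path, "r")
    return h5_file[h5_dataset_key]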
Example No. 5
    def start_refreshing_shared_dataset(self,
                                        shared_dataset_index,
                                        offset=None,
                                        dataset_index=None,
                                        transform=True,
                                        wait=False):
        if offset is None or dataset_index is None:
            if self.make_dataset_offset is None:
                raise ValueError(
                    "Data loader wasn't given offset & which dataset to refresh, "
                    "but can't make offsets itself.")
            dataset_index, offset = self.make_dataset_offset(self.datasets)
            message = "DataLoader decided to load dataset #{0} at offset {1}"\
                .format(dataset_index, offset)
            logger.debug(message)
        input_slice, output_slice = get_slices_from_dataset_offset(
            offset, self.input_shape, self.output_shape)
        dataset_metadata = dict(real=dataset_index,
                                shared=shared_dataset_index,
                                offset=offset,
                                transform=transform)
        self.refreshes_in_progress.append(dataset_metadata)

        def pool_callback(return_value):
            self.refreshes_in_progress.remove(dataset_metadata)
            self.ready_shared_datasets.append(dataset_metadata)

        kwargs_for_refresh = dict(
            index_of_shared=shared_dataset_index,
            index_of_which_dataset=dataset_index,
            input_slice=input_slice,
            output_slice=output_slice,
            transform=transform,
            make_dataset_offset=self.make_dataset_offset,
        )
        async_result = self.pool.apply_async(
            func=execute_function,
            kwds=dict(
                function=update_shared_dataset,
                function_kwargs=kwargs_for_refresh,
            ),
            callback=pool_callback)
        if wait:
            final_result = async_result.get()
            if final_result is not None:
                print(final_result)  # probably an error
        return shared_dataset_index, async_result
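
A minimal usage sketch for the method above (hypothetical, not from the original source): it assumes the loader was constructed with a dataset_offset_func so it can choose offsets itself, requests a refresh of shared slot 0, and then consumes the slot once the pool callback has announced it in ready_shared_datasets.

import time

# `loader` is assumed to be an instance of the class whose method is shown above.
shared_index, async_result = loader.start_refreshing_shared_dataset(0, wait=True)

# async_result.get() has already returned, but the callback may still be running,
# so poll briefly before consuming the refreshed slot.
while not loader.ready_shared_datasets:
    time.sleep(0.01)
metadata = loader.ready_shared_datasets.pop(0)
print("dataset #{real} is ready in shared slot {shared}".format(**metadata))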
Example No. 6
def reopen_h5py_dataset(dataset):
    opened_dataset = dict(dataset)
    for key in dataset:
        dataset_value = dataset[key]
        if type(dataset_value) is h5py.Dataset:
            h5_file_path = dataset_value.file.filename
            h5_dataset_key = dataset_value.name
            array_view = get_array_view_of_hdf5_dataset(
                h5_file_path, h5_dataset_key)
            opened_dataset[key] = array_view
            logger.debug('opened {} in {}'.format(h5_dataset_key,
                                                  h5_file_path))
    yield opened_dataset
    for key in opened_dataset:
        if type(opened_dataset[key]) is h5py.Dataset:
            logger.debug('closing {} in {}'.format(
                opened_dataset[key].name, opened_dataset[key].file.filename))
            opened_dataset[key].file.close()
Example No. 7
    def __init__(self, size, datasets, input_shape, output_shape=None, n_workers=1, dataset_offset_func=None):
        self.size = size
        self.datasets = datasets
        self.input_shape = input_shape
        self.outputs_are_ignored = output_shape is None
        self.output_shape = output_shape or (0, 0, 0)
        self.make_dataset_offset = dataset_offset_func
        self._list = list()
        self.shapes = {
            "data": (1,) + self.input_shape,
            "components": (1,) + self.output_shape,
            "label": (3,) + self.output_shape,
            "mask": (1,) + self.output_shape,
        }
        self.dtypes = {"data": np.float32, "components": np.int32, "label": np.int32, "mask": np.uint8}
        self.keys_to_ignore = []
        if self.outputs_are_ignored:
            self.keys_to_ignore = ["label", "components", "mask"]
            for output_key in self.keys_to_ignore:
                self.dtypes.pop(output_key)
                self.shapes.pop(output_key)
        sizes = dict()
        for key, shape in self.shapes.iteritems():
            sizes[key] = reduce(mul, shape)
        logger.debug("sizes: {}".format(sizes))
        self.shared_datasets = []
        for n in range(size):
            shared_dataset = dict()
            for key in self.dtypes:
                size = sizes[key]
                dtype = self.dtypes[key]
                ctype = type(np.ctypeslib.as_ctypes(dtype(0)))
                message = "creating {key}'s multiprocessing.Array with ctype {c} and size {s}".format(
                    key=key, c=ctype, s=size
                )
                logger.debug(message)
                shared_dataset[key] = multiprocessing.Array(ctype, size, lock=False)
            self.shared_datasets.append(shared_dataset)
        self.pool = multiprocessing.Pool(
            processes=n_workers, initializer=self._initialize_pool, initargs=(), maxtasksperchild=1000
        )
        self.ready_shared_datasets = []
        return
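
Each shared buffer created here is a flat multiprocessing.Array with lock=False, sized for one flattened minibatch array per key. A minimal sketch of how the parent process could view such a buffer as a NumPy array without copying (loader is assumed to be an instance built by the constructor above):

import numpy as np

# View shared slot 0's "data" buffer as a NumPy array of the original shape.
shared_slot = loader.shared_datasets[0]
data_view = np.frombuffer(shared_slot["data"], dtype=loader.dtypes["data"])
data_view = data_view.reshape(loader.shapes["data"])  # (1,) + input_shape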
Example No. 8
def simple_augment_minibatch(dataset_numpy):
    message = "before simple aug {}... \t{: <25}{}\t{: <25}{}\t{: <25}{}" \
        .format((0, 0, 0, 0),
                dataset_numpy["data"].shape, dataset_numpy["data"].mean(),
                dataset_numpy["components"].shape, dataset_numpy["components"].mean(),
                dataset_numpy["mask"].shape, dataset_numpy["mask"].mean())
    logger.debug(message)
    reflectx, reflecty, reflectz, swapxy = np.random.randint(low=0,
                                                             high=2,
                                                             size=4)
    dataset_numpy = reflect_and_swap_dataset(dataset_numpy, reflectx, reflecty,
                                             reflectz, swapxy)
    message = "after  simple aug {}... \t{: <25}{}\t{: <25}{}\t{: <25}{}" \
        .format((reflectx, reflecty, reflectz, swapxy),
                dataset_numpy["data"].shape, dataset_numpy["data"].mean(),
                dataset_numpy["components"].shape, dataset_numpy["components"].mean(),
                dataset_numpy["mask"].shape, dataset_numpy["mask"].mean())
    logger.debug(message)
    return dataset_numpy
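
reflect_and_swap_dataset is not shown in these examples. The sketch below is a hypothesis about its behavior, inferred from how it is called here and in Example No. 15 (the input dict carries data, components, mask and nhood, and the caller afterwards reads a label key): flip the three spatial axes according to the flags, optionally transpose x and y, and recompute affinities from the augmented components with malis, as Example No. 13 does.

import numpy as np
import malis

def reflect_and_swap_dataset(dataset_numpy, reflectx, reflecty, reflectz, swapxy):
    # Hypothetical sketch; arrays are assumed to be (channels, z, y, x).
    augmented = dict(dataset_numpy)
    for key in ("data", "components", "mask"):
        array = dataset_numpy[key]
        if reflectz:
            array = array[:, ::-1, :, :]
        if reflecty:
            array = array[:, :, ::-1, :]
        if reflectx:
            array = array[:, :, :, ::-1]
        if swapxy:
            array = array.transpose(0, 1, 3, 2)
        augmented[key] = np.ascontiguousarray(array)
    # recompute affinity labels from the augmented components, as in get_outputs
    spatial_components = augmented["components"].reshape(augmented["components"].shape[1:])
    augmented["label"] = malis.seg_to_affgraph(spatial_components, dataset_numpy["nhood"])
    return augmented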
Example No. 9
    def _initialize_shared_memory_arrays(self):
        shared_datasets = []
        sizes = dict()
        for key, shape in self.shapes.iteritems():
            sizes[key] = reduce(mul, shape)
        logger.debug("sizes: {}".format(sizes))
        for n in range(self.size):
            shared_dataset = dict()
            for key in self.dtypes:
                size = sizes[key]
                dtype = self.dtypes[key]
                ctype = type(np.ctypeslib.as_ctypes(dtype(0)))
                message = "creating {key}'s multiprocessing.Array with ctype {c} " \
                          "and size {s}".format(key=key, c=ctype, s=size)
                logger.debug(message)
                shared_dataset[key] = multiprocessing.Array(ctype, size, lock=False)
            shared_datasets.append(shared_dataset)
        return shared_datasets
Example No. 10
def update_shared_dataset(
    index_of_shared, index_of_which_dataset, input_slice, output_slice, transform=True, make_dataset_offset=None
):
    start_time = time.time()
    shared_dataset = shared_datasets[index_of_shared]
    dataset_is_ready = False
    while not dataset_is_ready:
        original_dataset = datasets[index_of_which_dataset]
        with reopen_dataset(original_dataset) as opened_dataset:
            dataset_numpy = get_numpy_dataset(opened_dataset, input_slice, output_slice, transform)
        if "mask" in dataset_numpy:
            mask_threshold = float(original_dataset.get("mask_threshold", 0))
            mask_fraction_of_this_batch = np.mean(dataset_numpy["mask"])
            if mask_fraction_of_this_batch <= mask_threshold:
                if make_dataset_offset is not None:
                    index_of_which_dataset, offset = make_dataset_offset(datasets)
                    input_shape = tuple([s.stop - s.start for s in input_slice])
                    if output_slice is not None:
                        output_shape = tuple([s.stop - s.start for s in output_slice])
                    else:
                        output_shape = None
                    input_slice, output_slice = get_slices_from_dataset_offset(offset, input_shape, output_shape)
                    message = "Skipping source dataset #{0} at output_slice {1} with mask {2}".format(
                        index_of_which_dataset, output_slice, "%06.4f" % mask_fraction_of_this_batch
                    )
                    logger.debug(message)
                else:
                    return (
                        "DataLoader worker encountered a 100% masked" "datachunk, but doesn't know how to replace it."
                    )
            else:
                message = "Using dataset #{0} at output_slice {1} with mask {2}".format(
                    index_of_which_dataset, output_slice, "%06.4f" % mask_fraction_of_this_batch
                )
                logger.debug(message)
                dataset_is_ready = True
        else:
            dataset_is_ready = True
    for key in shared_dataset:
        source_array = dataset_numpy[key].astype(dtypes[key])
        target_mp_array = shared_dataset[key]
        message = "storing dataset_numpy['{key}'] ({dt}, {shape})".format(
            key=key, dt=source_array.dtype, shape=source_array.shape
        )
        logger.debug(message)
        target_mp_array[:] = source_array.flatten()
    message = "Refreshing DataLoader dataset #{0} took {1}".format(
        index_of_shared, "%05.2fs" % (time.time() - start_time)
    )
    logger.debug(message)
    return
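
get_slices_from_dataset_offset is used in Examples No. 1, 5, 10 and 11 but never shown. The sketch below is one plausible convention only (an assumption, not the original code): the offset addresses the input window and the output window is centered inside it, the usual alignment for the paired input/output shapes of a fully convolutional network.

def get_slices_from_dataset_offset(offset, input_shape, output_shape=None):
    # Hypothetical sketch; the real alignment convention may differ.
    input_slice = tuple(slice(o, o + extent) for o, extent in zip(offset, input_shape))
    if output_shape is None:
        return input_slice, None
    # center the output window inside the input window
    margins = [(i - o) // 2 for i, o in zip(input_shape, output_shape)]
    output_slice = tuple(
        slice(o + m, o + m + extent)
        for o, m, extent in zip(offset, margins, output_shape))
    return input_slice, output_slice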
Example No. 11
def update_shared_dataset(index_of_shared,
                          index_of_which_dataset,
                          input_slice,
                          output_slice,
                          transform=True,
                          make_dataset_offset=None):
    start_time = time.time()
    shared_dataset = shared_datasets[index_of_shared]
    dataset_is_ready = False
    while not dataset_is_ready:
        original_dataset = datasets[index_of_which_dataset]
        with reopen_dataset(original_dataset) as opened_dataset:
            dataset_numpy = get_numpy_dataset(opened_dataset, input_slice,
                                              output_slice, transform)
        if dataset_numpy is None:
            message = "Skipping dataset #{0} at output_slice {1}"\
                .format(index_of_which_dataset, output_slice)
            logger.debug(message)
            if make_dataset_offset is not None:
                index_of_which_dataset, offset = make_dataset_offset(datasets)
                input_shape = tuple([s.stop - s.start for s in input_slice])
                if output_slice is not None:
                    output_shape = tuple(
                        [s.stop - s.start for s in output_slice])
                else:
                    output_shape = None
                input_slice, output_slice = get_slices_from_dataset_offset(
                    offset, input_shape, output_shape)
            else:
                return "DataLoader worker encountered a 100% masked" \
                       "datachunk, but doesn't know how to replace it."
        else:
            message = "Using dataset #{0} at output_slice {1}"\
                .format(index_of_which_dataset, output_slice)
            logger.debug(message)
            dataset_is_ready = True
    for key in shared_dataset:
        source_array = dataset_numpy[key].astype(dtypes[key])
        target_mp_array = shared_dataset[key]
        message = "storing dataset_numpy['{key}'] ({dt}, {shape})".format(
            key=key, dt=source_array.dtype, shape=source_array.shape)
        logger.debug(message)
        target_mp_array[:] = source_array.flatten()
    message = "Refreshing DataLoader dataset #{0} took {1}".format(
        index_of_shared, "%05.2fs" % (time.time() - start_time))
    logger.debug(message)
    return
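
Both versions of update_shared_dataset read module-level names (shared_datasets, datasets, dtypes) instead of taking them as arguments, which only works if the pool initializer registered in Example No. 7 (initializer=self._initialize_pool) publishes them as globals in every worker process. A minimal sketch of that wiring, hedged because the real _initialize_pool is not shown:

    # Hypothetical sketch of a method on the loader class; the real
    # _initialize_pool is not shown in these examples.
    def _initialize_pool(self):
        global shared_datasets, datasets, dtypes
        shared_datasets = self.shared_datasets
        datasets = self.datasets
        dtypes = self.dtypes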
Example No. 12
def get_numpy_dataset(original_dataset, input_slice, output_slice, transform):
    dataset_numpy = dict()
    n_spatial_dimensions = len(input_slice)
    image_slices = [slice(0, l) for l in original_dataset['data'].shape]
    image_slices[-n_spatial_dimensions:] = input_slice
    logger.debug("image_slices: {}".format(image_slices))
    source_image = get_zero_padded_array_slice(original_dataset['data'], image_slices)
    image = np.array(source_image, dtype=np.float32)
    image_scaling_factor = original_dataset.get('image_scaling_factor', None)
    if image_scaling_factor is None and source_image.dtype.kind in ('i', 'u'):  # integer, signed or unsigned
        image_scaling_factor = 0.5 ** 8
        message = """Data reader is scaling your image data by a factor of
                     1/256 because it's an integer data type and no scaling
                     factor was provided. If you don't like this default
                     behavior, then provide a dataset['image_scaling_factor']
                     key-value pair in your training dataset."""
        warnings.warn(message)
    if image_scaling_factor is not None:
        if image_scaling_factor == 1.0:
            # congratulations, you have successfully prevented data scaling
            pass
        else:
            logger.debug("Scaling image by {isf}".format(isf=image_scaling_factor))
            image = np.multiply(image, image_scaling_factor)
    if transform:
        if 'transform' in original_dataset:
            lo, hi = original_dataset['transform']['scale']
            image = 0.5 + (image - 0.5) * np.random.uniform(low=lo, high=hi)
            lo, hi = original_dataset['transform']['shift']
            image = image + np.random.uniform(low=lo, high=hi)
        else:
            logger.debug("source data doesn't have 'transform' attribute.")
    if image.ndim == n_spatial_dimensions:
        new_shape = (1,) + image.shape
        image = image.reshape(new_shape)
    dataset_numpy['data'] = image
    # load outputs if desired
    if output_slice is not None:
        component_erosion_steps = original_dataset.get('component_erosion_steps', 0)
        dilation_amount = 1 + component_erosion_steps
        dilated_output_slices = tuple([slice(s.start - dilation_amount, s.stop + dilation_amount, s.step) for s in output_slice])
        components, affinities, mask = get_outputs(original_dataset, dilated_output_slices)
        de_dilation_slices = (Ellipsis,) + tuple([slice(dilation_amount, -dilation_amount) for _ in output_slice])
        dataset_numpy['components'] = components[de_dilation_slices]
        dataset_numpy['label'] = affinities[de_dilation_slices]
        dataset_numpy['mask'] = mask[de_dilation_slices]
    return dataset_numpy
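
get_zero_padded_array_slice is used throughout but not shown. A hedged sketch of what such a helper typically does: copy the in-bounds part of each requested slice into a zero array of the requested shape, so windows that run past the edge of the volume come back zero-padded instead of raising.

import numpy as np

def get_zero_padded_array_slice(array, slices):
    # Hypothetical sketch, not the original implementation.
    requested_shape = tuple(s.stop - s.start for s in slices)
    result = np.zeros(requested_shape, dtype=array.dtype)
    source_slices = []
    target_slices = []
    for s, extent in zip(slices, array.shape):
        source_start = max(s.start, 0)
        source_stop = min(s.stop, extent)
        source_slices.append(slice(source_start, source_stop))
        target_slices.append(slice(source_start - s.start, source_stop - s.start))
    result[tuple(target_slices)] = array[tuple(source_slices)]
    return result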
Example No. 13
def get_outputs(original_dataset, output_slice):
    output_shape = tuple([slice_.stop - slice_.start for slice_ in output_slice])
    n_spatial_dimensions = len(output_slice)
    components_shape = (1,) + output_shape
    mask_shape = (1,) + output_shape
    affinities_shape = (n_spatial_dimensions,) + output_shape
    component_slices = [slice(0, l) for l in original_dataset['components'].shape]
    component_slices[-n_spatial_dimensions:] = output_slice
    logger.debug("component_slices: {}".format(component_slices))
    components_array = get_zero_padded_array_slice(original_dataset['components'], component_slices)
    source_class = type(original_dataset['components'])
    components_are_from_dvid = source_class in dvid_classes
    exclude_strings = original_dataset.get('body_names_to_exclude', [])
    if exclude_strings and components_are_from_dvid:
        dvid_uuid = original_dataset['components'].uuid
        components_to_keep = get_good_components(dvid_uuid, exclude_strings)
        logger.debug("components before: {}".format(list(np.unique(components_array))))
        components_array = replace_array_except_whitelist(components_array, 0, components_to_keep)
        logger.debug("components after: {}".format(list(np.unique(components_array))))
    minimum_component_size = original_dataset.get('minimum_component_size', 0)
    if minimum_component_size > 0:
        components_array = replace_infrequent_values(components_array, minimum_component_size, 0)
    component_erosion_steps = original_dataset.get('component_erosion_steps', 0)
    if component_erosion_steps > 0:
        components_array = erode_value_blobs(
            components_array,
            steps=component_erosion_steps,
            values_to_ignore=(0,))
    components_for_malis = components_array.reshape(output_shape)
    affinities_from_components = malis.seg_to_affgraph(
        components_for_malis,
        original_dataset['nhood'])
    components_array, _ = malis.connected_components_affgraph(
        affinities_from_components,
        original_dataset['nhood'])
    components_array = shift_up_component_values(components_array)
    components_array = components_array.reshape(components_shape)
    if 'label' in original_dataset:
        label_shape = original_dataset['label'].shape
        label_slices = [slice(0, l) for l in label_shape]
        label_slices[-n_spatial_dimensions:] = output_slice
        affinities_array = get_zero_padded_array_slice(original_dataset['label'], label_slices)
    else:
        # compute affinities from components
        logger.debug("Computing affinity labels from components because 'label' wasn't provided in data source.")
        affinities_array = affinities_from_components
    assert affinities_array.shape == affinities_shape, \
        "affinities_array.shape is {actual} but should be {desired}".format(
            actual=str(affinities_array.shape), desired=str(affinities_shape))
    if 'mask' in original_dataset:
        mask_array = get_zero_padded_array_slice(original_dataset['mask'], output_slice)
    else:
        if components_are_from_dvid:
            # infer mask values: 1 if component is nonzero, 0 otherwise
            mask_array = np.not_equal(components_array, 0)
            logger.debug("No mask provided. Setting to 1 where components != 0.")
        else:
            # assume no masking
            mask_array = np.ones_like(components_array, dtype=np.uint8)
            logger.debug("No mask provided. Setting to 1 where outputs exist.")
    mask_dilation_steps = original_dataset.get('mask_dilation_steps', 0)
    if mask_dilation_steps > 0:
        mask_array = ndimage.binary_dilation(mask_array, iterations=mask_dilation_steps)
    mask_array = mask_array.astype(np.uint8)
    mask_array = mask_array.reshape(mask_shape)
    return components_array, affinities_array, mask_array
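
replace_array_except_whitelist is referenced above but not shown. A short hedged sketch, assuming it replaces every component id that is not in the whitelist returned by get_good_components:

import numpy as np

def replace_array_except_whitelist(array, replacement, whitelist):
    # Hypothetical sketch: keep whitelisted values, overwrite everything else.
    keep_mask = np.in1d(array.ravel(), whitelist).reshape(array.shape)
    result = array.copy()
    result[~keep_mask] = replacement
    return result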
Example No. 14
def get_outputs(original_dataset, output_slice):
    output_shape = tuple(
        [slice_.stop - slice_.start for slice_ in output_slice])
    n_spatial_dimensions = len(output_slice)
    components_shape = (1, ) + output_shape
    mask_shape = (1, ) + output_shape
    affinities_shape = (n_spatial_dimensions, ) + output_shape
    component_slices = [
        slice(0, l) for l in original_dataset['components'].shape
    ]
    component_slices[-n_spatial_dimensions:] = output_slice
    logger.debug("component_slices: {}".format(component_slices))
    components_array = get_zero_padded_array_slice(
        original_dataset['components'], component_slices)
    source_class = type(original_dataset['components'])
    components_are_from_dvid = source_class in dvid_classes
    exclude_strings = original_dataset.get('body_names_to_exclude', [])
    if exclude_strings and components_are_from_dvid:
        dvid_uuid = original_dataset['components'].uuid
        components_to_keep = get_good_components(dvid_uuid, exclude_strings)
        logger.debug("components before: {}".format(
            list(np.unique(components_array))))
        components_array = replace_array_except_whitelist(
            components_array, 0, components_to_keep)
        logger.debug("components after: {}".format(
            list(np.unique(components_array))))
    minimum_component_size = original_dataset.get('minimum_component_size', 0)
    if minimum_component_size > 0:
        components_array = replace_infrequent_values(components_array,
                                                     minimum_component_size, 0)
    component_erosion_steps = original_dataset.get('component_erosion_steps',
                                                   0)
    if component_erosion_steps > 0:
        components_array = erode_value_blobs(components_array,
                                             steps=component_erosion_steps,
                                             values_to_ignore=(0, ))
    components_for_malis = components_array.reshape(output_shape)
    affinities_from_components = malis.seg_to_affgraph(
        components_for_malis, original_dataset['nhood'])
    components_array, _ = malis.connected_components_affgraph(
        affinities_from_components, original_dataset['nhood'])
    components_array = shift_up_component_values(components_array)
    components_array = components_array.reshape(components_shape)
    if 'label' in original_dataset:
        label_shape = original_dataset['label'].shape
        label_slices = [slice(0, l) for l in label_shape]
        label_slices[-n_spatial_dimensions:] = output_slice
        affinities_array = get_zero_padded_array_slice(
            original_dataset['label'], label_slices)
    else:
        # compute affinities from components
        logger.debug(
            "Computing affinity labels from components because 'label' wasn't provided in data source."
        )
        affinities_array = affinities_from_components
    assert affinities_array.shape == affinities_shape, \
        "affinities_array.shape is {actual} but should be {desired}".format(
            actual=str(affinities_array.shape), desired=str(affinities_shape))
    if 'mask' in original_dataset:
        mask_array = get_zero_padded_array_slice(original_dataset['mask'],
                                                 output_slice)
    else:
        if components_are_from_dvid:
            # infer mask values: 1 if component is nonzero, 0 otherwise
            mask_array = np.not_equal(components_array, 0)
            logger.debug(
                "No mask provided. Setting to 1 where components != 0.")
        else:
            # assume no masking
            mask_array = np.ones_like(components_array, dtype=np.uint8)
            logger.debug("No mask provided. Setting to 1 where outputs exist.")
    mask_dilation_steps = original_dataset.get('mask_dilation_steps', 0)
    if mask_dilation_steps > 0:
        mask_array = ndimage.binary_dilation(mask_array,
                                             iterations=mask_dilation_steps)
    mask_array = mask_array.astype(np.uint8)
    mask_array = mask_array.reshape(mask_shape)
    return components_array, affinities_array, mask_array
Example No. 15
def get_numpy_dataset(original_dataset, input_slice, output_slice, transform):
    dataset_numpy = dict()
    dataset_numpy["name"] = "{}_at_input_{}_and_output_{}".format(
        original_dataset.get("name", "Untitled"), input_slice, output_slice)
    n_spatial_dimensions = len(input_slice)
    image_slices = [slice(0, l) for l in original_dataset['data'].shape]
    image_slices[-n_spatial_dimensions:] = input_slice
    logger.debug("image_slices: {}".format(image_slices))
    image_is_zero_padded = original_dataset.get("image_is_zero_padded", False)
    if image_is_zero_padded:
        source_image = original_dataset["data"][image_slices]
    else:
        source_image = get_zero_padded_array_slice(original_dataset['data'],
                                                   image_slices)
    image = np.array(source_image, dtype=np.float32)
    image_scaling_factor = original_dataset.get('image_scaling_factor', None)
    if image_scaling_factor is None and source_image.dtype.kind in (
            'i', 'u'):  # integer, signed or unsigned
        image_scaling_factor = 0.5**8
        message = """Data reader is scaling your image data by a factor of
                     1/256 because it's an integer data type and no scaling
                     factor was provided. If you don't like this default
                     behavior, then provide a dataset['image_scaling_factor']
                     key-value pair in your training dataset."""
        warnings.warn(message)
    if image_scaling_factor is not None:
        if image_scaling_factor == 1.0:
            # congratulations, you have successfully prevented data scaling
            pass
        else:
            logger.debug(
                "Scaling image by {isf}".format(isf=image_scaling_factor))
            image = np.multiply(image, image_scaling_factor)
    if transform:
        if 'transform' in original_dataset:
            lo, hi = original_dataset['transform']['scale']
            image = 0.5 + (image - 0.5) * np.random.uniform(low=lo, high=hi)
            lo, hi = original_dataset['transform']['shift']
            image = image + np.random.uniform(low=lo, high=hi)
        else:
            logger.debug("source data doesn't have 'transform' attribute.")
    if image.ndim == n_spatial_dimensions:
        new_shape = (1, ) + image.shape
        image = image.reshape(new_shape)
    # load outputs if desired
    if output_slice is not None:
        component_erosion_steps = original_dataset.get(
            'component_erosion_steps', 0)
        dilation_amount = 1 + component_erosion_steps
        dilated_output_slices = tuple(
            slice(s.start - dilation_amount, s.stop + dilation_amount, s.step)
            for s in output_slice)
        de_dilation_slices = (Ellipsis, ) + tuple(
            slice(dilation_amount, -dilation_amount) for _ in output_slice)
        components, affinities, mask = get_outputs(original_dataset,
                                                   dilated_output_slices)
        mask_threshold = float(original_dataset.get('mask_threshold', 0))
        mask_fraction_of_this_batch = np.mean(mask[de_dilation_slices])
        good_enough = mask_fraction_of_this_batch > mask_threshold
        if not good_enough:
            return None
        simple_augment = original_dataset.get("simple_augment", False)
        if simple_augment:
            dataset_to_augment = dict(name=dataset_numpy["name"],
                                      data=image,
                                      components=components,
                                      mask=mask,
                                      nhood=original_dataset['nhood'])
            augmented_dilated_dataset = simple_augment_minibatch(
                dataset_to_augment)
            components = augmented_dilated_dataset["components"]
            affinities = augmented_dilated_dataset["label"]
            mask = augmented_dilated_dataset["mask"]
            image = augmented_dilated_dataset["data"]
        dataset_numpy['components'] = components[de_dilation_slices]
        dataset_numpy['label'] = affinities[de_dilation_slices]
        dataset_numpy['mask'] = mask[de_dilation_slices]
        dataset_numpy['nhood'] = original_dataset['nhood']
    dataset_numpy['data'] = image
    return dataset_numpy