class TiledVolume(object): """ Given a directory of image tiles that make up a volume, produces numpy array volumes for arbitrary roi requests. """ #: These fields describe the schema of the description file. #: See the source code comments for a description of each field. DescriptionFields = \ { "_schema_name" : "tiled-volume-description", "_schema_version" : 1.0, "name" : str, "format" : str, "dtype" : AutoEval(), "bounds_zyx" : AutoEval(numpy.array), # Maximum coordinates (+1) "view_origin_zyx" : AutoEval(numpy.array), # Optional offset for output 'view' "view_shape_zyx" : AutoEval(numpy.array), # Shape of the output 'view'. If not provided, defaults to bounds - origin "resolution_zyx" : AutoEval(numpy.array), "tile_shape_2d_yx" : AutoEval(numpy.array), "is_rgb" : bool, # Indicates that we must convert to grayscale "username" : str, "password" : str, # This doesn't change how the data is read from the server, # but instead specifies the indexing order of the numpy volumes produced. "output_axes" : str, "cache_tiles" : bool, # Offset not supported for now... #"origin_offset" : AutoEval(numpy.array), # For now we support 3D-only, sliced across Z (TODO: Support 5D?) # We allow multiple url schemes: tiles might be addressed via pixel coordinates or row/column indexing # (z_index and z_start are synonyms here -- either is allowed) # Example: pixel-wise tile names: # "tile_url_format" : "http://my.tiles.org/my_tiles/{z_start}-{z_stop}/{y_start}-{y_stop}/{x_start}-{x_stop}.jpg" # Example: row/column-wise tile names # "tile_url_format" : "http://my.tiles.org/my_tiles/{z_index}/{y_index}/{x_index}.jpg" # Also, local tile sources (filesystem, not http) are okay: # "tile_url_format" : "/my_hard_disk/my_tiles/{z_index}/{y_index}/{x_index}.jpg" "tile_url_format" : FormattedField( requiredFields=[], optionalFields=["x_start", "y_start", "z_start", "x_stop", "y_stop", "z_stop", "x_index", "y_index", "z_index", "raveler_z_base"] ), # Special keyword for Raveler session directories. See notes below. "invert_y_axis" : bool, # For raveler volumes, the y-axis coordinate is inverted. # A list of lists, mapping src slices to destination slices (for "filling in" missing slices) # Example If slices 101,102,103 are missing data, you might want to simply repeat the data from slice 100: # "extend_slices" : [ [100, [101, 102, 103]] ] "extend_slices" : list, # Some tiled volumes have complicated mappings from "real" or "global" coordinates to url/filepath coordinates. # This field will be eval()'d before the tile is retrieved # For example, if the slices were named according to their position in nanometers instead of pixels, this might do the trick: # "z_translation_function" : "lambda z: 40*z" "z_translation_function" : str, # Optional data transform. For example: # "data_transform_function" : "lambda a: a == 0", "data_transform_function" : str } DescriptionSchema = JsonConfigParser(DescriptionFields) @classmethod def readDescription(cls, descriptionFilePath): # Read file description = TiledVolume.DescriptionSchema.parseConfigFile( descriptionFilePath) cls.updateDescription(description) return description @classmethod def updateDescription(cls, description): """ Some description fields are optional. If they aren't provided in the description JSON file, then this function provides them with default values, based on the other description fields. """ # Augment with default parameters. 
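        # For reference, a description file for this schema might look roughly like the
        # following before the defaults below are filled in.  This is an illustrative sketch
        # only -- the name, URL, bounds, and tile shape are hypothetical:
        #
        # {
        #     "_schema_name" : "tiled-volume-description",
        #     "_schema_version" : 1.0,
        #     "name" : "example_tiles",
        #     "format" : "jpg",
        #     "dtype" : "numpy.uint8",
        #     "bounds_zyx" : "[512, 4096, 4096]",
        #     "resolution_zyx" : "[40.0, 4.0, 4.0]",
        #     "tile_shape_2d_yx" : "[512, 512]",
        #     "is_rgb" : false,
        #     "tile_url_format" : "http://tiles.example.org/stack/{z_index}/{y_index}/{x_index}.jpg",
        #     "extend_slices" : [ [100, [101, 102]] ]
        # }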
logger.debug(str(description)) if description.view_origin_zyx is None: description.view_origin_zyx = numpy.array( [0] * len(description.bounds_zyx)) if description.view_shape_zyx is None: description.view_shape_zyx = description.bounds_zyx - description.view_origin_zyx if not description.output_axes: description.output_axes = "zyx" assert description.output_axes is None or set(description.output_axes) == set("zyx"), \ "Axis order must include x,y,z (and nothing else)" if not description.extend_slices: description.extend_slices = [] if description.cache_tiles is None: description.cache_tiles = False def __init__(self, descriptionFilePath): self.description = TiledVolume.readDescription(descriptionFilePath) self._session = None assert self.description.format in vigra.impex.listExtensions().split(), \ "Unknown tile format: {}".format( self.description.format ) assert self.description.tile_shape_2d_yx.shape == (2, ) assert self.description.bounds_zyx.shape == (3, ) assert self.description.view_shape_zyx.shape == (3, ) shape_dict = dict(zip('zyx', self.description.view_shape_zyx)) self.output_shape = tuple(shape_dict[k] for k in self.description.output_axes) self._slice_remapping = {} for source, destinations in self.description.extend_slices: for dest in destinations: self._slice_remapping[dest] = source def close(self): if self._session: self._session.close() def read(self, view_roi, result_out): """ roi: (start, stop) tuples, ordered according to description.output_axes roi should be relative to the view """ output_axes = self.description.output_axes roi_transposed = zip(*view_roi) roi_dict = dict(zip(output_axes, roi_transposed)) view_roi = zip(*(roi_dict['z'], roi_dict['y'], roi_dict['x'])) # First, normalize roi and result to zyx order result_out = vigra.taggedView(result_out, output_axes) result_out = result_out.withAxes(*'zyx') assert numpy.array(view_roi).shape == ( 2, 3), "Invalid roi for 3D volume: {}".format(view_roi) view_roi = numpy.array(view_roi) assert (result_out.shape == (view_roi[1] - view_roi[0])).all() # User gave roi according to the view output. # Now offset it find global roi. 
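        # Worked example (hypothetical values): with view_origin_zyx == [0, 1024, 1024],
        # a view roi of ([0, 0, 0], [1, 100, 100]) becomes the global roi
        # ([0, 1024, 1024], [1, 1124, 1124]).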
        roi = view_roi + self.description.view_origin_zyx

        tile_blockshape = (1, ) + tuple(self.description.tile_shape_2d_yx)
        tile_starts = getIntersectingBlocks(tile_blockshape, roi)

        pool = RequestPool()
        for tile_start in tile_starts:
            tile_roi_in = getBlockBounds(self.description.bounds_zyx, tile_blockshape, tile_start)
            tile_roi_in = numpy.array(tile_roi_in)

            # This tile's portion of the roi
            intersecting_roi = getIntersection(roi, tile_roi_in)
            intersecting_roi = numpy.array(intersecting_roi)

            # Compute slicing within destination array and slicing within this tile
            destination_relative_intersection = numpy.subtract(intersecting_roi, roi[0])
            tile_relative_intersection = intersecting_roi - tile_roi_in[0]

            # Get a view to the output slice
            result_region = result_out[roiToSlice(*destination_relative_intersection)]

            rest_args = self._get_rest_args(tile_blockshape, tile_roi_in)

            if self.description.tile_url_format.startswith('http'):
                retrieval_fn = partial(self._retrieve_remote_tile, rest_args, tile_relative_intersection, result_region)
            else:
                retrieval_fn = partial(self._retrieve_local_tile, rest_args, tile_relative_intersection, result_region)

            PARALLEL_REQ = True
            if PARALLEL_REQ:
                pool.add(Request(retrieval_fn))
            else:
                # execute serially (leave the pool empty)
                retrieval_fn()

        if PARALLEL_REQ:
            with Timer() as timer:
                pool.wait()
            logger.info("Loading {} tiles took a total of {}".format(len(tile_starts), timer.seconds()))

    def _get_rest_args(self, tile_blockshape, tile_roi_in):
        """
        For a single tile, return a dict of all possible parameters that can be substituted
        into the tile_url_format string from the volume json description file.

        tile_blockshape: The 3D blockshape of the tile
                         (since tiles are only 1 slice thick, the blockshape always begins with 1).
        tile_roi_in: The ROI within the total volume for a particular tile.
                     (Note that the size of the ROI is usually, but not always, the same as tile_blockshape.
                     Near the volume borders, the tile_roi_in may be smaller.)
        """
        assert sys.version_info.major == 2, "Alert! This function has not been tested "\
            "under python 3. Please remove this assertion and be wary of any strange behavior you encounter"

        # Special feature:
        # Some slices are missing, in which case we provide fake data from a different slice.
        # Overwrite the rest args to pull data from an alternate source tile.
        z_start = tile_roi_in[0][0]
        if z_start in self._slice_remapping:
            new_source_slice = self._slice_remapping[z_start]
            tile_roi_in[0][0] = new_source_slice
            tile_roi_in[1][0] = new_source_slice + 1

        tile_index = numpy.array(tile_roi_in[0]) // tile_blockshape
        rest_args = {
            'z_start': tile_roi_in[0][0],
            'z_stop': tile_roi_in[1][0],
            'y_start': tile_roi_in[0][1],
            'y_stop': tile_roi_in[1][1],
            'x_start': tile_roi_in[0][2],
            'x_stop': tile_roi_in[1][2],
            'z_index': tile_index[0],
            'y_index': tile_index[1],
            'x_index': tile_index[2]
        }

        # Apply special z_translation_function
        if self.description.z_translation_function is not None:
            z_update_func = eval(self.description.z_translation_function)
            rest_args['z_index'] = rest_args['z_start'] = z_update_func(rest_args['z_index'])
            rest_args['z_stop'] = 1 + rest_args['z_start']

        # Quick sanity check
        assert rest_args['z_index'] == rest_args['z_start']

        # Special arg for Raveler session directories:
        # For files with Z < 1000, there is no extra directory level.
        # For files with Z >= 1000, there is an extra directory level,
        # in which case the extra '/' is INCLUDED here in the rest arg.
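        # Worked example (hypothetical values): z_index 1234 yields raveler_z_base == "1000/",
        # while z_index 42 yields raveler_z_base == "" (no extra directory level).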
raveler_z_base = (rest_args['z_index'] // 1000) * 1000 if raveler_z_base == 0: rest_args['raveler_z_base'] = "" else: rest_args['raveler_z_base'] = str(raveler_z_base) + '/' return rest_args def _retrieve_local_tile(self, rest_args, tile_relative_intersection, data_out): tile_path = self.description.tile_url_format.format(**rest_args) logger.debug("Opening {}".format(tile_path)) if not os.path.exists(tile_path): logger.error("Tile does not exist: {}".format(tile_path)) data_out[...] = 0 return # Read the image from the disk with vigra img = vigra.impex.readImage(tile_path, dtype='NATIVE') assert img.ndim == 3 if self.description.is_rgb: # "Convert" to grayscale -- just take first channel. img = img[..., 0:1] assert img.shape[-1] == 1, "Image has more channels than expected. "\ "If it is RGB, be sure to set the is_rgb flag in your description json." # img has axes xyc, but we want zyx img = img.transpose()[None, 0, :, :] if self.description.invert_y_axis: # More special Raveler support: # Raveler's conventions for the Y-axis are the reverse for everyone else's. img = img[:, ::-1, :] # Copy just the part we need into the destination array assert img[roiToSlice( *tile_relative_intersection)].shape == data_out.shape data_out[:] = img[roiToSlice(*tile_relative_intersection)] # If there's a special transform, apply it now. if self.description.data_transform_function is not None: transform = eval(self.description.data_transform_function) data_out[:] = transform(data_out) # For late imports requests = None PIL = None TEST_MODE = False # For testing purposes only. See below. def _retrieve_remote_tile(self, rest_args, tile_relative_intersection, data_out): # Late import if not TiledVolume.requests: import requests TiledVolume.requests = requests requests = TiledVolume.requests tile_url = self.description.tile_url_format.format(**rest_args) logger.debug("Retrieving {}".format(tile_url)) try: if self._session is None: self._session = self._create_session() # Provide authentication if we have the details. if self.description.username and self.description.password: self._session.auth = (self.description.username, self.description.password) success = False tries = 0 while not success: try: # Note: We give timeout as a tuple, which requires a recent version of requests. # If you get an exception about that, upgrade your requests module. r = self._session.get(tile_url, timeout=(3.0, 20.0)) success = True except requests.ConnectionError: # This special 'pass' is here because we keep running into exceptions like this: # ConnectionError: HTTPConnectionPool(host='neurocean.int.janelia.org', port=6081): # Max retries exceeded with url: /ssd-3-tiles/abd1.5/43/24_25_0.jpg # (Caused by <class 'httplib.BadStatusLine'>: '') # So now we loop a few times and only give up if something is really wrong. if tries == 5: raise # give up tries += 1 except: # During testing, the server we're pulling from might be in our own process. # Apparently that means that it is not very responsive, leading to exceptions. # As a cheap workaround, just try one more time. if self.TEST_MODE: import time time.sleep(0.01) r = self._session.get(tile_url, timeout=(3.0, 20.0)) else: raise if r.status_code == requests.codes.not_found: logger.warn("NOTFOUND: {}".format(tile_url)) data_out[:] = 0 else: # late import if not TiledVolume.PIL: import PIL import PIL.Image TiledVolume.PIL = PIL PIL = TiledVolume.PIL img = numpy.asarray(PIL.Image.open(StringIO(r.content))) if self.description.is_rgb: # "Convert" to grayscale -- just take first channel. 
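                    # (A hypothetical 512x512 RGB tile arrives from PIL as an array of shape
                    # (512, 512, 3); slicing out channel 0 below leaves a 2D grayscale array.)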
assert img.ndim == 3 img = img[..., 0] assert img.ndim == 2, "Image seems to be of the wrong dimension. "\ "If it is RGB, be sure to set the is_rgb flag in your description json." # img has axes xy, but we want zyx img = img[None] if self.description.invert_y_axis: # More special Raveler support: # Raveler's conventions for the Y-axis are the reverse for everyone else's. img = img[:, ::-1, :] # Copy just the part we need into the destination array assert img[roiToSlice( *tile_relative_intersection)].shape == data_out.shape data_out[:] = img[roiToSlice(*tile_relative_intersection)] # If there's a special transform, apply it now. if self.description.data_transform_function is not None: transform = eval(self.description.data_transform_function) data_out[:] = transform(data_out) @classmethod def _create_session(cls): """ Generate a requests.Session object to use for this TiledVolume. Using a session allows us to benefit from a connection pool instead of establishing a new connection for every request. """ # Late import if not TiledVolume.requests: import requests TiledVolume.requests = requests requests = TiledVolume.requests session = requests.Session() # Replace the session http adapters with ones that use larger connection pools n_threads = max(1, Request.global_thread_pool.num_workers) adapter = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads) adapter2 = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads) session.mount('http://', adapter) session.mount('https://', adapter2) return session
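# Illustrative usage sketch (not part of the original module).  The description file path
# and roi below are hypothetical; they assume a volume whose output_axes is the default "zyx".
def _example_read_tiled_volume(description_path="/tmp/example_tiles/volume_description.json"):
    """
    Read a small sub-volume (one z-slice, 256x256 pixels) from a TiledVolume.
    """
    tiled_volume = TiledVolume(description_path)
    roi_start = (0, 0, 0)
    roi_stop = (1, 256, 256)
    # The output buffer must match the roi shape and be indexed in output_axes order.
    result = numpy.zeros(numpy.subtract(roi_stop, roi_start), dtype=tiled_volume.description.dtype)
    tiled_volume.read((roi_start, roi_stop), result)
    tiled_volume.close()
    return result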
class RESTfulVolume(object): """ This class provides access to data obtained via a RESTful API (e.g. from http://openconnecto.me). A description of the remote volume must be provided via a JSON file, whose schema is specified by :py:data:`RESTfulVolume.DescriptionFields`. See the unit tests in ``tests/testRESTfulVolume.py`` for example usage. .. note:: This class does not keep track of the data you've already downloaded. Every call to :py:func:`downloadSubVolume()` results in a new download. For automatic blockwise local caching of remote datasets, see :py:class:`RESTfulBlockwiseFileset`. .. note:: See the unit tests in ``tests/testRESTfulVolume.py`` for example usage. """ #: These fields describe the schema of the description file. #: See the source code comments for a description of each field. DescriptionFields = { "_schema_name": "RESTful-volume-description", "_schema_version": 1.0, "name": str, "format": str, "axes": str, "dtype": AutoEval(), "bounds": AutoEval(numpy.array), "shape": AutoEval(numpy.array ), # Provided for you. Computed as bounds - origin_offset "origin_offset": AutoEval(numpy.array), "url_format": FormattedField( requiredFields=[ "x_start", "x_stop", "y_start", "y_stop", "z_start", "z_stop" ], optionalFields=["t_start", "t_stop", "c_start", "c_stop"], ), "hdf5_dataset": str, } DescriptionSchema = JsonConfigParser(DescriptionFields) @classmethod def readDescription(cls, descriptionFilePath): """ Parse the description file at the given path and return a :py:class:`jsonConfig.Namespace` object with the description parameters. The file will be parsed according to the schema given by :py:data:`RESTfulVolume.DescriptionFields`. Any optional parameters not provided by the user are filled in automatically. :param descriptionFilePath: The path to the description file to parse. """ # Read file description = RESTfulVolume.DescriptionSchema.parseConfigFile( descriptionFilePath) cls.updateDescription(description) return description @classmethod def updateDescription(cls, description): """ Some description fields are optional. If they aren't provided in the description JSON file, then this function provides them with default values, based on the other description fields. """ # Augment with default parameters. logger.debug(str(description)) if description.origin_offset is None: description.origin_offset = numpy.array([0] * len(description.bounds)) description.shape = description.bounds - description.origin_offset @classmethod def writeDescription(cls, descriptionFilePath, descriptionFields): """ Write a :py:class:`jsonConfig.Namespace` object to the given path. :param descriptionFilePath: The path to overwrite with the description fields. :param descriptionFields: The fields to write. """ RESTfulVolume.DescriptionSchema.writeConfigFile( descriptionFilePath, descriptionFields) def __init__(self, descriptionFilePath=None, preparsedDescription=None): """ Constructor. Uses `readDescription` interally. :param descriptionFilePath: The path to the .json file that describes the remote volume. :param preparsedDescription: (Optional) Provide pre-parsed description fields, in which case the provided description file will not be parsed. """ if preparsedDescription is not None: assert descriptionFilePath is None, "Can't provide BOTH description file and pre-parsed description fields." 
            self.description = preparsedDescription
        else:
            assert (
                descriptionFilePath is not None
            ), "Must provide either a description file or pre-parsed description fields"
            self.description = RESTfulVolume.readDescription(descriptionFilePath)

        # Check for errors
        assert False not in [
            a in "txyzc" for a in self.description.axes
        ], "Unknown axis type. Known axes: txyzc. Your axes: {}".format(self.description.axes)
        assert self.description.format == "hdf5", "Only hdf5 RESTful volumes are supported so far."
        assert (
            self.description.hdf5_dataset is not None
        ), "RESTful volume description file must specify the hdf5_dataset name"

        if self.description.hdf5_dataset[0] != "/":
            self.description.hdf5_dataset = "/" + self.description.hdf5_dataset

    def downloadSubVolume(self, roi, outputDatasetPath):
        """
        Download a cutout volume from the remote dataset.

        :param roi: The subset of the volume to download, specified as a tuple of coordinates: ``(start, stop)``
        :param outputDatasetPath: The path to overwrite with the downloaded hdf5 file.
        """
        origin_offset = numpy.array(self.description.origin_offset)
        accessStart = numpy.array(roi[0])
        accessStart += origin_offset
        accessStop = numpy.array(roi[1])
        accessStop += origin_offset

        RESTArgs = {}
        for axisindex, axiskey in enumerate(self.description.axes):
            startKey = "{}_start".format(axiskey)
            stopKey = "{}_stop".format(axiskey)
            RESTArgs[startKey] = accessStart[axisindex]
            RESTArgs[stopKey] = accessStop[axisindex]

        # Download the ROI specified in the url to an HDF5 file
        url = self.description.url_format.format(**RESTArgs)
        logger.info("Opening url for region {}..{}: {}".format(roi[0], roi[1], url))

        pathComponents = PathComponents(outputDatasetPath)
        if pathComponents.internalPath != self.description.hdf5_dataset:
            # We could just open the file and rename the dataset to match what the user asked for,
            # but that would probably be slow.  It's better to require the correct dataset name to begin with.
            raise RuntimeError(
                "The RESTful volume format uses internal dataset name '{}', but you seem to be expecting '{}'."
                .format(self.description.hdf5_dataset, pathComponents.internalPath))

        logger.info("Downloading RESTful subvolume to file: {}".format(pathComponents.externalPath))
        urllib.request.urlretrieve(url, pathComponents.externalPath)
        logger.info("Finished downloading file: {}".format(pathComponents.externalPath))
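# For reference, a RESTful-volume description file that satisfies the schema above might look
# roughly like the following.  This is an illustrative sketch only -- the server URL, bounds,
# and dataset name are made up and do not correspond to a real endpoint:
#
# {
#     "_schema_name" : "RESTful-volume-description",
#     "_schema_version" : 1.0,
#     "name" : "example-volume",
#     "format" : "hdf5",
#     "axes" : "zyx",
#     "dtype" : "numpy.uint8",
#     "bounds" : "[1000, 2048, 2048]",
#     "origin_offset" : "[0, 0, 0]",
#     "url_format" : "http://example.org/cutout/hdf5/{z_start},{z_stop}/{y_start},{y_stop}/{x_start},{x_stop}/",
#     "hdf5_dataset" : "cube"
# }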
class BlockwiseFileset(object): """ This class handles writing and reading a 'blockwise file set'. A 'blockwise file set' is a directory with a particular structure, which contains the entire dataset broken up into blocks. Important parameters (e.g. shape, dtype, blockshape) are specified in a JSON file, which must match the schema given by :py:data:`BlockwiseFileset.DescriptionFields`. The parent directory of the description file is considered to be the top-most directory in the blockwise dataset hierarchy. - Simultaneous reads are threadsafe. - NOT threadsafe for reading and writing simultaneously (or writing and writing). - NOT threadsafe for closing. Do not call close() while reading or writing. .. note:: See the unit tests in ``tests/testBlockwiseFileset.py`` for example usage. """ #: These fields describe the schema of the description file. #: See the source code comments for a description of each field. DescriptionFields = { "_schema_name": "blockwise-fileset-description", "_schema_version": 1.1, "name": str, "format": str, "axes": str, "shape": AutoEval(numpy.array), # This is the shape of the dataset on disk "dtype": AutoEval(), "drange": AutoEval(tuple), # Optional. Data range, e.g. (0.0, 1.0) "chunks": AutoEval(numpy.array), # Optional. If null, no chunking. Only used when writing data. "compression": str, # Optional. Options include 'lzf' and 'gzip', among others. Note: h5py automatically enables chunking on compressed datasets. "compression_opts": AutoEval(int), # Optional. Hdf5-specific "block_shape": AutoEval(numpy.array), "view_origin": AutoEval( numpy.array ), # Optional. Defaults to zeros. All requests will be translated before the data is accessed. # For example, if the offset is [100, 200, 300], then a request for roi([0,0,0],[2,2,2]) # will pull from the dataset on disk as though the request was ([100,200,300],[102,202,302]). # It is an error to specify an view_origin that is not a multiple of the block_shape. "view_shape": AutoEval( numpy.array ), # Optional. Defaults to (shape - view_origin) Limits the shape of the provided data. "block_file_name_format": FormattedField( requiredFields=["roiString"] ), # For hdf5, include dataset name, e.g. myfile_block{roiString}.h5/volume/data "dataset_root_dir": str, # Abs path or relative to the description file itself. Defaults to "." if left blank. "hash_id": str, # Not user-defined (clients may use this) # Added in schema v1.1 "sub_block_shape": AutoEval(numpy.array), # Optional. Must divide evenly into the block shape. } DescriptionSchema = JsonConfigParser(DescriptionFields) @classmethod def readDescription(cls, descriptionFilePath): """ Parse the description file at the given path and return a :py:class:`jsonConfig.Namespace` object with the description parameters. The file will be parsed according to the schema given by :py:data:`BlockwiseFileset.DescriptionFields`. :param descriptionFilePath: The path to the description file to parse. """ return BlockwiseFileset.DescriptionSchema.parseConfigFile(descriptionFilePath) @classmethod def writeDescription(cls, descriptionFilePath, descriptionFields): """ Write a :py:class:`jsonConfig.Namespace` object to the given path. :param descriptionFilePath: The path to overwrite with the description fields. :param descriptionFields: The fields to write. """ BlockwiseFileset.DescriptionSchema.writeConfigFile(descriptionFilePath, descriptionFields) class BlockNotReadyError(Exception): """ This exception is raised if `readData()` is called for data that isn't available on disk. 
""" def __init__(self, block_start): self.block_start = block_start @property def description(self): """ The :py:class:`jsonConfig.Namespace` object that describes this dataset. """ return self._description @classmethod def _createAndReturnBlockwiseFileset(self, descriptionFilePath, mode): try: bfs = BlockwiseFileset(descriptionFilePath, mode) except JsonConfigParser.SchemaError: bfs = None return bfs @classmethod def _prepare_system(cls): # None of this code is tested on Windows. # It might work, but you'll need to improve the unit tests to know for sure. assert ( platform.system() != "Windows" ), "This code is all untested on Windows, and probably needs some modification before it will work." # If you get a "Too many open files" error, this soft limit may need to be increased. # The way to set this limit in bash is via "ulimit -n 4096" # Fortunately, Python lets us increase the limit via the resource module. import resource softlimit, hardlimit = resource.getrlimit(resource.RLIMIT_NOFILE) softlimit = max(4096, softlimit) resource.setrlimit(resource.RLIMIT_NOFILE, (softlimit, hardlimit)) def __init__(self, descriptionFilePath, mode="r", preparsedDescription=None): """ Constructor. Uses `readDescription` interally. :param descriptionFilePath: The path to the .json file that describes the dataset. :param mode: Set to ``'r'`` if the fileset should be read-only. :param preparsedDescription: (Optional) Provide pre-parsed description fields, in which case the provided description file will not be parsed. """ self._prepare_system() assert mode == "r" or mode == "a", "Valid modes are 'r' or 'a', not '{}'".format(mode) self.mode = mode assert ( descriptionFilePath is not None ), "Must provide a path to the description file, even if you are providing pre-parsed fields. (Path is used to find block directory)." self._descriptionFilePath = descriptionFilePath if preparsedDescription is not None: self._description = preparsedDescription else: self._description = BlockwiseFileset.readDescription(descriptionFilePath) # Check for errors assert self._description.format == "hdf5", "Only hdf5 blockwise filesets are supported so far." if self._description.compression_opts is not None: assert ( self._description.compression is not None ), "You specified compression_opts={} without specifying a compression type".format( self._description.compression ) drange = self._description.drange if drange is not None: assert len(drange) == 2, "Invalid drange: {}".format(drange) assert drange[0] <= drange[1], "Invalid drange: {}".format(drange) sub_block_shape = self._description.sub_block_shape if sub_block_shape is not None: block_shape = self._description.block_shape block_shape_mods = numpy.mod(block_shape, sub_block_shape) != 0 nonfull_block_shape_dims = block_shape != self._description.view_shape invalid_sub_block_dims = numpy.logical_and(nonfull_block_shape_dims, block_shape_mods) assert (invalid_sub_block_dims == False).all(), ( "Each dimension of sub_block_shape must divide evenly into block_shape," " unless the total dataset is only one block wide in that dimension." ) # default view_origin if self._description.view_origin is None: self._description.view_origin = numpy.array((0,) * len(self._description.shape)) assert ( numpy.mod(self._description.view_origin, self._description.block_shape) == 0 ).all(), "view_origin is not compatible with block_shape. Must be a multiple!" 
# default view_shape if self._description.view_shape is None: self._description.view_shape = numpy.subtract(self._description.shape, self._description.view_origin) view_roi = ( self._description.view_origin, numpy.add(self._description.view_origin, self._description.view_shape), ) assert ( numpy.subtract(self._description.shape, view_roi[1]) >= 0 ).all(), "View ROI must not exceed on-disk shape: View roi: {}, on-disk shape: {}".format( view_roi, self._description.shape ) if self._description.dataset_root_dir is None: # Default to same directory as the description file self._description.dataset_root_dir = "." self._lock = threading.Lock() self._openBlockFiles = {} self._fileLocks = {} self._closed = False def __del__(self): if hasattr(self, "_closed") and not self._closed: self.close() def __enter__(self): return self def __exit__(self, *args): self.close() def close(self): """ Close all open block files. """ with self._lock: assert not self._closed paths = list(self._openBlockFiles.keys()) for path in paths: blockFile = self._openBlockFiles[path] blockFile.close() if self.mode == "a": fileLock = self._fileLocks[path] fileLock.release() self._openBlockFiles = {} self._fileLocks = {} self._closed = True def reopen(self, mode): assert self._closed, "Can't reopen a fileset that isn't closed." self.mode = mode self._closed = False def readData(self, roi, out_array=None): """ Read data from the fileset. :param roi: The region of interest to read from the dataset. Must be a tuple of iterables: (start, stop). :param out_array: The location to store the read data. Must be the correct size for the given roi. If not provided, an array is created for you. :returns: The requested data. If out_array was provided, returns out_array. """ if out_array is None: out_array = numpy.ndarray(shape=numpy.subtract(roi[1], roi[0]), dtype=self._description.dtype) roi_shape = numpy.subtract(roi[1], roi[0]) assert (roi_shape == out_array.shape).all(), "out_array must match roi shape" assert (roi_shape != 0).all(), "Requested roi {} has zero volume!".format(roi) self._transferData(roi, out_array, read=True) return out_array def writeData(self, roi, data): """ Write data to the fileset. :param roi: The region of interest to write the data to. Must be a tuple of iterables: (start, stop). :param data: The data to write. Must be the correct size for the given roi. """ assert self.mode != "r" assert (numpy.subtract(roi[1], roi[0]) != 0).all(), "Requested roi {} has zero volume!".format(roi) self._transferData(roi, data, read=False) def getDatasetDirectory(self, blockstart): """ Return the directory that contains the block that starts at the given coordinates. """ # Add the view origin to find the on-disk block coordinates blockstart = numpy.add(blockstart, self._description.view_origin) descriptionFileDir = os.path.split(self._descriptionFilePath)[0] absPath, _ = getPathVariants(self._description.dataset_root_dir, descriptionFileDir) blockFilePath = absPath for axis, start in zip(self._description.axes, blockstart): blockFilePath = os.path.join(blockFilePath, "{}_{:08d}".format(axis, start)) return blockFilePath def _getBlockFileName(self, block_start): """ Get the path to the block file that starts at the given coordinate. """ # Translate to find disk block start block_start = numpy.add(self._description.view_origin, block_start) # Get true (disk) block bounds (i.e. 
use on-disk shape, not view_shape) entire_block_roi = getBlockBounds(self._description.shape, self._description.block_shape, block_start) roiString = "{}".format((list(entire_block_roi[0]), list(entire_block_roi[1]))) datasetFilename = self._description.block_file_name_format.format(roiString=roiString) return datasetFilename def getDatasetPathComponents(self, block_start): """ Return a PathComponents object for the block file that corresponds to the given block start coordinate. """ datasetFilename = self._getBlockFileName(block_start) datasetDir = self.getDatasetDirectory(block_start) datasetPath = os.path.join(datasetDir, datasetFilename) return PathComponents(datasetPath) BLOCK_NOT_AVAILABLE = 0 BLOCK_AVAILABLE = 1 def getBlockStatus(self, blockstart): """ Check a block's status. (Just because a block file exists doesn't mean that it has valid data.) Returns a status code of either ``BlockwiseFileset.BLOCK_AVAILABLE`` or ``BlockwiseFileset.BLOCK_NOT_AVAILABLE``. """ blockDir = self.getDatasetDirectory(blockstart) statusFilePath = os.path.join(blockDir, "STATUS.txt") if not os.path.exists(statusFilePath): return BlockwiseFileset.BLOCK_NOT_AVAILABLE else: return BlockwiseFileset.BLOCK_AVAILABLE def isBlockLocked(self, blockstart): """ Return True if the block is locked for writing. Note that both 'available' and 'not available' blocks might be locked. """ datasetPathComponents = self.getDatasetPathComponents(blockstart) hdf5FilePath = datasetPathComponents.externalPath testLock = FileLock(hdf5FilePath) return not testLock.available() def setBlockStatus(self, blockstart, status): """ Set a block status on disk. We use a simple convention: If the status file exists, the block is available. Otherwise, it ain't. :param status: Must be either ``BlockwiseFileset.BLOCK_AVAILABLE`` or ``BlockwiseFileset.BLOCK_NOT_AVAILABLE``. """ blockDir = self.getDatasetDirectory(blockstart) statusFilePath = os.path.join(blockDir, "STATUS.txt") if status == BlockwiseFileset.BLOCK_AVAILABLE: # touch the status file. open(statusFilePath, "w").close() elif os.path.exists(statusFilePath): # Remove the status file os.remove(statusFilePath) def setBlockStatusesForRoi(self, roi, status): block_starts = getIntersectingBlocks(self._description.block_shape, roi) for block_start in block_starts: self.setBlockStatus(block_start, status) def getEntireBlockRoi(self, block_start): """ Return the roi for the entire block that starts at the given coordinate. """ return getBlockBounds(self._description.view_shape, self._description.block_shape, block_start) def getAllBlockRois(self): """ Return the list of rois for all VIEWED blocks in the dataset. """ entire_dataset_roi = ([0] * len(self._description.view_shape), self._description.view_shape) block_starts = getIntersectingBlocks(self._description.block_shape, entire_dataset_roi) rois = [] for block_start in block_starts: rois.append(self.getEntireBlockRoi(block_start)) return rois def _transferData(self, roi, array_data, read): """ Read or write data from/to the fileset. :param roi: The region of interest. :param array_data: If ``read`` is True, ``array_data`` is the destination array for the read data. If ``read`` is False, array_data contains the data to write to disk. :param read: If True, read data from the fileset into ``array_data``. Otherwise, write data from ``array_data`` into the fileset on disk. 
:type read: bool """ entire_dataset_roi = ([0] * len(self._description.view_shape), self._description.view_shape) clipped_roi = getIntersection(roi, entire_dataset_roi) assert ( numpy.array(clipped_roi) == numpy.array(roi) ).all(), "Roi {} does not fit within dataset bounds: {}".format(roi, self._description.view_shape) block_starts = getIntersectingBlocks(self._description.block_shape, roi) # TODO: Parallelize this loop? for block_start in block_starts: entire_block_roi = self.getEntireBlockRoi(block_start) # Roi of this whole block within the whole dataset transfer_block_roi = getIntersection( entire_block_roi, roi ) # Roi of data needed from this block within the whole dataset block_relative_roi = ( transfer_block_roi[0] - block_start, transfer_block_roi[1] - block_start, ) # Roi of needed data from this block, relative to the block itself array_data_roi = ( transfer_block_roi[0] - roi[0], transfer_block_roi[1] - roi[0], ) # Roi of data needed from this block within array_data array_slicing = roiToSlice(*array_data_roi) self._transferBlockData(entire_block_roi, block_relative_roi, array_data, array_slicing, read) def _transferBlockData(self, entire_block_roi, block_relative_roi, array_data, array_slicing, read): """ Read or write data to a single block in the fileset. :param entire_block_roi: The roi of the entire block, relative to the whole dataset. :param block_relative_roi: The roi of the data being read/written, relative to the block itself (not the whole dataset). :param array_data: Either the source or the destination of the data being transferred to/from the fileset on disk. :param read: If True, read data from the block into ``array_data``. Otherwise, write data from ``array_data`` into the block on disk. :type read: bool """ datasetPathComponents = self.getDatasetPathComponents(entire_block_roi[0]) if self._description.format == "hdf5": self._transferBlockDataHdf5( entire_block_roi, block_relative_roi, array_data, array_slicing, read, datasetPathComponents ) else: assert False, "Unknown format" def _transferBlockDataHdf5( self, entire_block_roi, block_relative_roi, array_data, array_slicing, read, datasetPathComponents ): """ Transfer a block of data to/from an hdf5 dataset. See _transferBlockData() for details. We use separate parameters for array_data and array_slicing to allow users to pass an hdf5 dataset for array_data. """ # For the hdf5 format, the full path format INCLUDES the dataset name, e.g. /path/to/myfile.h5/volume/data path_parts = datasetPathComponents datasetDir = path_parts.externalDirectory hdf5FilePath = path_parts.externalPath if len(path_parts.internalPath) == 0: raise RuntimeError( "Your hdf5 block filename format MUST specify an internal path, e.g. block{roiString}.h5/volume/blockdata" ) block_start = entire_block_roi[0] if read: # Check for problems before reading. if self.getBlockStatus(block_start) is not BlockwiseFileset.BLOCK_AVAILABLE: raise BlockwiseFileset.BlockNotReadyError(block_start) hdf5File = self._getOpenHdf5Blockfile(hdf5FilePath) if ( self._description.dtype != object and isinstance(array_data, numpy.ndarray) and array_data.flags.c_contiguous ): hdf5File[path_parts.internalPath].read_direct( array_data, roiToSlice(*block_relative_roi), array_slicing ) elif self._description.dtype == object: # We store arrays of dtype=object as arrays of pickle strings. 
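                # (Each element of the hdf5 dataset is a variable-length uint8 array holding one
                # pickled object; see the h5py.special_dtype(vlen=numpy.uint8) setup in _createDatasetInFile.)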
array_pickled_data = hdf5File[path_parts.internalPath][roiToSlice(*block_relative_roi)] array_data[array_slicing] = vectorized_pickle_loads(array_pickled_data) else: array_data[array_slicing] = hdf5File[path_parts.internalPath][roiToSlice(*block_relative_roi)] else: # Create the directory if not os.path.exists(datasetDir): os.makedirs(datasetDir) # For debug purposes, output a copy of the settings # that were active **when this block was created** descriptionFileName = os.path.split(self._descriptionFilePath)[1] debugDescriptionFileCopyPath = os.path.join(datasetDir, descriptionFileName) BlockwiseFileset.writeDescription(debugDescriptionFileCopyPath, self._description) # Clear the block status. # The CALLER is responsible for setting it again. self.setBlockStatus(block_start, BlockwiseFileset.BLOCK_NOT_AVAILABLE) # Write the block data file hdf5File = self._getOpenHdf5Blockfile(hdf5FilePath) if path_parts.internalPath not in hdf5File: self._createDatasetInFile(hdf5File, path_parts.internalPath, entire_block_roi) dataset = hdf5File[path_parts.internalPath] data = array_data[array_slicing] if data.dtype != object: dataset[roiToSlice(*block_relative_roi)] = data else: # hdf5 can't handle datasets with dtype=object, # so we have to pickle each item first. pickled_data = vectorized_pickle_dumps(data) for index in numpy.ndindex(pickled_data.shape): block_index = index + numpy.array(block_relative_roi[0]) dataset[tuple(block_index)] = list(pickled_data[index]) def _createDatasetInFile(self, hdf5File, datasetName, roi): shape = tuple(roi[1] - roi[0]) chunks = self._description.chunks if chunks is not None: # chunks must not be bigger than the data in any dim chunks = numpy.minimum(chunks, shape) chunks = tuple(chunks) compression = self._description.compression compression_opts = self._description.compression_opts dtype = self._description.dtype if dtype == object: dtype = h5py.special_dtype(vlen=numpy.uint8) dataset = hdf5File.create_dataset( datasetName, shape=shape, dtype=dtype, chunks=chunks, compression=compression, compression_opts=compression_opts, ) # Set data attributes if self._description.drange is not None: dataset.attrs["drange"] = self._description.drange if _use_vigra: dataset.attrs["axistags"] = vigra.defaultAxistags(str(self._description.axes)).toJSON() def _getOpenHdf5Blockfile(self, blockFilePath): """ Return a handle to the open hdf5File at the given path. If we haven't opened the file yet, open it first. """ # Try once without locking if blockFilePath in list(self._openBlockFiles.keys()): return self._openBlockFiles[blockFilePath] # Obtain the lock and try again with self._lock: if blockFilePath not in list(self._openBlockFiles.keys()): try: writeLock = FileLock(blockFilePath, timeout=10) if self.mode == "a": acquired = writeLock.acquire(blocking=False) assert acquired, "Couldn't obtain an exclusive lock for writing to file: {}".format( blockFilePath ) self._fileLocks[blockFilePath] = writeLock elif self.mode == "r": assert writeLock.available(), "Can't read from a file that is being written to elsewhere." else: assert False, "Unsupported mode" self._openBlockFiles[blockFilePath] = h5py.File(blockFilePath, self.mode) except: log_exception(logger, "Couldn't open {}".format(blockFilePath)) raise return self._openBlockFiles[blockFilePath] def getOpenHdf5FileForBlock(self, block_start): """ Returns a handle to a file in this dataset. 
""" block_start = tuple(block_start) path_components = self.getDatasetPathComponents(block_start) return self._getOpenHdf5Blockfile(path_components.externalPath) def purgeAllLocks(self): """ Clears all .lock files from the local blockwise fileset. This may be necessary if previous processes crashed or were killed while some blocks were downloading. You must ensure that this is NOT called while more than one process (or thread) has access to the fileset. For example, in a master/worker situation, call this only from the master, before the workers have been started. """ found_lock = False view_shape = self.description.view_shape view_roi = ([0] * len(view_shape), view_shape) block_starts = list(getIntersectingBlocks(self.description.block_shape, view_roi)) for block_start in block_starts: blockFilePathComponents = self.getDatasetPathComponents(block_start) fileLock = FileLock(blockFilePathComponents.externalPath) found_lock |= fileLock.purge() if found_lock: logger.warning("Purged lock for block: {}".format(tuple(block_start))) return found_lock def exportRoiToHdf5(self, roi, exportDirectory, use_view_coordinates=True): """ Export an arbitrary roi to a single hdf5 file. The file will be placed in the given exportDirectory, and will be named according to the exported roi. :param roi: The roi to export :param exportDirectory: The directory in which the result should be placed. :param use_view_coordinates: If True, assume the roi was given relative to the view start. Otherwise, assume it was given relative to the on-disk coordinates. """ roi = list(map(TinyVector, roi)) if not use_view_coordinates: abs_roi = roi assert ( abs_roi[0] >= self.description.view_origin ), "Roi {} is out-of-bounds: must not span lower than the view origin: ".format( roi, self.description.origin ) view_roi = roi - self.description.view_origin else: view_roi = roi abs_roi = view_roi + self.description.view_origin # Always name the file according to the absolute roi roiString = "{}".format((list(abs_roi[0]), list(abs_roi[1]))) datasetPath = self._description.block_file_name_format.format(roiString=roiString) fullDatasetPath = os.path.join(exportDirectory, datasetPath) path_parts = PathComponents(fullDatasetPath) with h5py.File(path_parts.externalPath, "w") as f: self._createDatasetInFile(f, path_parts.internalPath, view_roi) dataset = f[path_parts.internalPath] self.readData(view_roi, dataset) return fullDatasetPath def exportSubset(self, roi, exportDirectory, use_view_coordinates=True): """ Create a new blockwise fileset by copying a subset of this blockwise fileset. :param roi: The portion to export. Must be along block boundaries, in ABSOLUTE coordinates. :param exportDirectory: The directory to copy the new blockwise fileset to. """ # For now, this implementation assumes it can simply copy EVERYTHING in the block directories, # including lock files. Therefore, we require that the fileset be opened in read-only mode. # If that's a problem, change this function to ignore lock files when copying (or purge them afterwards). roi = list(map(TinyVector, roi)) if not use_view_coordinates: abs_roi = roi assert ( abs_roi[0] >= self.description.view_origin ), "Roi {} is out-of-bounds: must not span lower than the view origin: ".format( roi, self.description.origin ) else: abs_roi = roi + self.description.view_origin assert self.mode == "r", "Can't export from a fileset that is open in read/write mode." 
block_shape = self._description.block_shape abs_shape = self._description.shape view_origin = self._description.view_origin assert (abs_roi[0] % block_shape == 0).all(), "exportSubset() requires roi to start on a block boundary" assert ( (abs_roi[1] % block_shape == 0) | (abs_roi[1] == abs_shape) ).all(), "exported subset must end on block or dataset boundary." if not os.path.exists(exportDirectory): os.makedirs(exportDirectory) source_desc_path = self._descriptionFilePath source_desc_dir, source_desc_filename = os.path.split(source_desc_path) source_root_dir = self.description.dataset_root_dir # Copy/update description file dest_desc_path = os.path.join(exportDirectory, source_desc_filename) if os.path.exists(dest_desc_path): dest_description = BlockwiseFileset.readDescription(dest_desc_path) else: dest_description = copy.copy(self._description) dest_description.view_shape = abs_roi[1] - view_origin dest_description.hash_id = None BlockwiseFileset.writeDescription(dest_desc_path, dest_description) # Determine destination root block dir if os.path.isabs(source_root_dir): source_root_dir = os.path.normpath(source_root_dir) source_root_dir_name = os.path.split(source_root_dir)[1] dest_root_dir = os.path.join(exportDirectory, source_root_dir_name) else: dest_root_dir = os.path.join(exportDirectory, source_root_dir) source_root_dir, _ = getPathVariants(source_root_dir, source_desc_dir) view_roi = abs_roi - view_origin block_starts = getIntersectingBlocks(block_shape, view_roi) for block_start in block_starts: source_block_dir = self.getDatasetDirectory(block_start) rel_block_dir = os.path.relpath(source_block_dir, source_root_dir) dest_block_dir = os.path.join(dest_root_dir, rel_block_dir) if os.path.exists(dest_block_dir): logger.info("Skipping existing block directory: {}".format(dest_block_dir)) elif not os.path.exists(source_block_dir): logger.info("Skipping missing block directory: {}".format(source_block_dir)) else: # Copy the entire block directory assert dest_block_dir[-1] != "/" dest_block_dir_parent = os.path.split(dest_block_dir)[0] if not os.path.exists(dest_block_dir_parent): os.makedirs(dest_block_dir_parent) shutil.copytree(source_block_dir, dest_block_dir) return dest_desc_path
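# Illustrative usage sketch (not part of the original module).  The description path, roi, and
# data below are hypothetical; the sketch assumes the dataset is at least one block in every dimension.
def _example_blockwise_roundtrip(description_path="/tmp/blocks/dataset_description.json"):
    """
    Write one block's worth of data into a BlockwiseFileset and read it back.
    """
    with BlockwiseFileset(description_path, mode="a") as bfs:
        block_shape = bfs.description.block_shape
        # Exactly the first block of the (viewed) dataset.
        roi = ([0] * len(block_shape), list(block_shape))
        data = numpy.zeros(block_shape, dtype=bfs.description.dtype)
        bfs.writeData(roi, data)
        # The caller is responsible for marking the block as complete.
        bfs.setBlockStatus(roi[0], BlockwiseFileset.BLOCK_AVAILABLE)
        roundtrip = bfs.readData(roi)
    return roundtrip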
class TestJsonConfig(object): SubConfigSchema = \ { "_schema_name" : "sub-schema", "_schema_version" : 1.1, "sub_settingA" : str, "sub_settingB" : str } TestSchema = \ { "_schema_name" : "test-schema", "_schema_version" : 1.1, "string_setting" : str, "int_setting" : int, "auto_int_setting" : AutoEval(int), "another_auto_int_setting" : AutoEval(int), "bool_setting" : bool, "formatted_setting" : FormattedField( requiredFields=["user_name", "user_home_town"]), "array_setting" : numpy.array, "array_from_string_setting" : AutoEval(numpy.array), "roi_setting" : RoiTuple(), "subconfig" : JsonConfigParser(SubConfigSchema) } @classmethod def setupClass(cls): testConfig = \ """ { "_schema_name" : "test-schema", "_schema_version" : 1.0, "string_setting" : "This is a sentence.", "int_setting" : 42, "auto_int_setting" : "7*6", "another_auto_int_setting" : 43, "bool_setting" : true, "formatted_setting" : "Greetings, {user_name} from {user_home_town}!", "array_setting" : [1,2,3,4], "array_from_string_setting" : "[1, 1*2, 1*3, 1*4]", "roi_setting" : [[1,2,3,4,5], [6,7,8,9,10]], "subconfig" : { "_schema_name" : "sub-schema", "_schema_version" : 1.0, "sub_settingA" : "yes", "sub_settingB" : "no" } } """ cls.tempDir = tempfile.mkdtemp() cls.configpath = os.path.join(cls.tempDir, "config.json") logger.debug("Using config file: " + cls.configpath) with open(cls.configpath, 'w') as f: f.write(testConfig) @classmethod def teardownClass(cls): # If the user is debugging, don't delete the test files. if logger.level > logging.DEBUG: shutil.rmtree(cls.tempDir) def testRead(self): configFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( TestJsonConfig.configpath ) assert configFields.string_setting == "This is a sentence." assert configFields.int_setting == 42 assert configFields.auto_int_setting == 42 assert configFields.another_auto_int_setting == 43 assert configFields.bool_setting is True assert configFields.formatted_setting.format( user_name="Stuart", user_home_town="Washington, DC" ) == "Greetings, Stuart from Washington, DC!" assert configFields.roi_setting == ((1,2,3,4,5), (6,7,8,9,10)) assert isinstance(configFields.array_setting, numpy.ndarray) assert (configFields.array_setting == [1,2,3,4]).all() assert isinstance(configFields.array_from_string_setting, numpy.ndarray) assert (configFields.array_from_string_setting == [1,2,3,4]).all() # Check sub-config settings assert configFields.subconfig.sub_settingA == "yes" assert configFields.subconfig.sub_settingB == "no" def testWrite(self): configFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( TestJsonConfig.configpath ) configFields.string_setting = "This is a different sentence." configFields.int_setting = 100 configFields.bool_setting = False # Write it. newConfigFilePath = TestJsonConfig.configpath + "_2" JsonConfigParser( TestJsonConfig.TestSchema ).writeConfigFile( newConfigFilePath, configFields ) # Read it back. newConfigFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( newConfigFilePath ) assert newConfigFields == configFields, "Config field content was not preserved after writing/reading" assert list(configFields.__dict__.items()) == list(configFields.__dict__.items()), "Config field ORDER was not preserved after writing/reading" @nose.tools.raises( JsonConfigParser.ParsingError ) def testExceptionIfRepeatedFields(self): """ This test creates a config that has an error: A field has been repeated. We expect to see an exception from the parser telling us that we screwed up. 
(See decorator above.) """ testConfig = \ """ { "_schema_name" : "test-schema", "_schema_version" : 1.0, "string_setting" : "First instance", "string_setting" : "Repeated instance" } """ tempDir = tempfile.mkdtemp() configpath = os.path.join(tempDir, "config.json") logger.debug("Using config file: " + configpath) with open(configpath, 'w') as f: f.write(testConfig) try: configFields = JsonConfigParser( TestJsonConfig.TestSchema ).parseConfigFile( configpath ) finally: # Clean up temporary file shutil.rmtree(tempDir)
ClusterConfigFields = \
{
    "_schema_name" : "cluster-execution-configuration",
    "_schema_version" : 1.0,

    "workflow_type" : str,
    "output_slot_id" : str,
    "sys_tmp_dir" : str,
    "task_subrequest_shape" : dict, # Optional.  Output description sub_block_shape overrides this now.
    "task_parallel_subrequests" : AutoEval(int),
    "task_threadpool_size" : AutoEval(int),
    "task_timeout_secs" : AutoEval(int),
    "use_node_local_scratch" : bool,
    "use_master_local_scratch" : bool,
    "node_output_compression_cmd" : FormattedField( requiredFields=["compressed_file", "uncompressed_file"] ),
    "node_output_decompression_cmd" : FormattedField( requiredFields=["compressed_file", "uncompressed_file"] ),
    "task_progress_update_command" : FormattedField( requiredFields=["progress"] ),
    "task_launch_server" : str,
    "output_log_directory" : str,
    "server_working_directory" : str,
    "command_format" : FormattedField( requiredFields=["task_args"], optionalFields=["task_name"] ),
    "debug_option_use_previous_node_files" : bool
}

def parseClusterConfigFile(configFilePath):
    """
    Convenience function for parsing cluster configs.
    Returns a Namespace object.
    (Similar to the behavior of argparse.ArgumentParser.parse_args() )
    """
    return JsonConfigParser(ClusterConfigFields).parseConfigFile(configFilePath)
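# For reference, a cluster config file matching the schema above might look roughly like the
# following.  This is an illustrative sketch only -- every path, command, and value here is
# made up and must be adapted to the actual cluster environment:
#
# {
#     "_schema_name" : "cluster-execution-configuration",
#     "_schema_version" : 1.0,
#     "workflow_type" : "ExampleWorkflow",
#     "output_slot_id" : "Output",
#     "sys_tmp_dir" : "/scratch/tmp",
#     "task_threadpool_size" : 4,
#     "task_timeout_secs" : "60*60",
#     "use_node_local_scratch" : true,
#     "use_master_local_scratch" : false,
#     "node_output_compression_cmd" : "bzip2 -c {uncompressed_file} > {compressed_file}",
#     "node_output_decompression_cmd" : "bunzip2 -c {compressed_file} > {uncompressed_file}",
#     "task_progress_update_command" : "echo progress: {progress}",
#     "task_launch_server" : "login-node.example.org",
#     "output_log_directory" : "/scratch/logs",
#     "server_working_directory" : "/scratch/work",
#     "command_format" : "qsub -N {task_name} run_task.sh {task_args}",
#     "debug_option_use_previous_node_files" : false
# }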
class TiledVolume(object): """ Given a directory of image tiles that make up a volume, produces numpy array volumes for arbitrary roi requests. """ #: These fields describe the schema of the description file. #: See the source code comments for a description of each field. DescriptionFields = \ { "_schema_name" : "tiled-volume-description", "_schema_version" : 1.0, "name" : str, "format" : str, "dtype" : AutoEval(), "bounds_zyx" : AutoEval(numpy.array), "shape_zyx" : AutoEval(numpy.array), # synonym for bounds_zyx (until we support offset_origin) "resolution_zyx" : AutoEval(numpy.array), "tile_shape_2d_yx" : AutoEval(numpy.array), # This doesn't change how the data is read from the server, # but instead specifies the indexing order of the numpy volumes produced. "output_axes" : str, "cache_tiles" : bool, # Offset not supported for now... #"origin_offset" : AutoEval(numpy.array), # For now, 3D-only, sliced across Z # TODO: support 5D. # Allow multiple url schemes: tiles might be addressed via pixel coordinates or row/column indexing # (z_index and z_start are synonyms here -- either is allowed) "tile_url_format" : FormattedField( requiredFields=[], optionalFields=["x_start", "y_start", "z_start", "x_stop", "y_stop", "z_stop", "x_index", "y_index", "z_index"] ), "extend_slices" : list } DescriptionSchema = JsonConfigParser(DescriptionFields) @classmethod def readDescription(cls, descriptionFilePath): # Read file description = TiledVolume.DescriptionSchema.parseConfigFile( descriptionFilePath) cls.updateDescription(description) return description @classmethod def updateDescription(cls, description): """ Some description fields are optional. If they aren't provided in the description JSON file, then this function provides them with default values, based on the other description fields. """ # Augment with default parameters. logger.debug(str(description)) # offset not supported yet... 
#if description.origin_offset is None: # description.origin_offset = numpy.array( [0]*len(description.bounds_zyx) ) #description.shape = description.bounds_zyx - description.origin_offset # for now, there's no difference between shape and bounds if description.shape_zyx is not None and description.bounds_zyx is not None: assert all(description.shape_zyx == description.bounds_zyx) if description.shape_zyx is None: description.shape_zyx = tuple(description.bounds_zyx) if description.bounds_zyx is None: description.bounds_zyx = tuple(description.shape_zyx) if not description.output_axes: description.output_axes = "zyx" assert description.output_axes is None or set(description.output_axes) == set("zyx"), \ "Axis order must include x,y,z (and nothing else)" if not description.extend_slices: description.extend_slices = [] if description.cache_tiles is None: description.cache_tiles = False def __init__(self, descriptionFilePath): self.description = TiledVolume.readDescription(descriptionFilePath) self._session = None assert self.description.format in vigra.impex.listExtensions().split(), \ "Unknown tile format: {}".format( self.description.format ) assert self.description.tile_shape_2d_yx.shape == (2, ) assert self.description.bounds_zyx.shape == (3, ) shape_dict = dict(zip('zyx', self.description.bounds_zyx)) self.output_shape = tuple(shape_dict[k] for k in self.description.output_axes) self._slice_remapping = {} for source, destinations in self.description.extend_slices: for dest in destinations: self._slice_remapping[dest] = source def close(self): self._session.close() def read(self, roi, result_out): """ roi: (start, stop) tuples, ordered according to description.output_axes """ output_axes = self.description.output_axes roi_transposed = zip(*roi) roi_dict = dict(zip(output_axes, roi_transposed)) roi = zip(*(roi_dict['z'], roi_dict['y'], roi_dict['x'])) # First, normalize roi and result to zyx order result_out = vigra.taggedView(result_out, output_axes) result_out = result_out.withAxes(*'zyx') assert numpy.array(roi).shape == ( 2, 3), "Invalid roi for 3D volume: {}".format(roi) roi = numpy.array(roi) assert (result_out.shape == (roi[1] - roi[0])).all() tile_blockshape = (1, ) + tuple(self.description.tile_shape_2d_yx) tile_starts = getIntersectingBlocks(tile_blockshape, roi) # We use a fresh tmp dir for each read to avoid conflicts between parallel reads tmpdir = tempfile.mkdtemp() pool = RequestPool() for tile_start in tile_starts: tile_roi_in = getBlockBounds(self.description.shape_zyx, tile_blockshape, tile_start) tile_roi_in = numpy.array(tile_roi_in) # This tile's portion of the roi intersecting_roi = getIntersection(roi, tile_roi_in) intersecting_roi = numpy.array(intersecting_roi) # Compute slicing within destination array and slicing within this tile destination_relative_intersection = numpy.subtract( intersecting_roi, roi[0]) tile_relative_intersection = intersecting_roi - tile_roi_in[0] # Get a view to the output slice result_region = result_out[roiToSlice( *destination_relative_intersection)] # Special feature: # Some slices are missing, in which case we provide fake data from a different slice. # Overwrite the rest args to pull data from an alternate source tile. 
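            # (Worked example: with "extend_slices" : [[100, [101, 102]]] in the description file
            # (hypothetical), requests for slices 101 and 102 are served from slice 100's tiles.)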
z_start = tile_roi_in[0][0] if z_start in self._slice_remapping: new_source_slice = self._slice_remapping[z_start] tile_roi_in[0][0] = new_source_slice tile_roi_in[1][0] = new_source_slice + 1 tile_index = numpy.array(tile_roi_in[0]) / tile_blockshape rest_args = { 'z_start': tile_roi_in[0][0], 'z_stop': tile_roi_in[1][0], 'y_start': tile_roi_in[0][1], 'y_stop': tile_roi_in[1][1], 'x_start': tile_roi_in[0][2], 'x_stop': tile_roi_in[1][2], 'z_index': tile_index[0], 'y_index': tile_index[1], 'x_index': tile_index[2] } # Quick sanity check assert rest_args['z_index'] == rest_args['z_start'] retrieval_fn = partial(self._retrieve_tile, tmpdir, rest_args, tile_relative_intersection, result_region) PARALLEL_REQ = True if PARALLEL_REQ: pool.add(Request(retrieval_fn)) else: # execute serially (leave the pool empty) retrieval_fn() pool.wait() # Clean up our temp files. shutil.rmtree(tmpdir) # For late imports requests = None PIL = None TEST_MODE = False # For testing purposes only. See below. def _retrieve_tile(self, tmpdir, rest_args, tile_relative_intersection, data_out): # Late import if not TiledVolume.requests: import requests TiledVolume.requests = requests requests = TiledVolume.requests tile_url = self.description.tile_url_format.format(**rest_args) tmp_filename = 'z{z_start}_y{y_start}_x{x_start}'.format(**rest_args) tmp_filename += '.' + self.description.format tmp_filepath = os.path.join(tmpdir, tmp_filename) logger.debug("Retrieving {}".format(tile_url)) try: if self._session is None: self._session = self._create_session() success = False tries = 0 while not success: try: r = self._session.get(tile_url) success = True except requests.ConnectionError: # This special 'pass' is here because we keep running into exceptions like this: # ConnectionError: HTTPConnectionPool(host='neurocean.int.janelia.org', port=6081): # Max retries exceeded with url: /ssd-3-tiles/abd1.5/43/24_25_0.jpg # (Caused by <class 'httplib.BadStatusLine'>: '') # So now we loop a few times and only give up if something is really wrong. if tries == 5: raise # give up tries += 1 except: # During testing, the server we're pulling from might be in our own process. # Apparently that means that it is not very responsive, leading to exceptions. # As a cheap workaround, just try one more time. if self.TEST_MODE: import time time.sleep(0.01) r = self._session.get(tile_url) else: raise if r.status_code == requests.codes.not_found: logger.warn("NOTFOUND: {}".format(tile_url, tmp_filepath)) data_out[:] = 0 else: USE_PIL = True if USE_PIL: # late import if not TiledVolume.PIL: import PIL import PIL.Image TiledVolume.PIL = PIL PIL = TiledVolume.PIL img = numpy.asarray(PIL.Image.open(StringIO(r.content))) assert img.ndim == 2 # img has axes xy, but we want zyx img = img[None] #img = img.transpose()[None] else: logger.debug("saving to {}".format(tmp_filepath)) with open(tmp_filepath, 'wb') as f: CHUNK_SIZE = 10 * 1024 for chunk in r.iter_content(CHUNK_SIZE): f.write(chunk) # Read the image from the disk with vigra img = vigra.impex.readImage(tmp_filepath, dtype='NATIVE') assert img.ndim == 3 assert img.shape[-1] == 1 # img has axes xyc, but we want zyx img = img.transpose()[None, 0, :, :] # Copy just the part we need into the destination array assert img[roiToSlice( *tile_relative_intersection)].shape == data_out.shape data_out[:] = img[roiToSlice(*tile_relative_intersection)] @classmethod def _create_session(cls): """ Generate a requests.Session object to use for this TiledVolume. 
Using a session allows us to benefit from a connection pool instead of establishing a new connection for every request. """ # Late import if not TiledVolume.requests: import requests TiledVolume.requests = requests requests = TiledVolume.requests session = requests.Session() # Replace the session http adapters with ones that use larger connection pools n_threads = Request.global_thread_pool.num_workers adapter = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads) adapter2 = requests.adapters.HTTPAdapter(pool_connections=n_threads, pool_maxsize=n_threads) session.mount('http://', adapter) session.mount('https://', adapter2) return session