def read(self,
         path: str,
         s3_kwargs: Dict[str, Any] = None,
         s3_client_kwargs: Dict[str, Any] = None,
         max_cache_size: int = None,
         **kwargs) -> xr.Dataset:
    """
    Read dataset from some Zarr storage.

    :param path: File path or object storage URL.
    :param s3_kwargs: if *path* is an object storage URL, keyword arguments
        passed to the S3 file system, that is
        ``s3fs.S3FileSystem(**s3_kwargs, ...)``.
    :param s3_client_kwargs: if *path* is an object storage URL, keyword
        arguments passed to the S3 (boto3) client, that is
        ``s3fs.S3FileSystem(..., client_kwargs=s3_client_kwargs)``.
    :param max_cache_size: if this is a positive integer, the store will be
        wrapped in an in-memory cache, that is
        ``store = zarr.LRUStoreCache(store, max_size=max_cache_size)``.
    :param kwargs: keyword arguments passed to the xarray Zarr adapter,
        that is ``xarray.open_zarr(..., **kwargs)``.
    :return: the dataset read from the Zarr storage.
    """
    path_or_store = path
    consolidated = False
    if isinstance(path, str):
        path_or_store, consolidated = get_path_or_s3_store(
            path_or_store,
            s3_kwargs=s3_kwargs,
            s3_client_kwargs=s3_client_kwargs,
            mode='r')
    if max_cache_size is not None and max_cache_size > 0:
        path_or_store = zarr.LRUStoreCache(path_or_store,
                                           max_size=max_cache_size)
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
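
# Illustrative sketch (not from the original sources): the pattern shared by the readers
# in this section is to wrap a Zarr store in zarr.LRUStoreCache before handing it to
# xarray. "example.zarr" and the cache size below are hypothetical placeholders.
import xarray as xr
import zarr

store = zarr.DirectoryStore("example.zarr")                 # any MutableMapping-like store works
cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)  # keep up to 256 MiB of chunks in memory
ds = xr.open_zarr(cached_store)                             # repeated chunk reads now hit the cache
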
def open(
        self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)
) -> "ChunkedDataset":
    """Opens a zarr dataset from disk from the path supplied in the constructor.

    :param mode: Mode to open dataset in, defaults to read-only (default: {"r"})
    :param cached: Whether to cache files read from disk using an LRU cache. (default: {True})
    :param cache_size_bytes: Size of cache in bytes (default: {1e9} (1GB))
    """
    if cached:
        self.root = zarr.open_group(
            store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes),
            mode=mode,
        )
    else:
        self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root[FRAME_ARRAY_KEY]
    self.agents = self.root[AGENT_ARRAY_KEY]
    self.scenes = self.root[SCENE_ARRAY_KEY]
    try:
        self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
    except KeyError:
        # the real issue here is that frame doesn't have traffic_light_faces_index_interval
        warnings.warn(
            f"{TL_FACE_ARRAY_KEY} not found in {self.path}! "
            f"You won't be able to use this zarr as an Ego/AgentDataset",
            RuntimeWarning,
            stacklevel=2,
        )
        self.tl_faces = np.empty((0,), dtype=TL_FACE_DTYPE)
    return self
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            ds = xr.open_zarr(cached_store)
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)
def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param zarr_kwargs: kwargs passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        # note: the link file must be opened in read mode ("w" would truncate it)
        with self._obs_file_system.open(level_path, "r") as fp:
            level_path = fp.read()
            # if level_path is a relative path, resolve it against the levels directory
            if not os.path.isabs(level_path):
                base_dir = os.path.dirname(self._dir_path)
                level_path = os.path.join(base_dir, level_path)
    store = s3fs.S3Map(root=level_path, s3=self._obs_file_system, check=False)
    cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
    with measure_time(
            tag=f"opened remote dataset {level_path} for level {index}"):
        return assert_cube(xr.open_zarr(cached_store, **zarr_kwargs),
                           name=level_path)
def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    if isinstance(path, str):
        endpoint_url = None
        root = None
        if 'endpoint_url' in kwargs:
            endpoint_url = kwargs.pop('endpoint_url')
            root = path
        if path.startswith("http://") or path.startswith("https://"):
            import urllib3.util
            url = urllib3.util.parse_url(path_or_store)
            if url.port is not None:
                endpoint_url = f'{url.scheme}://{url.host}:{url.port}'
            else:
                endpoint_url = f'{url.scheme}://{url.host}'
            root = url.path
            if root.startswith('/'):
                root = root[1:]
        if endpoint_url and root is not None:
            s3 = s3fs.S3FileSystem(anon=True,
                                   client_kwargs=dict(endpoint_url=endpoint_url))
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(path_or_store,
                                                       max_size=max_cache_size)
    return xr.open_zarr(path_or_store, **kwargs)
def open(self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)) -> "ChunkedDataset":
    """Opens a zarr dataset from disk from the path supplied in the constructor.

    Keyword Arguments:
        mode (str): Mode to open dataset in, default to read-only (default: {"r"})
        cached (bool): Whether to cache files read from disk using a LRU cache. (default: {True})
        cache_size_bytes (int): Size of cache in bytes (default: {1e9} (1GB))

    Raises:
        Exception: When any of the expected arrays (frames, agents, scenes) is missing
            or the store couldn't be opened.
    """
    if cached:
        self.root = zarr.open_group(
            store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
        )
    else:
        self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root[FRAME_ARRAY_KEY]
    self.agents = self.root[AGENT_ARRAY_KEY]
    self.scenes = self.root[SCENE_ARRAY_KEY]
    try:
        self.tl_faces = self.root[TL_FACE_ARRAY_KEY]
    except KeyError:
        warnings.warn(
            f"{TL_FACE_ARRAY_KEY} not found in {self.path}! Traffic lights will be disabled",
            RuntimeWarning,
            stacklevel=2,
        )
        self.tl_faces = np.empty((0,), dtype=TL_FACE_DTYPE)
    return self
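
# Hypothetical usage of ChunkedDataset.open() above; "./sample.zarr" is a placeholder path.
zarr_dt = ChunkedDataset("./sample.zarr")
zarr_dt.open(cached=True, cache_size_bytes=int(5e8))  # 500 MB LRU cache over the DirectoryStore
print(len(zarr_dt.frames), len(zarr_dt.agents), len(zarr_dt.scenes))
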
def open_cube(cube_config: CubeConfig,
              observer: Callable = None,
              trace_store_calls: bool = False,
              max_cache_size: int = 2 ** 30,
              sentinel_hub: SentinelHub = None,
              **sh_kwargs) -> xr.Dataset:
    """
    Open a data cube from SentinelHub.

    This is a facade function that hides the details of opening a
    volatile data cube from SentinelHub.

    :param cube_config: The cube configuration.
    :param observer: An observer function or callable that is called
        on every request made to SentinelHub.
    :param trace_store_calls: Whether to trace and dump calls
        made into the Zarr store.
    :param max_cache_size: Cache size in bytes. Defaults to 1 GB.
        If zero or None, no caching takes place.
    :param sentinel_hub: Optional instance of SentinelHub,
        the object representing the SENTINEL Hub API.
    :param sh_kwargs: Optional keyword arguments passed to the
        SentinelHub constructor. Only valid if *sentinel_hub* is not given.
    :return: the data cube represented by an xarray Dataset object.
    """
    if sentinel_hub is None:
        sentinel_hub = SentinelHub(**sh_kwargs)
    elif sh_kwargs:
        raise ValueError(f'unexpected keyword-arguments: {", ".join(sh_kwargs.keys())}')
    cube_store = SentinelHubChunkStore(sentinel_hub,
                                       cube_config,
                                       observer=observer,
                                       trace_store_calls=trace_store_calls)
    if max_cache_size:
        cube_store = zarr.LRUStoreCache(cube_store, max_cache_size)
    return xr.open_zarr(cube_store)
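
# Illustrative sketch (not from the original source): zarr.LRUStoreCache keeps hit/miss
# counters, which help verify that repeated requests against a chunk store, such as the
# one wrapped by open_cube() above, are actually served from memory. The local
# "demo.zarr" array used here is a hypothetical stand-in for a remote chunk store.
import numpy as np
import zarr

base_store = zarr.DirectoryStore("demo.zarr")
z = zarr.open(base_store, mode="w", shape=(100, 100), chunks=(10, 10), dtype="f8")
z[:] = np.random.random((100, 100))

cache = zarr.LRUStoreCache(base_store, max_size=2 ** 20)
za = zarr.open(cache, mode="r")
_ = za[:10, :10]   # first read: chunk fetched from the underlying store (a miss)
_ = za[:10, :10]   # second read: served from the LRU cache (a hit)
print("hits:", cache.hits, "misses:", cache.misses)
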
def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    consolidated = False
    if isinstance(path, str):
        region_name = None
        if 'endpoint_url' in kwargs:
            endpoint_url = kwargs.pop('endpoint_url')
            root = path
        else:
            endpoint_url, root = split_bucket_url(path)
        if 'region_name' in kwargs:
            region_name = kwargs.pop('region_name')
        if endpoint_url and root:
            s3 = s3fs.S3FileSystem(anon=True,
                                   client_kwargs=dict(endpoint_url=endpoint_url,
                                                      region_name=region_name))
            consolidated = s3.exists(f'{root}/.zmetadata')
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(
                        path_or_store, max_size=max_cache_size)
        else:
            consolidated = os.path.exists(
                os.path.join(path_or_store, '.zmetadata'))
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
def _get_dataset_lazily(self, index: int,
                        parameters: Dict[str, Any]) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param parameters: keyword arguments passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        # note: the link file must be opened in read mode ("w" would truncate it)
        with self._s3_file_system.open(level_path, "r") as fp:
            level_path = fp.read()
            # if level_path is a relative path, resolve it against the levels directory
            if not os.path.isabs(level_path):
                base_dir = os.path.dirname(self._dir_path)
                level_path = os.path.join(base_dir, level_path)
    store = s3fs.S3Map(root=level_path, s3=self._s3_file_system, check=False)
    max_size = self.get_chunk_cache_capacity(index)
    if max_size:
        store = zarr.LRUStoreCache(store, max_size=max_size)
    with measure_time(
            tag=f"opened remote dataset {level_path} for level {index}"):
        consolidated = self._s3_file_system.exists(f'{level_path}/.zmetadata')
        return assert_cube(xr.open_zarr(store,
                                        consolidated=consolidated,
                                        **parameters),
                           name=level_path)
def read(self, path: str, **kwargs) -> xr.Dataset:
    path_or_store = path
    consolidated = False
    mode = 'read'
    root = None
    if isinstance(path, str):
        client_kwargs = {}
        if 'client_kwargs' in kwargs:
            client_kwargs = kwargs.pop('client_kwargs')
        if 'endpoint_url' in kwargs:
            client_kwargs['endpoint_url'] = kwargs.pop('endpoint_url')
            root = path
        if 'region_name' in kwargs:
            client_kwargs['region_name'] = kwargs.pop('region_name')
        path_or_store, root, client_kwargs = _get_path_or_store(path_or_store,
                                                                client_kwargs,
                                                                mode,
                                                                root)
        if 'endpoint_url' in client_kwargs and root is not None:
            s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
            consolidated = s3.exists(f'{root}/.zmetadata')
            path_or_store = s3fs.S3Map(root=root, s3=s3, check=False)
            if 'max_cache_size' in kwargs:
                max_cache_size = kwargs.pop('max_cache_size')
                if max_cache_size > 0:
                    path_or_store = zarr.LRUStoreCache(path_or_store,
                                                       max_size=max_cache_size)
        else:
            consolidated = os.path.exists(os.path.join(path_or_store, '.zmetadata'))
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
def open_zarr(s3_path, anon=False, cache=False):
    """Open a zarr archive and return its root."""
    s3 = s3fs.S3FileSystem(anon=anon)
    store = s3fs.S3Map(s3_path, s3=s3, check=False, create=False)
    if cache:
        lrucache = zarr.LRUStoreCache(store=store, max_size=1 << 29)
        root = zarr.group(store=lrucache)
    else:
        root = zarr.group(store=store)
    return root
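
# Hypothetical usage of the open_zarr() helper above; the bucket path is a placeholder.
root = open_zarr("my-bucket/archive.zarr", anon=True, cache=True)
print(list(root.array_keys()))  # array reads on this group go through the LRU cache
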
def get_storage_map(url: str, creds: dict = None, memcache: float = None):
    fs, path, store = _get_storage_map(url, creds)
    # TODO: make sure that fs.listdir and store.get do not cache filenames locally,
    # because in that case the program would not notice if something is added to S3 or GCS
    if (store.get(".zarray") is None and store.get(".zgroup") is None
            and len(fs.listdir(path)) > 0):
        raise NotZarrFolderException(
            "This url is not empty, but it is not a zarr url either; "
            "for safety reasons, refusing to overwrite this folder"
        )
    return store if not memcache else zarr.LRUStoreCache(store, memcache * (2 ** 20))
def open_data(self, data_id: str, **open_params) -> Any:
    cci_schema = self.get_open_data_params_schema(data_id)
    cci_schema.validate_instance(open_params)
    cube_kwargs, open_params = cci_schema.process_kwargs_subset(
        open_params, ('variable_names', 'time_range', 'bbox'))
    max_cache_size: int = 2 ** 30
    chunk_store = CciChunkStore(self._cci_odp, data_id, cube_kwargs)
    if max_cache_size:
        chunk_store = zarr.LRUStoreCache(chunk_store, max_cache_size)
    ds = xr.open_zarr(chunk_store)
    ds = self._normalize_dataset(ds, cci_schema, **open_params)
    return ds
def __init__(self, store, cache_size=128 * (1024 ** 2)):
    # don't cache meta-data, read once
    self.cache = zarr.LRUStoreCache(store, max_size=cache_size)
    self.root = zarr.open(self.cache, mode="r")
    meta_data, recmd, time_seconds = load_meta(self.root)
    self.depth = recmd
    self.time_seconds = time_seconds
    self.sample_events = self.root["sample_events"][:]
    self.segy_filenames = self.root["segy_filenames"][:]
def read(self,
         path: str,
         client_kwargs: Dict[str, Any] = None,
         **kwargs) -> xr.Dataset:
    path_or_store = path
    consolidated = False
    if isinstance(path, str):
        path_or_store, consolidated = get_path_or_obs_store(path_or_store,
                                                            client_kwargs,
                                                            mode='r')
        if 'max_cache_size' in kwargs:
            max_cache_size = kwargs.pop('max_cache_size')
            if max_cache_size > 0:
                path_or_store = zarr.LRUStoreCache(path_or_store,
                                                   max_size=max_cache_size)
    return xr.open_zarr(path_or_store, consolidated=consolidated, **kwargs)
def open_from_obs(path: str,
                  endpoint_url: str = None,
                  max_cache_size: int = 2 ** 28) -> xr.Dataset:
    """
    Open an xcube (xarray dataset) from S3-compatible object storage (OBS).

    :param path: Path having format "<bucket>/<my>/<sub>/<path>"
    :param endpoint_url: Optional URL of the OBS service endpoint.
        If omitted, the AWS S3 service URL is used.
    :param max_cache_size: If > 0, size of a memory cache in bytes,
        e.g. 2**30 = one gigabyte.
        If None or size <= 0, no memory cache will be used.
    :return: an xarray dataset
    """
    s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(endpoint_url=endpoint_url))
    store = s3fs.S3Map(root=path, s3=s3, check=False)
    if max_cache_size is not None and max_cache_size > 0:
        store = zarr.LRUStoreCache(store, max_size=max_cache_size)
    return xr.open_zarr(store)
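
# Hypothetical call to open_from_obs() above; the bucket path and endpoint are placeholders.
ds = open_from_obs("my-bucket/my/sub/path.zarr",
                   endpoint_url="https://obs.example.com",
                   max_cache_size=2 ** 30)  # 1 GB in-memory chunk cache
print(ds)
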
def load_dask_array_from_s3(plate_id, index, resolution='0'):
    cache_size_mb = 2048
    cfg = {
        'anon': True,
        'client_kwargs': {
            'endpoint_url': 'https://minio-dev.openmicroscopy.org/',
        },
        'root': 'idr/zarr/v0.1-extra/plate-%s.zarr/%s/%s' % (plate_id, index, resolution),
    }
    s3 = s3fs.S3FileSystem(
        anon=cfg['anon'],
        client_kwargs=cfg['client_kwargs'],
    )
    store = s3fs.S3Map(root=cfg['root'], s3=s3, check=False)
    cached_store = zarr.LRUStoreCache(store, max_size=(cache_size_mb * 2 ** 20))
    # data.shape is (t, c, z, y, x) by convention
    return da.from_zarr(cached_store)
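
# Hypothetical usage of load_dask_array_from_s3() above; the plate id and index values are
# placeholders. Only the chunks touched by compute() are fetched from S3, and re-reads of
# the same chunks are served from the LRU cache.
arr = load_dask_array_from_s3("0001", "0")
print(arr.shape, arr.chunksize)       # lazy (t, c, z, y, x) dask array
first_plane = arr[0, 0, 0].compute()  # pulls only the chunks covering this plane
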
def __getattr__(self, name: str) -> Any:
    if name in self._cube_config:
        if name in self._dataset_cache:
            return self._dataset_cache[name]
        else:
            dataset_descriptor = self._cube_config[name]
            fs_type = dataset_descriptor.get("FileSystem", "local")
            path = dataset_descriptor.get('Path')
            ds = None
            if not path:
                print("Missing 'path' entry in dataset descriptor")
            if fs_type == 'obs':
                data_format = dataset_descriptor.get('Format', 'zarr')
                if data_format != 'zarr':
                    print(f"Invalid format={data_format!r} in dataset descriptor")
                client_kwargs = {}
                if 'Endpoint' in dataset_descriptor:
                    client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
                if 'Region' in dataset_descriptor:
                    client_kwargs['region_name'] = dataset_descriptor['Region']
                s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
                store = s3fs.S3Map(root=path, s3=s3, check=False)
                cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
                ds = xr.open_zarr(cached_store)
            elif fs_type == 'local':
                if not os.path.isabs(path):
                    path = os.path.join(self.base_dir, path)
                data_format = dataset_descriptor.get('Format', 'nc')
                if data_format == 'nc':
                    ds = xr.open_dataset(path)
                elif data_format == 'zarr':
                    ds = xr.open_zarr(path)
                else:
                    print(f"Invalid format={data_format!r} in dataset descriptor")
            self._dataset_cache[name] = ds
            return ds
    return super().__getattribute__(name)
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        client_kwargs: Mapping[str, Any] = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    endpoint_url, root = split_bucket_url(path)
    if endpoint_url:
        kwargs['endpoint_url'] = endpoint_url
        path = root

    client_kwargs = dict(client_kwargs or {})
    for arg_name in ['endpoint_url', 'region_name']:
        if arg_name in kwargs:
            client_kwargs[arg_name] = kwargs.pop(arg_name)

    obs_file_system = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(cached_store, consolidated=consolidated, **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(obs_file_system,
                                                  path,
                                                  zarr_kwargs=kwargs,
                                                  ds_id=ds_id,
                                                  exception_type=exception_type)
    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}')
def __init__(self, store, cache_size=512 * (1024 ** 2)):
    """
    rss format data access.

    Parameters
    ----------
    store : Instance of a Zarr storage object; see s3fs.S3Map for remote S3 storage,
        or zarr.DirectoryStore, as common types of store.
    """
    # don't cache meta-data, read once
    self.root = zarr.open(store, mode="r")

    clear_output()
    print("Mounting line access.")
    cache = zarr.LRUStoreCache(store, max_size=cache_size)
    inline_root = zarr.open(cache, mode="r")
    self.inline_root = inline_root["inline"]
    crossline_root = zarr.open(cache, mode="r")
    self.crossline_root = crossline_root["crossline"]

    clear_output()
    print("Configuring meta-data.")
    self.bounds = self.root["bounds"]
    self.ilxl = np.vstack([
        self.root["coords"]["inlines"][:],
        self.root["coords"]["crosslines"][:],
    ]).T
    self.xy = np.vstack([
        self.root["coords"]["cdpx"][:],
        self.root["coords"]["cdpy"][:],
    ]).T
    self.kdtree = None

    clear_output()
    print("Connection complete.")
def open_data(self, data_id: str, **open_params) -> xr.Dataset:
    assert_instance(data_id, str, name='data_id')
    fs, root, open_params = self.load_fs(open_params)
    zarr_store = fs.get_mapper(data_id)
    cache_size = open_params.pop('cache_size', None)
    if isinstance(cache_size, int) and cache_size > 0:
        zarr_store = zarr.LRUStoreCache(zarr_store, max_size=cache_size)
    log_access = open_params.pop('log_access', None)
    if log_access:
        zarr_store = LoggingStore(zarr_store,
                                  name=f'zarr_store({data_id!r})')
    consolidated = open_params.pop('consolidated',
                                   fs.exists(f'{data_id}/.zmetadata'))
    try:
        return xr.open_zarr(zarr_store,
                            consolidated=consolidated,
                            **open_params)
    except ValueError as e:
        raise DataStoreError(f'Failed to open'
                             f' dataset {data_id!r}: {e}') from e
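
# Illustrative sketch (not from the original source) of the fsspec-based variant above:
# fs.get_mapper() returns a MutableMapping that zarr.LRUStoreCache can wrap directly.
# The bucket URL is a hypothetical placeholder.
import fsspec
import xarray as xr
import zarr

fs = fsspec.filesystem("s3", anon=True)
mapper = fs.get_mapper("my-bucket/dataset.zarr")
cached = zarr.LRUStoreCache(mapper, max_size=2 ** 28)
consolidated = fs.exists("my-bucket/dataset.zarr/.zmetadata")
ds = xr.open_zarr(cached, consolidated=consolidated)
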
def open(self, mode: str = "r", cached: bool = True, cache_size_bytes: int = int(1e9)) -> None:
    """Opens a zarr dataset from disk from the path supplied in the constructor.

    Keyword Arguments:
        mode (str): Mode to open dataset in, default to read-only (default: {"r"})
        cached (bool): Whether to cache files read from disk using a LRU cache. (default: {True})
        cache_size_bytes (int): Size of cache in bytes (default: {1e9} (1GB))

    Raises:
        Exception: When any of the expected arrays (frames, agents, scenes) is missing
            or the store couldn't be opened.
    """
    if cached:
        self.root = zarr.open_group(
            store=zarr.LRUStoreCache(zarr.DirectoryStore(self.path), max_size=cache_size_bytes), mode=mode
        )
    else:
        self.root = zarr.open_group(self.path, mode=mode)
    self.frames = self.root[FRAME_ARRAY_KEY]
    self.agents = self.root[AGENT_ARRAY_KEY]
    self.scenes = self.root[SCENE_ARRAY_KEY]
def __init__(
        self,
        zarr_dataset_path: str,
        cache_zarr: bool = False,
        with_history: bool = False,
        return_indices: bool = False,
        agents_from_standard_mask_only: bool = False,
):
    if cache_zarr:
        zarr_root = zarr.open_group(
            store=zarr.LRUStoreCache(zarr.DirectoryStore(zarr_dataset_path), max_size=int(1e9)),
            mode="r",
        )
    else:
        zarr_root = zarr.open_group(zarr_dataset_path, mode="r")

    self.cumulative_sizes = zarr_root[SCENE_ARRAY_KEY]["frame_index_interval"][:, 1]

    if with_history:
        raise NotImplementedError

    if agents_from_standard_mask_only:
        self.sample_function = partial(
            generate_frame_sample_without_hist,
            agents=zarr_root[AGENT_ARRAY_KEY],
            tl_faces=zarr_root[TL_FACE_ARRAY_KEY],
            agents_from_standard_mask_only=True,
            mask_agent_indices=zarr_root[MASK_AGENT_INDICES_ARRAY_KEY],
        )
    else:
        self.sample_function = partial(
            generate_frame_sample_without_hist,
            agents=zarr_root[AGENT_ARRAY_KEY],
            tl_faces=zarr_root[TL_FACE_ARRAY_KEY],
        )
    self.with_history = with_history
    self.return_indices = return_indices
    self.zarr_root = zarr_root
def open_cube(cube_config: CubeConfig,
              observer: Callable = None,
              trace_store_calls: bool = False,
              max_cache_size: int = 2 ** 30,
              **sh_kwargs) -> xr.Dataset:
    """
    Open a data cube from SentinelHub.

    This is a facade function that hides the details of opening a
    volatile data cube from SentinelHub.

    :param cube_config: The cube configuration.
    :param observer: An observer function or callable that is called
        on every request made to SentinelHub.
    :param trace_store_calls: Whether to trace and dump calls
        made into the Zarr store.
    :param max_cache_size: Cache size in bytes. Defaults to 1 GB.
        If zero or None, no caching takes place.
    :param sh_kwargs: Keyword arguments passed to the SentinelHub constructor.
    :return: the data cube represented by an xarray Dataset object.
    """
    sentinel_hub = SentinelHub(**sh_kwargs)
    cube_store = SentinelHubStore(sentinel_hub,
                                  cube_config,
                                  observer=observer,
                                  trace_store_calls=trace_store_calls)
    if max_cache_size:
        cube_store = zarr.LRUStoreCache(cube_store, max_cache_size)
    return xr.open_zarr(cube_store)
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        s3_kwargs: Mapping[str, Any] = None,
                                        s3_client_kwargs: Mapping[str, Any] = None,
                                        chunk_cache_capacity: int = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    s3, root = parse_s3_fs_and_root(path,
                                    s3_kwargs=s3_kwargs,
                                    s3_client_kwargs=s3_client_kwargs,
                                    mode='r')

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=root, s3=s3, check=False)
        if chunk_cache_capacity:
            store = zarr.LRUStoreCache(store, max_size=chunk_cache_capacity)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = s3.exists(f'{root}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(store, consolidated=consolidated, **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(s3,
                                                  root,
                                                  zarr_kwargs=kwargs,
                                                  ds_id=ds_id,
                                                  chunk_cache_capacity=chunk_cache_capacity,
                                                  exception_type=exception_type)
    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}')
import urllib.request

import matplotlib.colors
import xarray as xr
import zarr
from fsspec.implementations.http import HTTPFileSystem

from .structure_graph import acronym_to_allen_id, allen_id_to_acronym, structure_graph, allen_id_to_tree_node
from .allen_id_label import labels_for_allen_id
from .swc_morphology import swc_morphology_geometry
from .structure_mesh import structure_mesh

from IPython.core.debugger import set_trace

_image_fs = HTTPFileSystem()
# Todo: Use AWS store after Scott / Lydia upload
_image_store = _image_fs.get_mapper(
    "https://thewtex.github.io/allen-ccf-itk-vtk-zarr/average_template_50_chunked.zarr"
)
_image_store_cached = zarr.LRUStoreCache(_image_store, max_size=None)
_image_ds = xr.open_zarr(_image_store_cached, consolidated=True)
_image_da = _image_ds.average_template_50

_label_image_fs = HTTPFileSystem()
# Todo: Use AWS store after Scott / Lydia upload
_label_image_store = _label_image_fs.get_mapper(
    "https://thewtex.github.io/allen-ccf-itk-vtk-zarr/allen_ccfv3_annotation_50_contiguous.zarr"
)
_label_image_store_cached = zarr.LRUStoreCache(_label_image_store, max_size=None)
_label_image_ds = xr.open_zarr(_label_image_store_cached, consolidated=True)
_label_image_da = _label_image_ds.allen_ccfv3_annotation


@register
def test_cached(self):
    store_cache = zarr.LRUStoreCache(self.store, max_size=2 * 24)
    cube = xr.open_zarr(store_cache)
    self.assert_4d_cube_is_valid(cube)
def get_dataset(self, ds_name: str) -> xr.Dataset:
    if ds_name in self.dataset_cache:
        ds, _, _ = self.dataset_cache[ds_name]
    else:
        dataset_descriptor = self.get_dataset_descriptor(ds_name)

        path = dataset_descriptor.get('Path')
        if not path:
            raise ServiceConfigError(
                f"Missing 'path' entry in dataset descriptor {ds_name}")

        t1 = time.clock()

        fs_type = dataset_descriptor.get('FileSystem', 'local')
        if fs_type == 'obs':
            data_format = dataset_descriptor.get('Format', 'zarr')
            if data_format != 'zarr':
                raise ServiceConfigError(
                    f"Invalid format={data_format!r} in dataset descriptor {ds_name!r}")
            client_kwargs = {}
            if 'Endpoint' in dataset_descriptor:
                client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
            if 'Region' in dataset_descriptor:
                client_kwargs['region_name'] = dataset_descriptor['Region']
            s3 = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)
            store = s3fs.S3Map(root=path, s3=s3, check=False)
            cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
            with log_time(f"opened remote dataset {path}"):
                ds = xr.open_zarr(cached_store)
        elif fs_type == 'local':
            if not os.path.isabs(path):
                path = os.path.join(self.base_dir, path)
            data_format = dataset_descriptor.get('Format', 'nc')
            if data_format == 'nc':
                with log_time(f"opened local NetCDF dataset {path}"):
                    ds = xr.open_dataset(path)
            elif data_format == 'zarr':
                with log_time(f"opened local zarr dataset {path}"):
                    ds = xr.open_zarr(path)
            else:
                raise ServiceConfigError(
                    f"Invalid format={data_format!r} in dataset descriptor {ds_name!r}")
        elif fs_type == 'computed':
            if not os.path.isabs(path):
                path = os.path.join(self.base_dir, path)
            with open(path) as fp:
                python_code = fp.read()

            local_env = dict()
            global_env = None
            try:
                exec(python_code, global_env, local_env)
            except Exception as e:
                raise ServiceError(
                    f"Failed to compute dataset {ds_name!r} from {path!r}: {e}") from e

            callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
            callable_args = dataset_descriptor.get('Args', [])

            callable_obj = local_env.get(callable_name)
            if callable_obj is None:
                raise ServiceConfigError(
                    f"Invalid dataset descriptor {ds_name!r}: "
                    f"no callable named {callable_name!r} found in {path!r}")
            elif not callable(callable_obj):
                raise ServiceConfigError(
                    f"Invalid dataset descriptor {ds_name!r}: "
                    f"object {callable_name!r} in {path!r} is not callable")

            args = list()
            for arg_value in callable_args:
                if isinstance(arg_value, str) and len(arg_value) > 2 \
                        and arg_value.startswith('@') and arg_value.endswith('@'):
                    ref_ds_name = arg_value[1:-1]
                    if not self.get_dataset_descriptor(ref_ds_name):
                        raise ServiceConfigError(
                            f"Invalid dataset descriptor {ds_name!r}: "
                            f"argument {arg_value!r} of callable {callable_name!r} "
                            f"must reference another dataset")
                    args.append(self.get_dataset(ref_ds_name))
                else:
                    args.append(arg_value)

            try:
                with log_time(f"created computed dataset {ds_name}"):
                    ds = callable_obj(*args)
            except Exception as e:
                raise ServiceError(
                    f"Failed to compute dataset {ds_name!r} "
                    f"from function {callable_name!r} in {path!r}: {e}") from e
            if not isinstance(ds, xr.Dataset):
                raise ServiceError(
                    f"Failed to compute dataset {ds_name!r} "
                    f"from function {callable_name!r} in {path!r}: "
                    f"expected an xarray.Dataset but got a {type(ds)}")
        else:
            raise ServiceConfigError(
                f"Invalid fs={fs_type!r} in dataset descriptor {ds_name!r}")

        tile_grid_cache = dict()
        self.dataset_cache[ds_name] = ds, dataset_descriptor, tile_grid_cache

        t2 = time.clock()

        if TRACE_PERF:
            print(f'PERF: opening {ds_name!r} took {t2 - t1} seconds')

    return ds
def __init__(self, filename: str, hdf5group: str = None,
             hdf5file_mode: str = 'r',
             store: Union[MutableMapping, str, Path] = None, store_path: str = None,
             store_mode: str = 'a',
             LRU: bool = False, LRU_max_size: int = 2 ** 30,
             max_chunksize=2 * 2 ** 20):
    """
    Args:
        filename: str or File-like object, file name string or File-like object
            to be read by zarr
        hdf5group: str, hdf5 group in hdf5 file to be read by zarr along with its
            children. default is the root group.
        hdf5file_mode: str, subset of h5py file access modes, filename must exist
            'r'  readonly, default 'r'
            'r+' read and write
        store: collections.abc.MutableMapping or str, zarr store.
            if a string path is passed, zarr.DirectoryStore is created at the given path,
            if None, zarr.MemoryStore is used
        store_mode: store data access mode, default 'a'
            'r'  readonly, compatible zarr hierarchy should already exist in the passed store
            'r+' read and write, return error if file does not exist, for updating zarr hierarchy
            'w'  create store, remove data if it exists
            'w-' or 'x' create store, fail if exists
            'a'  read and write, create if it does not exist, default 'a'
        store_path: string, path in zarr store
        LRU: bool, if store is not already zarr.LRUStoreCache, add a zarr.LRUStoreCache
            store layer on top of the currently used store
        LRU_max_size: int, maximum zarr.LRUStoreCache cache size, only used if
            store is zarr.LRUStoreCache, or the LRU argument is True
        max_chunksize: maximum chunk size to use when creating zarr hierarchy,
            this is useful if only a small slice of data needs to be read
    """

    # Verify arguments
    if hdf5file_mode not in ('r', 'r+'):
        raise ValueError("hdf5file_mode must be 'r' or 'r+'")
    self.hdf5file_mode = hdf5file_mode

    # Verify arguments
    if not isinstance(LRU, bool):
        raise TypeError(f"Expected bool for LRU, received {type(LRU)}")
    self.LRU = LRU

    if not isinstance(LRU_max_size, int):
        raise TypeError(f"Expected int for LRU_max_size, received {type(LRU_max_size)}")
    self.LRU_max_size = LRU_max_size

    if not isinstance(max_chunksize, int):
        raise TypeError(f"Expected int for max_chunksize, received {type(max_chunksize)}")
    self.max_chunksize = max_chunksize

    # store, store_path, and store_mode are passed through to zarr
    self.store_path = store_path
    self.store_mode = store_mode
    if store is not None and LRU is True and not isinstance(store, zarr.LRUStoreCache):
        self.store = zarr.LRUStoreCache(store, max_size=self.LRU_max_size)
    else:
        self.store = store

    # create dictionary mapping hdf5 filter numbers to compatible zarr codecs
    self._hdf5_regfilters_subset = {}
    self._fill_regfilters()

    # dictionary to hold addresses of hdf5 objects in file
    self._address_dict = {}

    # create zarr format hierarchy for datasets and attributes compatible with hdf5 file,
    # dataset contents are not copied, unless they contain variable-length strings
    self.zgroup = zarr.open_group(self.store, mode=self.store_mode, path=self.store_path)
    if self.store is None:
        self.store = self.zgroup.store

    # FileChunkStore requires uri
    if isinstance(filename, str):
        self.uri = filename
    else:
        try:
            self.uri = getattr(filename, 'path', None)
            if self.uri is None:
                self.uri = filename.name
        except:
            self.uri = ''

    # Access hdf5 file and create zarr hierarchy
    if hdf5group is not None and not isinstance(hdf5group, str):
        raise TypeError(f"Expected str for hdf5group, received {type(hdf5group)}")
    self.hdf5group = hdf5group
    self.filename = filename
    if self.store_mode != 'r':
        self.file = h5py.File(self.filename, mode=self.hdf5file_mode)
        self.group = self.file[self.hdf5group] if self.hdf5group is not None else self.file
        self.create_zarr_hierarchy(self.group, self.zgroup)
        self.file.close()

    if isinstance(self.filename, str):
        self.chunkstore_file = fsspec.open(self.filename, mode='rb')
        self.chunk_store = FileChunkStore(self.store,
                                          chunk_source=self.chunkstore_file.open())
    else:
        self.chunk_store = FileChunkStore(self.store, chunk_source=self.filename)
    if LRU is True and not isinstance(self.chunk_store, zarr.LRUStoreCache):
        self.chunk_store = zarr.LRUStoreCache(self.chunk_store,
                                              max_size=self.LRU_max_size)

    # open zarr group
    store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
    self.zgroup = zarr.open_group(self.store, mode=store_mode_cons,
                                  path=self.store_path, chunk_store=self.chunk_store)
def _create_dataset_entry(
        self, ds_id: str) -> Tuple[MultiLevelDataset, Dict[str, Any]]:
    dataset_descriptor = self.get_dataset_descriptor(ds_id)

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    t1 = time.perf_counter()

    fs_type = dataset_descriptor.get('FileSystem', 'local')
    if fs_type == 'obs':
        data_format = dataset_descriptor.get('Format', 'zarr')
        s3_client_kwargs = {}
        if 'Endpoint' in dataset_descriptor:
            s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
        if 'Region' in dataset_descriptor:
            s3_client_kwargs['region_name'] = dataset_descriptor['Region']
        obs_file_system = s3fs.S3FileSystem(anon=True,
                                            client_kwargs=s3_client_kwargs)
        if data_format == 'zarr':
            store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
            cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
            with measure_time(tag=f"opened remote zarr dataset {path}"):
                ds = xr.open_zarr(cached_store)
                ml_dataset = BaseMultiLevelDataset(ds)
        elif data_format == 'levels':
            with measure_time(tag=f"opened remote levels dataset {path}"):
                ml_dataset = ObjectStorageMultiLevelDataset(
                    ds_id,
                    obs_file_system,
                    path,
                    exception_type=ServiceConfigError)
        else:
            raise ServiceConfigError(
                f"Invalid format={data_format!r} in dataset descriptor {ds_id!r}")
    elif fs_type == 'local':
        if not os.path.isabs(path):
            path = os.path.join(self.base_dir, path)
        data_format = dataset_descriptor.get('Format', 'nc')
        if data_format == 'nc':
            with measure_time(tag=f"opened local NetCDF dataset {path}"):
                ds = xr.open_dataset(path)
                ml_dataset = BaseMultiLevelDataset(ds)
        elif data_format == 'zarr':
            with measure_time(tag=f"opened local zarr dataset {path}"):
                ds = xr.open_zarr(path)
                ml_dataset = BaseMultiLevelDataset(ds)
        elif data_format == 'levels':
            with measure_time(tag=f"opened local levels dataset {path}"):
                ml_dataset = FileStorageMultiLevelDataset(path)
        else:
            raise ServiceConfigError(
                f"Invalid format={data_format!r} in dataset descriptor {ds_id!r}")
    elif fs_type == 'memory':
        if not os.path.isabs(path):
            path = os.path.join(self.base_dir, path)
        callable_name = dataset_descriptor.get('Function', COMPUTE_DATASET)
        input_dataset_ids = dataset_descriptor.get('InputDatasets', [])
        input_parameters = dataset_descriptor.get('InputParameters', {})
        for input_dataset_id in input_dataset_ids:
            if not self.get_dataset_descriptor(input_dataset_id):
                raise ServiceConfigError(
                    f"Invalid dataset descriptor {ds_id!r}: "
                    f"Input dataset {input_dataset_id!r} of callable {callable_name!r} "
                    f"must reference another dataset")
        with measure_time(tag=f"opened memory dataset {path}"):
            ml_dataset = ComputedMultiLevelDataset(ds_id,
                                                   path,
                                                   callable_name,
                                                   input_dataset_ids,
                                                   self.get_ml_dataset,
                                                   input_parameters,
                                                   exception_type=ServiceConfigError)
    else:
        raise ServiceConfigError(
            f"Invalid fs={fs_type!r} in dataset descriptor {ds_id!r}")

    t2 = time.perf_counter()

    if self.config.get("trace_perf", False):
        _LOG.info(f'Opening {ds_id!r} took {t2 - t1} seconds')

    return ml_dataset, dataset_descriptor