def consolidate_metadata(target):
    """
    Consolidate Zarr metadata

    Parameters
    ----------
    target : str
        Path or url of the Zarr store.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
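# Minimal usage sketch for consolidate_metadata above, assuming `fsspec` and
# `zarr` are imported at module level; the store URL is a hypothetical
# placeholder, not a real bucket.
consolidate_metadata("gs://example-bucket/example.zarr")  # hypothetical store URL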
def fetch_zarr(zarr_url, storage_options={'anon': True}):
    zg = zarr.open_consolidated(fsspec.get_mapper(zarr_url, **storage_options), mode='r')
    dimensions = {}
    variable_arrays = {}
    for k, a in zg.arrays():
        if k in a.attrs['_ARRAY_DIMENSIONS']:
            dimensions[k] = a.attrs['_ARRAY_DIMENSIONS']
        else:
            variable_arrays[k] = a.attrs['_ARRAY_DIMENSIONS']
    return zg, dimensions, variable_arrays
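# Minimal usage sketch for fetch_zarr, assuming a consolidated Zarr store
# readable with anonymous S3 credentials; the URL is a hypothetical placeholder.
zg, dims, var_dims = fetch_zarr("s3://example-bucket/example.zarr")
print(dims)      # coordinate arrays mapped to their dimension names
print(var_dims)  # data variables mapped to their dimension names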
def _set_file_path(path):
    """Find the speakers.json under the given path or the one above it. Intended to
    band-aid the different paths returned in restored and continued training."""
    path_restore = os.path.join(os.path.dirname(path), "speakers.json")
    path_continue = os.path.join(path, "speakers.json")
    fs = fsspec.get_mapper(path).fs
    if fs.exists(path_restore):
        return path_restore
    if fs.exists(path_continue):
        return path_continue
    raise FileNotFoundError(f" [!] `speakers.json` not found in {path}")
def open_dsets(df):
    """Open datasets from cloud storage and return a merged xarray dataset,
    or None if the merge fails."""
    dsets = [
        xr.open_zarr(fsspec.get_mapper(ds_url), consolidated=True).pipe(drop_all_bounds)
        for ds_url in df.zstore
    ]
    try:
        ds = xr.merge(dsets, join='exact')
        return ds
    except ValueError:
        return None
def nc2zarr(source_url, cache_location):
    """convert netcdf data to zarr"""
    target_url = source_url + ".zarr"
    with dask.config.set(scheduler="single-threaded"):
        ds = (
            xr.open_dataset(fsspec.open(source_url).open())
            .pipe(preproc)
            .pipe(postproc)
            .load()
            .chunk(chunks)
        )
        mapper = fsspec.get_mapper(target_url)
        ds.to_zarr(mapper)
    return target_url
def split_and_write(model, scenario, member, method):
    ds = get_scratch_ds(model, scenario, member, method)

    scen_mapper = fsspec.get_mapper(
        f'az://carbonplan-downscaling/cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    clean_store(scen_mapper)
    print('writing scen')
    write_zarr(ds.sel(time=slice('2015-01', None)), scen_mapper)

    hist_mapper = fsspec.get_mapper(
        f'az://carbonplan-downscaling/cmip6/{method}/conus/4000m/monthly/{model}.historical.{member}.zarr',
        account_name="carbonplan",
        account_key=account_key,
    )
    clean_store(hist_mapper)
    print('writing hist')
    write_zarr(ds.sel(time=slice(None, '2014-12')), hist_mapper)
def test_binary_table():
    out = kerchunk.fits.process_file(btable, extension=1)
    m = fsspec.get_mapper("reference://", fo=out)
    z = zarr.open(m)
    arr = z["1"]
    with open(btable, "rb") as f:
        hdul = fits.open(f)
        attr2 = dict(arr.attrs)
        assert attr2.pop('_ARRAY_DIMENSIONS') == ['x']
        assert attr2 == dict(hdul[1].header)
        assert (arr['order'] == hdul[1].data['order']).all()
        assert (arr['mag'] == hdul[1].data['mag']).all()
        assert (arr['name'].astype("U") == hdul[1].data['name']).all()  # strings come out as bytes
def test_open_asset_preprocess_error():
    path = os.path.join(
        here, './sample_data/cesm-le/b.e11.B1850C5CN.f09_g16.005.pop.h.SHF.040001-049912.nc'
    )
    print(path)
    path = f'file://{path}'
    mapper = fsspec.get_mapper(path)

    def preprocess(ds):
        return ds.set_coords('foo')

    with pytest.raises(RuntimeError):
        _open_asset(mapper, 'netcdf', cdf_kwargs={}, varname=['SHF'], preprocess=preprocess)
def consolidate_metadata(writes: List[str], target: str) -> None:
    """
    Consolidate the metadata of the Zarr group at `target`.

    Parameters
    ----------
    writes : List[str]
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency. The actual value isn't used.
    target : str
        The URL for the (combined) Zarr group.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
def load_dataset(path: str, unpack: bool = False, consolidated: bool = False) -> Dataset:
    store = fsspec.get_mapper(path, check=False, create=False)
    ds = xr.open_zarr(store, concat_characters=False, consolidated=consolidated)
    if unpack:
        ds = unpack_variables(ds, dtype="float16")
    for v in ds:
        # Workaround for https://github.com/pydata/xarray/issues/4386
        if v.endswith("_mask"):
            ds[v] = ds[v].astype(bool)
    return ds
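# Minimal usage sketch for load_dataset, assuming the module-level imports
# (fsspec, xarray, the Dataset type, unpack_variables) from the original file;
# the Zarr path is a hypothetical placeholder.
ds = load_dataset("gs://example-bucket/example.zarr", consolidated=True)
print(ds)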
def get_scratch_ds(model, scenario, member, method):
    print(f'loading {model}.{scenario}.{member}')
    mapper = fsspec.get_mapper(
        f'az://carbonplan-scratch/cmip6/{method}/conus/4000m/monthly/{model}.{scenario}.{member}.zarr',
        account_name="carbonplan",
    )
    ds = xr.open_zarr(mapper, consolidated=True)[
        cp_vars
    ]  # .load(retries=task_retries).chunk(chunks)
    print(f'ds size: {ds.nbytes / 1e9}')
    return ds
def test_fsspec_get_mapper():
    """Added for #788"""
    with tempzip(archive_data) as z:
        mapping = fsspec.get_mapper(f"zip::{z}")

        assert isinstance(mapping, collections.abc.Mapping)
        keys = sorted(list(mapping.keys()))
        assert keys == ["a", "b", "deeply/nested/path"]

        # mapping.getitems() will call FSMap.fs.cat()
        # which was not accurately implemented for zip.
        assert isinstance(mapping, fsspec.mapping.FSMap)
        items = dict(mapping.getitems(keys))
        assert items == {"a": b"", "b": b"hello", "deeply/nested/path": b"stuff"}
def test_mapping_prefix(tmpdir):
    tmpdir = str(tmpdir)
    os.makedirs(os.path.join(tmpdir, "afolder"))
    open(os.path.join(tmpdir, "afile"), "w").write("test")
    open(os.path.join(tmpdir, "afolder", "anotherfile"), "w").write("test2")

    m = fsspec.get_mapper("file://" + tmpdir)
    assert "afile" in m
    assert m["afolder/anotherfile"] == b"test2"

    fs = fsspec.filesystem("file")
    m2 = fs.get_mapper(tmpdir)
    m3 = fs.get_mapper("file://" + tmpdir)

    assert m == m2 == m3
def map_tgt(tgt: str) -> fsspec.FSMap:
    """Uses fsspec to create a mapped object from a target connection string.

    Parameters
    ----------
    tgt : str
        Path of the target store.

    Returns
    -------
    fsspec.FSMap
        fsspec mapped object
    """
    # `connection_string` comes from module scope; pass it as a keyword so it is
    # forwarded to the filesystem as a storage option rather than being taken as
    # get_mapper's positional `check` argument.
    tgt_map = fsspec.get_mapper(tgt, connection_string=connection_string)
    return tgt_map
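# Minimal usage sketch for map_tgt, assuming an Azure Blob container reachable
# via adlfs and a module-level `connection_string`; the container path below is
# a hypothetical placeholder.
connection_string = "..."  # supplied via environment/config in practice
tgt_map = map_tgt("az://example-container/example.zarr")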
async def refresh_dataset(request: Request, dataset_id: str):
    if dataset_id:
        if dataset_id in DATASETS_STORE:
            logger.info(f"Refreshing dataset: {dataset_id}.")
            ds = xr.open_zarr(
                fsspec.get_mapper(DATASETS_STORE[dataset_id].zarr_url),
                consolidated=True,
            )
            DATASETS_STORE[dataset_id].set_ds(ds)
            for r in request.app.routes:
                if r.path == f"/{dataset_id}":
                    request.app.routes.remove(r)
            request.app.mount(f"/{dataset_id}", DATASETS_STORE[dataset_id].app)
            logger.info(f"Refresh completed: {dataset_id}.")
            return {"status": "success", "dataset_id": dataset_id}
def _open_dataset(self):
    import xarray as xr
    import fsspec

    assert fsspec.__version__ >= "0.3.6", "zarr plugin requires fsspec >= 0.3.6"

    from fsspec import filesystem, get_mapper
    from fsspec.utils import update_storage_options, infer_storage_options

    storage_options = infer_storage_options(self.urlpath)
    update_storage_options(storage_options, self.storage_options)

    self._fs = filesystem(storage_options['protocol'])
    if storage_options['protocol'] != 'file':
        self._mapper = get_mapper(self.urlpath)
        self._ds = xr.open_zarr(self._mapper, **self.kwargs)
    else:
        self._ds = xr.open_zarr(self.urlpath, **self.kwargs)
def consolidate_metadata(target, writes: Optional[List[str]] = None) -> None:
    """
    Consolidate the metadata of the Zarr group at `target`.

    Parameters
    ----------
    target : str
        The URL for the (combined) Zarr group.
    writes : list of strings, optional
        The URLs the combined stores were written to. This is only a
        parameter to introduce a dependency in the pipeline execution
        graph. The actual value isn't used.
    """
    mapper = fsspec.get_mapper(target)
    zarr.consolidate_metadata(mapper)
def _to_zarr(  # type: ignore[no-untyped-def]
    arr,
    url,
    component=None,
    storage_options=None,
    overwrite=False,
    compute=True,
    return_stored=False,
    attrs=None,
    **kwargs,
):
    """Extension of dask.array.core.to_zarr that can set attributes on the
    resulting Zarr array, in the same Dask operation.
    """
    # call Dask version with compute=False just to check preconditions
    da.to_zarr(
        arr,
        url,
        component=component,
        storage_options=storage_options,
        overwrite=overwrite,
        compute=False,
        return_stored=return_stored,
        **kwargs,
    )

    storage_options = storage_options or {}
    if isinstance(url, str):
        mapper = get_mapper(url, **storage_options)
    else:
        # assume the object passed is already a mapper
        mapper = url  # pragma: no cover

    chunks = [c[0] for c in arr.chunks]
    z = dask.delayed(_zarr_create_with_attrs)(
        shape=arr.shape,
        chunks=chunks,
        dtype=arr.dtype,
        store=mapper,
        path=component,
        overwrite=overwrite,
        attrs=attrs,
        **kwargs,
    )
    return arr.store(z, lock=False, compute=compute, return_stored=return_stored)
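# Minimal usage sketch for _to_zarr, assuming the module-level imports (dask,
# dask.array as da, fsspec's get_mapper) and the `_zarr_create_with_attrs`
# helper from the original file; the output path is a hypothetical local
# directory.
import dask.array as da

arr = da.zeros((100, 100), chunks=(10, 10))
_to_zarr(arr, "/tmp/example.zarr", component="x", attrs={"units": "m"}, overwrite=True)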
def get_version(zstore, method='fsspec'):
    client = requests.session()
    baseurl = 'http://hdl.handle.net/api/handles/'
    query1 = '?type=IS_PART_OF'
    query2 = '?type=VERSION_NUMBER'

    # get the `netcdf_tracking_ids` from the zstore metadata
    if method == 'fsspec':
        mapper = fsspec.get_mapper(zstore)
    else:
        mapper = zstore
    group = zarr.open_consolidated(mapper)
    tracking_ids = group.attrs['tracking_id']

    # query the dataset handler to obtain `dataset_tracking_id` and `version`
    versions = []
    datasets = []
    for file_tracking_id in tracking_ids.split('\n')[0:1]:
        url = baseurl + file_tracking_id[4:] + query1
        r = client.get(url)
        r.raise_for_status()
        dataset_tracking_id = r.json()['values'][0]['data']['value']
        datasets += [dataset_tracking_id]

        if ';' in dataset_tracking_id:
            # multiple dataset_ids erroneously reported
            dtracks = dataset_tracking_id.split(';')
            vs = []
            for dtrack in dtracks:
                url2 = baseurl + dtrack[4:] + query2
                r = client.get(url2)
                r.raise_for_status()
                vs += [r.json()['values'][0]['data']['value']]
            v = sorted(vs)[-1]
        else:
            url2 = baseurl + dataset_tracking_id[4:] + query2
            r = client.get(url2)
            r.raise_for_status()
            v = r.json()['values'][0]['data']['value']
        versions += [v]

    version_id = list(set(versions))
    dataset_id = list(set(datasets))
    assert len(version_id) == 1
    return dataset_id[0], version_id[0]
def test_no_dircache(s3):
    from s3fs.tests.test_s3fs import endpoint_uri

    import fsspec

    d = fsspec.get_mapper(
        "s3://" + root,
        anon=False,
        client_kwargs={"endpoint_url": endpoint_uri},
        use_listings_cache=False,
    )
    d.clear()
    assert list(d) == []
    d[1] = b"1"
    assert list(d) == ["1"]
    d.clear()
    assert list(d) == []
def map_and_open_zarr_link(file_loc_str: str) -> xr.Dataset:
    """Takes a Zarr store path, opens it with fsspec, and returns an xarray dataset.

    Parameters
    ----------
    file_loc_str : str
        zarr store target path

    Returns
    -------
    xr.Dataset
        output xarray dataset
    """
    mapped_key = fsspec.get_mapper(file_loc_str, anon=True)
    ds = xr.open_zarr(mapped_key, consolidated=True)
    return ds
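# Minimal usage sketch for map_and_open_zarr_link, assuming anonymous S3 access
# (s3fs installed) to a consolidated Zarr store; the URL is a hypothetical
# placeholder.
ds = map_and_open_zarr_link("s3://example-bucket/example.zarr")
print(ds.dims)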
def to_zarr(input_path: str, output_path: str, dictionary_path: str):
    import dask.dataframe as dd
    import fsspec
    import xarray as xr
    from dask.diagnostics import ProgressBar

    logger.info(f"Converting parquet at {input_path} to {output_path}")
    df = dd.read_parquet(input_path)
    trait_columns = df.columns[df.columns.to_series().str.match(r"^\d+")]
    # 41210_Z942 -> 41210 (UKB field id)
    trait_group_ids = [c.split("_")[0] for c in trait_columns]
    # 41210_Z942 -> Z942 (Data coding value as one-hot encoding in phenotype, e.g.)
    trait_code_ids = ["_".join(c.split("_")[1:]) for c in trait_columns]
    trait_values = df[trait_columns].astype("float").to_dask_array()
    trait_values.compute_chunk_sizes()
    trait_id_to_name = (
        pd.read_csv(
            dictionary_path,
            sep=",",
            usecols=["FieldID", "Field"],
            dtype={"FieldID": str, "Field": str},
        )
        .set_index("FieldID")["Field"]
        .to_dict()
    )
    trait_name = [trait_id_to_name.get(v) for v in trait_group_ids]
    ds = xr.Dataset(
        dict(
            id=("samples", np.asarray(df["userId"], dtype=int)),
            trait=(("samples", "traits"), trait_values),
            trait_id=("traits", np.asarray(trait_columns.values, dtype=str)),
            trait_group_id=("traits", np.array(trait_group_ids, dtype=int)),
            trait_code_id=("traits", np.array(trait_code_ids, dtype=str)),
            trait_name=("traits", np.array(trait_name, dtype=str)),
        )
    )
    # Keep chunks small in trait dimension for faster per-trait processing
    ds["trait"] = ds["trait"].chunk(dict(samples="auto", traits=100))
    ds = ds.rename_vars({v: f"sample_{v}" for v in ds})
    logger.info(f"Saving dataset to {output_path}:\n{ds}")
    with ProgressBar():
        ds.to_zarr(fsspec.get_mapper(output_path), consolidated=True, mode="w")
    logger.info("Done")
def test_setitem_types():
    import array

    m = fsspec.get_mapper("memory://")
    m["a"] = array.array("i", [1])
    if sys.byteorder == "little":
        assert m["a"] == b"\x01\x00\x00\x00"
    else:
        assert m["a"] == b"\x00\x00\x00\x01"
    m["b"] = bytearray(b"123")
    assert m["b"] == b"123"
    m.setitems({"c": array.array("i", [1]), "d": bytearray(b"123")})
    if sys.byteorder == "little":
        assert m["c"] == b"\x01\x00\x00\x00"
    else:
        assert m["c"] == b"\x00\x00\x00\x01"
    assert m["d"] == b"123"
def test_missing_nonasync(m):
    zarr = pytest.importorskip("zarr")
    zarray = {
        "chunks": [1],
        "compressor": None,
        "dtype": "<f8",
        "fill_value": "NaN",
        "filters": [],
        "order": "C",
        "shape": [10],
        "zarr_format": 2,
    }
    refs = {".zarray": json.dumps(zarray)}

    m = fsspec.get_mapper("reference://", fo=refs, remote_protocol="memory")
    a = zarr.open_array(m)
    assert str(a[0]) == "nan"
def test_ops():
    MemoryFileSystem.store.clear()
    m = fsspec.get_mapper("memory://")
    assert not m
    assert list(m) == []

    with pytest.raises(KeyError):
        m["hi"]
    assert m.pop("key", 0) == 0

    m["key0"] = b"data"
    assert list(m) == ["key0"]
    assert m["key0"] == b"data"
    m.clear()
    assert list(m) == []
def combine_and_write(sources, target, append_dim, first=True):
    # while debugging this, I had intermittent fsspec / hdf5 read errors related to
    # "trying to read from a closed file"
    # but they seem to have gone away for now
    double_open_files = [fsspec.open(url).open() for url in sources]
    ds = xr.open_mfdataset(double_open_files, combine="nested", concat_dim=concat_dim)

    # by definition, this should be a contiguous chunk
    ds = ds.chunk({append_dim: len(sources)})

    if first:
        kwargs = dict(mode="w")
    else:
        kwargs = dict(mode="a", append_dim=append_dim)

    mapper = fsspec.get_mapper(target)
    ds.to_zarr(mapper, **kwargs)
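# Minimal usage sketch for combine_and_write, assuming `concat_dim` is defined
# at module level (as the function body implies) and that xarray/fsspec are
# imported; the source URLs and target path are hypothetical placeholders.
concat_dim = "time"
sources = [
    "https://example.org/data/file_2020-01.nc",
    "https://example.org/data/file_2020-02.nc",
]
combine_and_write(sources, "gs://example-bucket/combined.zarr", append_dim="time", first=True)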
def get_obs_std(obs, train_period_start, train_period_end, variables, gcm_grid_spec, ds=None):
    # if std is not already saved, ds must be a valid dataset
    path = make_coarse_obs_path(
        obs=obs,
        train_period_start=train_period_start,
        train_period_end=train_period_end,
        variables=variables,
        gcm_grid_spec=gcm_grid_spec,
        chunking_approach='std',
    )
    store = fsspec.get_mapper(intermediate_cache_path + '/' + path)
    if '.zmetadata' not in store:
        std = ds.std(dim='time')
        std.to_zarr(store, mode="w", consolidated=True)
    else:
        std = xr.open_zarr(store).load()
    return std
def get_gwas_sumstat_manifest(path: str) -> pd.DataFrame:
    store = fsspec.get_mapper(path)
    df = []
    for f in list(store):
        fn = f.split("/")[-1]
        parts = re.findall(r"ukb_chr(\d+)_(\d+)_(.*).parquet", fn)
        if not parts:
            continue
        parts = parts[0]
        df.append(
            dict(
                contig=parts[0],
                batch=int(parts[1]),
                trait_id=parts[2],
                trait_group_id=parts[2].split("_")[0],
                trait_code_id="_".join(parts[2].split("_")[1:]),
                file=f,
            )
        )
    return pd.DataFrame(df)
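# Minimal usage sketch for get_gwas_sumstat_manifest, assuming module-level
# imports of fsspec, re, and pandas; the bucket path is a hypothetical
# placeholder.
manifest = get_gwas_sumstat_manifest("gs://example-bucket/sumstats/")
print(manifest.groupby("contig").size())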