def _metadata_is_consolidated(m: fsspec.FSMap) -> bool:
    try:
        zarr.open_consolidated(m)
        consolidated = True
    except KeyError:
        # group with un-consolidated metadata, or array
        consolidated = False
    return consolidated
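# A minimal usage sketch for _metadata_is_consolidated, assuming fsspec and
# zarr are importable and that `url` points at an existing Zarr store. The
# helper name `_open_any_zarr` and the URL argument are hypothetical
# illustrations, not part of the original code.
def _open_any_zarr(url: str):
    import fsspec
    import zarr

    mapper = fsspec.get_mapper(url)
    # Prefer the consolidated reader when .zmetadata is present; fall back to
    # zarr.open otherwise, which handles both groups and arrays.
    if _metadata_is_consolidated(mapper):
        return zarr.open_consolidated(mapper)
    return zarr.open(mapper)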
def test_zarr_array_to_parquet_table(dataset):
    """
    Test converting from a zarr array to a parquet table, specifying a list
    of variables to store and setting 'snappy' compression.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        zarrstore: str = os.path.join(tmpdir, "temp.zarr")
        dataset.to_zarr(store=zarrstore, consolidated=True)
        zarrarray: zarr.hierarchy.Group = zarr.open_consolidated(store=zarrstore)

        parquetpath: str = os.path.join(tmpdir, "temp.parquet")
        ndarray_to_parquet(
            ndarray=zarrarray,
            parquetpath=parquetpath,
            variables=["longitude", "latitude", "h_corr", "delta_time"],
            compression="snappy",
        )

        df: dask.dataframe.core.DataFrame = dask.dataframe.read_parquet(
            path=parquetpath
        )
        assert len(df) == 1404
        assert list(df.columns) == [
            "longitude",
            "latitude",
            "h_corr_1",
            "h_corr_2",
            "delta_time_1",
            "delta_time_2",
        ]
        assert all(np.issubdtype(dtype, np.float64) for dtype in df.dtypes)
def _open_snp_sites(self):
    if self._cache_snp_sites is None:
        path = f"{self.path}/v3/snp_genotypes/all/sites/"
        store = SafeStore(self.fs.get_mapper(path))
        root = zarr.open_consolidated(store=store)
        self._cache_snp_sites = root
    return self._cache_snp_sites
def get_tracking_ids(zstore):
    """Given a GC zarr location, fetch the associated dataset/netCDF tracking IDs."""
    from requests import session
    from zarr import open_consolidated
    from fsspec import get_mapper

    # request params
    client = session()
    base_url = 'http://hdl.handle.net/api/handles/'
    dset_id_query = '?type=IS_PART_OF'
    version_query = '?type=VERSION_NUMBER'

    # get primary tracking id
    netcdf_tracking_ids = open_consolidated(
        get_mapper(zstore)).attrs['tracking_id'].split('\n')
    file_tracking_id = netcdf_tracking_ids[0]

    version_ids = []
    dataset_ids = []

    # query for dataset_tracking_id
    dset_id_url = base_url + file_tracking_id[4:] + dset_id_query
    r = client.get(dset_id_url)
    r.raise_for_status()
    dataset_tracking_id = r.json()['values'][0]['data']['value']

    if ';' in dataset_tracking_id:
        # multiple dataset_ids erroneously reported
        dataset_tracking_id = "ambiguous"

    return dataset_tracking_id, netcdf_tracking_ids
def open_group(cls, store, mode='r', synchronizer=None, group=None,
               consolidated=False, consolidate_on_close=False):
    import zarr
    min_zarr = '2.2'

    if LooseVersion(zarr.__version__) < min_zarr:  # pragma: no cover
        raise NotImplementedError("Zarr version %s or greater is "
                                  "required by xarray. See zarr "
                                  "installation "
                                  "http://zarr.readthedocs.io/en/stable/"
                                  "#installation" % min_zarr)

    if consolidated or consolidate_on_close:
        if LooseVersion(
                zarr.__version__) <= '2.2.1.dev2':  # pragma: no cover
            raise NotImplementedError("Zarr version 2.2.1.dev2 or greater "
                                      "is required for consolidated "
                                      "metadata.")

    open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
    if consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)
    return cls(zarr_group, consolidate_on_close)
def open_group(
    cls,
    store,
    mode="r",
    synchronizer=None,
    group=None,
    consolidated=False,
    consolidate_on_close=False,
    chunk_store=None,
    append_dim=None,
    write_region=None,
):
    import zarr

    # zarr doesn't support pathlib.Path objects yet. zarr-python#601
    if isinstance(store, pathlib.Path):
        store = os.fspath(store)

    open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
    if chunk_store:
        open_kwargs["chunk_store"] = chunk_store

    if consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)
    return cls(zarr_group, consolidate_on_close, append_dim, write_region)
def get_schema(url, coords):
    mapper = fsspec.get_mapper(url)
    group = zarr.open_consolidated(mapper)
    schema = synth.read_schema_from_zarr(group, coords)
    return schema
def _open_zarr(self):
    fmap = fsspec.get_mapper(
        f's3://{self.bucket_name}/{self.dataset_id}',
        **self.storage_options,
    )
    self._zarr_group = zarr.open_consolidated(fmap)
    self._total_size = np.sum(
        [arr.nbytes for _, arr in self._zarr_group.items()])
    self._total_size_repr = memory_repr(self._total_size)
def consolidate_metadata(self, metadata_key='.zmetadata'):
    '''
    Wrapper over zarr.consolidate_metadata to pass the chunk store when
    opening the zarr store.
    '''
    zarr.consolidate_metadata(self.store, metadata_key=metadata_key)
    store_mode_cons = 'r' if self.store_mode == 'r' else 'r+'
    self.zgroup = zarr.open_consolidated(self.store,
                                         metadata_key=metadata_key,
                                         mode=store_mode_cons,
                                         chunk_store=self.zgroup.chunk_store,
                                         path=self.store_path)
    return self.zgroup
def _open_snp_genotypes(self, *, sample_set):
    try:
        return self._cache_snp_genotypes[sample_set]
    except KeyError:
        release = self._lookup_release(sample_set=sample_set)
        path = f"{self.path}/{release}/snp_genotypes/all/{sample_set}/"
        store = SafeStore(self.fs.get_mapper(path))
        root = zarr.open_consolidated(store=store)
        self._cache_snp_genotypes[sample_set] = root
        return root
def _open_site_filters(self, *, mask, analysis):
    key = mask, analysis
    try:
        return self._cache_site_filters[key]
    except KeyError:
        path = f"{self.path}/v3/site_filters/{analysis}/{mask}/"
        store = SafeStore(self.fs.get_mapper(path))
        root = zarr.open_consolidated(store=store)
        self._cache_site_filters[key] = root
        return root
def fetch_zarr(zarr_url, storage_options={'anon': True}):
    zg = zarr.open_consolidated(fsspec.get_mapper(zarr_url, **storage_options),
                                mode='r')
    dimensions = {}
    variable_arrays = {}
    for k, a in zg.arrays():
        if k in a.attrs['_ARRAY_DIMENSIONS']:
            dimensions[k] = a.attrs['_ARRAY_DIMENSIONS']
        else:
            variable_arrays[k] = a.attrs['_ARRAY_DIMENSIONS']
    return zg, dimensions, variable_arrays
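# Usage sketch for fetch_zarr against a public S3 store; the URL and helper
# name below are hypothetical placeholders. Anonymous access is the default
# via storage_options={'anon': True}.
def _example_fetch_zarr():
    zg, dims, data_vars = fetch_zarr("s3://example-bucket/ocean.zarr")
    # `dims` maps dimension-coordinate arrays to their _ARRAY_DIMENSIONS,
    # `data_vars` does the same for the remaining data arrays.
    return zg, dims, data_vars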
def _open_with_xarray_or_zarr(
    m: fsspec.FSMap, consolidated: bool
) -> Tuple[Union[xr.Dataset, zarr.hierarchy.Group, zarr.core.Array], bool]:
    try:
        result = xr.open_zarr(m, consolidated=consolidated)
        is_xarray_dataset = True
    except KeyError:
        # xarray requires the _ARRAY_DIMENSIONS attribute; assume it is
        # missing when a KeyError is raised
        result = zarr.open_consolidated(m) if consolidated else zarr.open(m)
        is_xarray_dataset = False
    return result, is_xarray_dataset
def open_mask_group(self):
    """Open the zarr group that contains the masks

    Returns
    -------
    mask_group : zarr.Group
    """
    mapper = self.mask_fs.get_mapper(self.mask_path)
    zgroup = zarr.open_consolidated(mapper)
    return zgroup
def _load(self):
    import zarr
    if self._grp is None:
        # obtain the zarr root group
        if isinstance(self._urlpath, zarr.hierarchy.Group):
            # use already-opened group, allows support for nested groups
            # as catalogs
            root = self._urlpath
        else:
            # obtain store
            if isinstance(self._urlpath, str):
                # open store from url
                from fsspec import get_mapper
                store = get_mapper(self._urlpath, **self._storage_options)
            else:
                # assume store passed directly
                store = self._urlpath

            # open root group
            if self._consolidated:
                # use consolidated metadata
                root = zarr.open_consolidated(store=store, mode='r')
            else:
                root = zarr.open_group(store=store, mode='r')

        # deal with component path
        if self._component is None:
            self._grp = root
        else:
            self._grp = root[self._component]

        # use zarr attributes as metadata
        self.metadata.update(self._grp.attrs.asdict())

    # build catalog entries
    entries = {}
    for k, v in self._grp.items():
        if isinstance(v, zarr.core.Array):
            entry = LocalCatalogEntry(name=k,
                                      description='',
                                      driver='ndzarr',
                                      args=dict(urlpath=v),
                                      catalog=self)
        else:
            entry = LocalCatalogEntry(name=k,
                                      description='',
                                      driver='zarr_cat',
                                      args=dict(urlpath=v))
        entries[k] = entry
    self._entries = entries
def structure_mesh(allen_id):
    if allen_id in _cache:
        return _cache[allen_id]
    fs = HTTPFileSystem()
    # Todo: Use AWS store after Scott / Lydia upload
    store = fs.get_mapper(
        "https://thewtex.github.io/allen-ccf-itk-vtk-zarr/meshes/{0}.zarr"
        .format(allen_id))
    root = zarr.open_consolidated(store)
    mesh = zarr_to_vtkjs(root)
    _cache[allen_id] = mesh
    return mesh
def get_version(zstore, method='fsspec'):
    client = requests.session()
    baseurl = 'http://hdl.handle.net/api/handles/'
    query1 = '?type=IS_PART_OF'
    query2 = '?type=VERSION_NUMBER'

    # get the `netcdf_tracking_ids` from the zstore metadata
    if method == 'fsspec':
        mapper = fsspec.get_mapper(zstore)
    else:
        mapper = zstore
    group = zarr.open_consolidated(mapper)
    tracking_ids = group.attrs['tracking_id']

    # query the dataset handler to obtain `dataset_tracking_id` and `version`
    versions = []
    datasets = []
    for file_tracking_id in tracking_ids.split('\n')[0:1]:
        url = baseurl + file_tracking_id[4:] + query1
        r = client.get(url)
        r.raise_for_status()
        dataset_tracking_id = r.json()['values'][0]['data']['value']
        datasets += [dataset_tracking_id]

        if ';' in dataset_tracking_id:
            # multiple dataset_ids erroneously reported
            dtracks = dataset_tracking_id.split(';')
            vs = []
            for dtrack in dtracks:
                url2 = baseurl + dtrack[4:] + query2
                r = client.get(url2)
                r.raise_for_status()
                vs += [r.json()['values'][0]['data']['value']]
            v = sorted(vs)[-1]
        else:
            url2 = baseurl + dataset_tracking_id[4:] + query2
            r = client.get(url2)
            r.raise_for_status()
            v = r.json()['values'][0]['data']['value']
        versions += [v]

    version_id = list(set(versions))
    dataset_id = list(set(datasets))
    assert len(version_id) == 1
    return dataset_id[0], version_id[0]
def open_group(
    cls,
    store,
    mode="r",
    synchronizer=None,
    group=None,
    consolidated=False,
    consolidate_on_close=False,
):
    import zarr

    open_kwargs = dict(mode=mode, synchronizer=synchronizer, path=group)
    if consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)
    return cls(zarr_group, consolidate_on_close)
def open_group(
    cls,
    store,
    mode="r",
    synchronizer=None,
    group=None,
    consolidated=False,
    consolidate_on_close=False,
    chunk_store=None,
    storage_options=None,
    append_dim=None,
    write_region=None,
    safe_chunks=True,
):
    # zarr doesn't support pathlib.Path objects yet. zarr-python#601
    if isinstance(store, pathlib.Path):
        store = os.fspath(store)

    open_kwargs = dict(
        mode=mode,
        synchronizer=synchronizer,
        path=group,
    )
    if LooseVersion(zarr.__version__) >= "2.5.0":
        open_kwargs["storage_options"] = storage_options
    elif storage_options:
        raise ValueError("Storage options only compatible with zarr>=2.5.0")

    if chunk_store:
        open_kwargs["chunk_store"] = chunk_store

    if consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)
    return cls(zarr_group, consolidate_on_close, append_dim, write_region,
               safe_chunks)
def append_zarr_along_time(source_path: str, target_path: str,
                           fs: fsspec.AbstractFileSystem,
                           dim: str = "time"):
    """Append local zarr store at source_path to zarr store at target_path
    along time.

    Args:
        source_path: Local path to zarr store that represents an xarray dataset.
        target_path: Local or remote url for zarr store to be appended to.
        fs: Filesystem for target_path.
        dim: (optional) name of time dimension. Defaults to "time".

    Raises:
        ValueError: If the chunk size in time does not evenly divide the
            length of the time dimension for the zarr store at source_path.

    Warning: The zarr store at source_path will be modified in place.
    """
    merged_time = _get_merged_time_coordinate(source_path, target_path, dim, fs)
    if fs.exists(target_path):
        source_store = zarr.open(source_path, mode="r+")
        target_store = zarr.open_consolidated(fsspec.get_mapper(target_path))
        _assert_chunks_match(source_store, target_store, dim)
        _set_time_units_like(source_store, target_store)
        _shift_store(source_store, dim, _get_dim_size(target_store, dim))
    elif fs.protocol == "file":
        os.makedirs(target_path)

    upload_dir(source_path, target_path)
    _overwrite_time_array_with_single_chunk(target_path, merged_time, dim)

    _, _, absolute_target_paths = fsspec.get_fs_token_paths(target_path)
    consolidate_metadata(fs, absolute_target_paths[0])
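# Hedged usage sketch for append_zarr_along_time, assuming a local source
# store and a GCS target; the paths, bucket, and helper name are hypothetical
# placeholders, not taken from the original code.
def _example_append_along_time():
    import fsspec

    fs = fsspec.filesystem("gs")  # filesystem matching the target protocol
    append_zarr_along_time(
        source_path="/tmp/new_timesteps.zarr",        # local xarray-backed zarr
        target_path="gs://example-bucket/run.zarr",   # store being extended
        fs=fs,
        dim="time",
    )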
def update_slice(store: Union[str, MutableMapping],
                 insert_index: int,
                 dataslice: xr.Dataset,
                 mode: str,
                 dimension: str = "time") -> None:
    """
    Update existing Zarr dataset with new data slice.

    :param store: A Zarr store.
    :param insert_index: index at which to insert
    :param dataslice: slice to insert
    :param mode: Update mode, 'insert' or 'replace'
    :param dimension: name of dimension perpendicular to slice
    """
    if mode not in ('insert', 'replace'):
        raise ValueError(f'illegal mode value: {mode!r}')

    insert_mode = mode == 'insert'

    append_dim_var_names = []
    encoding = {}

    # Neither Zarr nor xarray offer an explicit API function to check whether
    # a Zarr is consolidated. Here we use the workaround of attempting to
    # open as consolidated, and catching the resulting exception if this
    # isn't possible. In the case of a consolidated Zarr, there is a slight
    # inefficiency, since the consolidated metadata object is fetched twice
    # (by Zarr and thereafter by xarray). See comments on PR #48 for
    # discussion of possible optimizations.
    consolidated = True
    try:
        _ = zarr.open_consolidated(store)
    except KeyError:
        consolidated = False

    with xr.open_zarr(store, consolidated=consolidated) as ds:
        for var_name in ds.variables:
            var = ds[var_name]
            if var.ndim >= 1 and dimension in var.dims:
                if var.dims[0] != dimension:
                    # TODO: Remove this restriction -- it's not fundamentally
                    # necessary. Removal should be accompanied by appropriate
                    # unit tests and the addition of a warning to the user
                    # about potential slowness / inefficiency.
                    raise ValueError(f"dimension '{dimension}' of variable "
                                     f"{var_name!r} must be first dimension")
                append_dim_var_names.append(var_name)
                enc = dict(ds[var_name].encoding)
                # xarray 0.17+ supports engine-preferred chunks if exposed by
                # the backend, and zarr does expose them; but writing the new
                # 'preferred_chunks' entry back to zarr raises an error saying
                # 'preferred_chunks' is an unsupported encoding.
                if 'preferred_chunks' in enc:
                    del enc['preferred_chunks']
                encoding[var_name] = enc

    temp_dir = tempfile.TemporaryDirectory(prefix='nc2zarr-slice-',
                                           suffix='.zarr')
    dataslice.to_zarr(temp_dir.name, encoding=encoding)
    slice_root_group = zarr.open(temp_dir.name, mode='r')
    slice_arrays = dict(slice_root_group.arrays())

    root_group = zarr.open(store, mode='r+')
    for var_name, var_array in root_group.arrays():
        if var_name in append_dim_var_names:
            slice_array = slice_arrays[var_name]
            if insert_mode:
                # Add one empty step
                empty = zarr.creation.empty(slice_array.shape,
                                            dtype=var_array.dtype)
                var_array.append(empty, axis=0)
                # Shift contents
                var_array[insert_index + 1:, ...] = \
                    var_array[insert_index:-1, ...]
            # Replace slice
            var_array[insert_index, ...] = slice_array[0]

    if consolidated:
        zarr.consolidate_metadata(store)
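# Minimal usage sketch for update_slice, assuming `ds_step` is a single-step
# xarray.Dataset with the same variables and chunking as the target store.
# The store path and helper name are hypothetical placeholders.
def _example_update_slice(ds_step):
    # Overwrite the slice already present at index 42 along "time".
    update_slice("/data/cube.zarr", insert_index=42, dataslice=ds_step,
                 mode="replace")
    # Or insert a new step before index 42, shifting later steps by one.
    update_slice("/data/cube.zarr", insert_index=42, dataslice=ds_step,
                 mode="insert")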
def open_group(
    cls,
    store,
    mode="r",
    synchronizer=None,
    group=None,
    consolidated=False,
    consolidate_on_close=False,
    chunk_store=None,
    storage_options=None,
    append_dim=None,
    write_region=None,
    safe_chunks=True,
    stacklevel=2,
):
    # zarr doesn't support pathlib.Path objects yet. zarr-python#601
    if isinstance(store, pathlib.Path):
        store = os.fspath(store)

    open_kwargs = dict(
        mode=mode,
        synchronizer=synchronizer,
        path=group,
    )
    if LooseVersion(zarr.__version__) >= "2.5.0":
        open_kwargs["storage_options"] = storage_options
    elif storage_options:
        raise ValueError("Storage options only compatible with zarr>=2.5.0")

    if chunk_store:
        open_kwargs["chunk_store"] = chunk_store

    if consolidated is None:
        try:
            zarr_group = zarr.open_consolidated(store, **open_kwargs)
        except KeyError:
            warnings.warn(
                "Failed to open Zarr store with consolidated metadata, "
                "falling back to try reading non-consolidated metadata. "
                "This is typically much slower for opening a dataset. "
                "To silence this warning, consider:\n"
                "1. Consolidating metadata in this existing store with "
                "zarr.consolidate_metadata().\n"
                "2. Explicitly setting consolidated=False, to avoid trying "
                "to read consolidated metadata, or\n"
                "3. Explicitly setting consolidated=True, to raise an "
                "error in this case instead of falling back to try "
                "reading non-consolidated metadata.",
                RuntimeWarning,
                stacklevel=stacklevel,
            )
            zarr_group = zarr.open_group(store, **open_kwargs)
    elif consolidated:
        # TODO: an option to pass the metadata_key keyword
        zarr_group = zarr.open_consolidated(store, **open_kwargs)
    else:
        zarr_group = zarr.open_group(store, **open_kwargs)

    return cls(
        zarr_group,
        mode,
        consolidate_on_close,
        append_dim,
        write_region,
        safe_chunks,
    )
def variation_main_pass(self):
    path = self.base_dir / 'variation/main/zarr/pass/ag1000g.phase2.ar1.pass'
    return zarr.open_consolidated(str(path))
def haplotypes_main(self):
    path = self.base_dir / 'haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes'
    return zarr.open_consolidated(str(path))
def load_cmip(
    activity_ids: str = "CMIP",
    experiment_ids: str = "historical",
    member_ids: str = "r1i1p1f1",
    source_ids: str = "MIROC6",
    table_ids: str = "day",
    grid_labels: str = "gn",
    variable_ids: List[str] = ["tasmax"],
    return_type: str = 'zarr',
) -> xr.Dataset:
    """Loads CMIP6 GCM dataset based on input criteria.

    Parameters
    ----------
    activity_ids : str or list, optional
        activity_ids in CMIP6 catalog, by default "CMIP"
    experiment_ids : str or list, optional
        experiment_ids in CMIP6 catalog, by default "historical",
        e.g. "ssp126", "ssp245", "ssp585"
    member_ids : str or list, optional
        member_ids in CMIP6 catalog, by default "r1i1p1f1"
    source_ids : str or list, optional
        source_ids in CMIP6 catalog, by default "MIROC6"
    table_ids : str or list, optional
        table_ids in CMIP6 catalog, by default "day"
    grid_labels : str or list, optional
        grid_labels in CMIP6 catalog, by default "gn"
    variable_ids : list, optional
        variable_ids in CMIP6 catalog, by default ['tasmax']
    return_type : str, optional
        'zarr' (default) to return a zarr group, 'xr' to return an
        xarray.Dataset

    Returns
    -------
    ds : xr.Dataset or zarr group
        Dataset or zarr group with CMIP data
    """
    if isinstance(variable_ids, str):
        variable_ids = [variable_ids]

    col = cat.cmip6()

    for i, var in enumerate(variable_ids):
        stores = (
            col.search(
                activity_id=activity_ids,
                experiment_id=experiment_ids,
                member_id=member_ids,
                source_id=source_ids,
                table_id=table_ids,
                grid_label=grid_labels,
                variable_id=[var],
            )
            .df['zstore']
            .to_list()
        )

        storage_options = config.get('data_catalog.era5.storage_options')
        if len(stores) > 1:
            raise ValueError('can only get 1 store at a time')

        if return_type == 'zarr':
            ds = zarr.open_consolidated(stores[0], mode='r',
                                        storage_options=storage_options)
        elif return_type == 'xr':
            ds = xr.open_zarr(stores[0], consolidated=True,
                              storage_options=storage_options)

        # flip the lats if necessary and drop the extra dims/vars like bnds
        ds = gcm_munge(ds)
        ds = lon_to_180(ds)

        # convert to mm/day - helpful to prevent rounding errors from very tiny numbers
        if var == 'pr':
            ds['pr'] *= 86400

        if i == 0:
            ds_out = ds
        else:
            ds_out[var] = ds[var]

    return ds_out
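# Hedged usage sketch for load_cmip: request daily precipitation instead of
# the default tasmax and get back an xarray.Dataset rather than a zarr group.
# The argument values are illustrative; actual availability depends on the
# CMIP6 catalog, and the helper name is a hypothetical placeholder.
def _example_load_cmip():
    ds = load_cmip(
        source_ids="MIROC6",
        experiment_ids="historical",
        variable_ids=["pr"],   # converted to mm/day inside load_cmip
        return_type="xr",      # 'zarr' would return a zarr group instead
    )
    return ds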
def test_end_to_end_file_conversion(self, _callback_post):
    """
    Full end-to-end test of the adapter from call to `main` to Harmony
    callbacks, including ensuring the contents of the file are correct.
    Mocks S3 interactions using @mock_s3.
    """
    conn = boto3.resource('s3')
    conn.create_bucket(
        Bucket='example-bucket',
        CreateBucketConfiguration={
            'LocationConstraint': os.environ['AWS_DEFAULT_REGION']
        })

    netcdf_file = create_full_dataset()
    netcdf_file2 = create_full_dataset()
    try:
        message = mock_message_for(netcdf_file, netcdf_file2)
        main(['harmony_netcdf_to_zarr', '--harmony-action', 'invoke',
              '--harmony-input', message], config=self.config)
    finally:
        os.remove(netcdf_file)
        os.remove(netcdf_file2)

    callbacks = parse_callbacks(_callback_post)

    # -- Progress and Callback Assertions --
    # Assert that we got three callbacks: one for the first file, one for the
    # second, and the final message
    self.assertEqual(len(callbacks), 3)
    self.assertEqual(callbacks[0]['progress'], '50')
    self.assertEqual(callbacks[0]['item[type]'], 'application/x-zarr')
    self.assertEqual(callbacks[1]['progress'], '100')
    self.assertEqual(callbacks[1]['item[type]'], 'application/x-zarr')
    self.assertEqual(callbacks[2], {'status': 'successful'})
    self.assertNotEqual(callbacks[0]['item[href]'], callbacks[1]['item[href]'])
    self.assertTrue(callbacks[0]['item[href]'].endswith('.zarr'))
    self.assertTrue(callbacks[1]['item[href]'].endswith('.zarr'))

    # Now calls back with spatial and temporal if present in the incoming message
    self.assertEqual(callbacks[0]['item[temporal]'],
                     '2020-01-01T00:00:00.000Z,2020-01-02T00:00:00.000Z')
    self.assertEqual(callbacks[0]['item[bbox]'], '-11.1,-22.2,33.3,44.4')

    # Open the Zarr file that the adapter called back with
    zarr_location = callbacks[0]['item[href]']
    store = s3fs.S3FileSystem().get_mapper(root=zarr_location, check=False)
    out = zarr.open_consolidated(store)

    # -- Hierarchical Structure Assertions --
    contents = textwrap.dedent("""
        /
         ├── data
         │   ├── horizontal
         │   │   ├── east (1, 3, 3) int64
         │   │   └── west (1, 3, 3) float64
         │   └── vertical
         │       ├── north (1, 3, 3) float64
         │       └── south (1, 3, 3) float64
         ├── location
         │   ├── lat (3, 3) float64
         │   └── lon (3, 3) float64
         └── time (1,) float64
        """).strip()
    self.assertEqual(str(out.tree()), contents)

    # -- Metadata Assertions --
    # Root level values
    self.assertEqual(dict(out.attrs), ROOT_METADATA_VALUES)

    # Group metadata
    self.assertEqual(out['data'].attrs['description'], 'Group to hold the data')

    # Variable metadata
    var = out['data/vertical/north']
    self.assertEqual(var.attrs['coordinates'], 'lon lat')

    # -- Data Assertions --
    # Nested Byte Arrays
    self.assertEqual(out['data/vertical/north'][0, 0, 2], 16)
    self.assertEqual(out['data/vertical/north'][0, 2, 0], 0)
    self.assertEqual(out['data/vertical/south'][0, 2, 0], 16)
    self.assertEqual(out['data/vertical/south'][0, 0, 2], 0)
    self.assertEqual(out['data/horizontal/east'][0, 2, 2], 16)  # scale_factor = 2
    self.assertEqual(out['data/horizontal/east'][0, 0, 0], 0)
    self.assertEqual(out['data/horizontal/west'][0, 0, 0], 16)
    self.assertEqual(out['data/horizontal/west'][0, 2, 2], 0)

    # 'east' attributes scale_factor removed
    self.assertFalse(hasattr(out['data/horizontal/east'], 'scale_factor'))

    # 'east' attributes present and scaled
    self.assertEqual(out['data/horizontal/east'].attrs['valid_range'], [0.0, 50.0])
    self.assertEqual(out['data/horizontal/east'].attrs['valid_min'], 0.0)
    self.assertEqual(out['data/horizontal/east'].attrs['valid_max'], 50.0)
    self.assertEqual(out['data/horizontal/east'].attrs['_FillValue'], 254.0)
    self.assertFalse(hasattr(out['data/horizontal/east'], 'missing_value'))

    # 2D Nested Float Arrays
    self.assertEqual(out['location/lat'][0, 1], 5.5)
    self.assertEqual(out['location/lon'][0, 1], -5.5)

    # 1D Root-Level Float Array sharing its name with a dimension
    self.assertEqual(out['time'][0], 166536)
def variation_main_pass_biallelic(self):
    path = self.base_dir / 'variation/main/zarr/biallelic/ag1000g.phase2.ar1.pass.biallelic'
    return zarr.open_consolidated(str(path))
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store
    in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the output Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))
    else:
        # read each contig separately, concatenate, rechunk, then save to zarr
        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(prefix="vcfzarr_to_zarr_",
                                         suffix=".zarr",
                                         dir=tempdir) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig,
                                         variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)

                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)

                zarr_files.append(str(contig_zarr_file))

            concat_zarrs_optimized(zarr_files, output, vars_to_rechunk,
                                   vars_to_copy, fix_strings=True)
def vcfzarr_to_zarr(
    input: PathType,
    output: PathType,
    *,
    contigs: Optional[List[str]] = None,
    grouped_by_contig: bool = False,
    consolidated: bool = False,
    tempdir: Optional[PathType] = None,
    concat_algorithm: Optional[Literal["xarray_internal"]] = None,
) -> None:
    """Convert VCF Zarr files created using scikit-allel to a single Zarr on-disk store
    in sgkit Xarray format.

    Parameters
    ----------
    input
        Path to the input Zarr file.
    output
        Path to the output Zarr file.
    contigs
        The contigs to convert. By default all contigs are converted.
    grouped_by_contig
        Whether there is one group for each contig in the Zarr file, by default False.
    consolidated
        Whether the Zarr file has consolidated metadata, by default False.
    tempdir
        Temporary directory where intermediate files are stored. The default None means
        use the system default temporary directory.
    concat_algorithm
        The algorithm to use to concatenate and rechunk Zarr files. The default None
        means use the optimized version suitable for large files, whereas
        ``xarray_internal`` will use built-in Xarray APIs, which can exhibit high
        memory usage, see https://github.com/dask/dask/issues/6745.
    """

    if consolidated:
        vcfzarr = zarr.open_consolidated(str(input), mode="r")
    else:
        vcfzarr = zarr.open_group(str(input), mode="r")

    if not grouped_by_contig:
        ds = _vcfzarr_to_dataset(vcfzarr)
        ds.to_zarr(str(output))
    else:
        # read each contig separately, concatenate, rechunk, then save to zarr
        contigs = contigs or list(vcfzarr.group_keys())

        # Index the contig names
        _, variant_contig_names = encode_array(contigs)
        variant_contig_names = list(variant_contig_names)

        vars_to_rechunk = []
        vars_to_copy = []

        with tempfile.TemporaryDirectory(prefix="vcfzarr_to_zarr_",
                                         suffix=".zarr",
                                         dir=tempdir) as tmpdir:
            zarr_files = []
            for i, contig in enumerate(contigs):
                # convert contig group to zarr and save in tmpdir
                ds = _vcfzarr_to_dataset(vcfzarr[contig], contig,
                                         variant_contig_names)
                if i == 0:
                    for (var, arr) in ds.data_vars.items():
                        if arr.dims[0] == "variants":
                            vars_to_rechunk.append(var)
                        else:
                            vars_to_copy.append(var)

                contig_zarr_file = Path(tmpdir) / contig
                ds.to_zarr(contig_zarr_file)

                zarr_files.append(str(contig_zarr_file))

            if concat_algorithm == "xarray_internal":
                ds = zarrs_to_dataset(zarr_files)
                ds.to_zarr(output, mode="w")
            else:
                # Use the optimized algorithm in `concatenate_and_rechunk`
                _concat_zarrs_optimized(zarr_files, output, vars_to_rechunk,
                                        vars_to_copy)
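# Usage sketch for vcfzarr_to_zarr, assuming a scikit-allel Zarr laid out with
# one group per contig and consolidated metadata. The paths, contig names, and
# helper name are hypothetical placeholders.
def _example_vcfzarr_conversion():
    vcfzarr_to_zarr(
        input="/data/ag1000g_allel.zarr",
        output="/data/ag1000g_sgkit.zarr",
        contigs=["2L", "2R"],
        grouped_by_contig=True,
        consolidated=True,        # read via zarr.open_consolidated
        concat_algorithm=None,    # use the optimized concatenation path
    )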
def xds_from_zarr(store, columns=None, chunks=None, **kwargs):
    """
    Reads the zarr data store in `store` and returns a list of Datasets
    containing the data.

    Parameters
    ----------
    store : str or Path
        Path containing the data
    columns : list of str or str or None
        Columns to read. `None` or `"ALL"` stores all columns on each dataset.
        Otherwise, a list of columns should be supplied.
    chunks : dict or list of dicts
        chunking schema for each dataset
    **kwargs : optional

    Returns
    -------
    writes : Dataset or list of Datasets
        Dataset(s) representing write operations
    """
    if isinstance(store, DaskMSStore):
        pass
    elif isinstance(store, (Path, str)):
        store = DaskMSStore(f"{store}", **kwargs.pop("storage_options", {}))
    else:
        raise TypeError(f"store '{store}' must be "
                        f"Path, str or DaskMSStore")

    # If any kwargs are added, they should be popped prior to this check.
    if len(kwargs) > 0:
        warnings.warn(
            f"The following unsupported kwargs were ignored in "
            f"xds_from_zarr: {kwargs}",
            UserWarning)

    columns = promote_columns(columns)

    if chunks is None:
        pass
    elif isinstance(chunks, (tuple, list)):
        if not all(isinstance(v, dict) for v in chunks):
            raise TypeError("chunks must be None, a dict or a list of dicts")
    elif isinstance(chunks, dict):
        chunks = [chunks]
    else:
        raise TypeError("chunks must be None, a dict or a list of dicts")

    datasets = []
    numpy_vars = []

    # NOTE(JSKenyon): Iterating over all the zarr groups/arrays is VERY
    # expensive if the metadata has not been consolidated.
    zc.consolidate_metadata(store.map)

    table_path = store.table if store.table else "MAIN"

    table_group = zarr.open_consolidated(store.map)[table_path]

    for g, (group_name, group) in enumerate(sorted(table_group.groups(),
                                                   key=group_sortkey)):
        group_attrs = decode_attr(dict(group.attrs))
        dask_ms_attrs = group_attrs.pop(DASKMS_ATTR_KEY)
        natural_chunks = dask_ms_attrs["chunks"]
        group_chunks = {d: tuple(dc) for d, dc in natural_chunks.items()}

        if chunks:
            # Defer to user-supplied chunking strategy
            try:
                group_chunks.update(chunks[g])
            except IndexError:
                group_chunks.update(chunks[-1])  # Reuse last chunking.

        data_vars = {}
        coords = {}

        for name, zarray in column_iterator(group, columns):
            attrs = decode_attr(dict(zarray.attrs[DASKMS_ATTR_KEY]))
            dims = attrs["dims"]
            coordinate = attrs.get("coordinate", False)
            array_chunks = tuple(
                group_chunks.get(d, s) for d, s in zip(dims, zarray.shape))

            array_chunks = da.core.normalize_chunks(array_chunks, zarray.shape)
            ext_args = extent_args(dims, array_chunks)
            token_name = f"read~{name}-{tokenize(zarray, *ext_args)}"

            read = da.blockwise(zarr_getter, dims,
                                zarray, None,
                                *ext_args,
                                concatenate=False,
                                name=token_name,
                                meta=np.empty((0,) * zarray.ndim, zarray.dtype))
            read = inlined_array(read, ext_args[::2])
            var = Variable(dims, read, attrs)
            (coords if coordinate else data_vars)[name] = var

            # Save numpy arrays for reification
            typ = decode_type(attrs["array_type"])

            if typ is np.ndarray:
                numpy_vars.append(var)
            elif typ is da.Array:
                pass
            else:
                raise TypeError(f"Unknown array_type '{attrs['array_type']}'")

        datasets.append(Dataset(data_vars, coords=coords, attrs=group_attrs))

    # Reify any numpy arrays directly into their variables
    for v, a in zip(numpy_vars, dask.compute(v.data for v in numpy_vars)[0]):
        v.data = a

    return datasets
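# Usage sketch for xds_from_zarr, assuming a dask-ms Zarr store previously
# written with the matching writer. The path, column names, and helper name
# are hypothetical placeholders.
def _example_xds_from_zarr():
    # Read only two columns and request row chunks of 10000 for each dataset.
    datasets = xds_from_zarr(
        "/data/example.ms.zarr",
        columns=["DATA", "FLAG"],
        chunks=[{"row": 10000}],
    )
    return datasets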