def create_zmetadata(dataset):
    """Build a consolidated ``.zmetadata`` dictionary for *dataset*.

    The result mirrors Zarr's consolidated-metadata layout: a top-level
    ``zarr_consolidated_format`` version plus a ``metadata`` mapping that
    holds the group metadata, the group attributes, and, per variable,
    its ``.zattrs`` and ``.zarray`` entries.
    """
    metadata = {
        group_meta_key: {'zarr_format': zarr_format},
        attrs_key: _extract_dataset_zattrs(dataset),
    }

    for name, var in dataset.variables.items():
        # Encode through xarray's zarr machinery before extracting
        # attributes and array metadata, so dtypes/fills match on-disk form.
        encoded = encode_zarr_variable(var)
        var_encoding = extract_zarr_variable_encoding(var)
        metadata[f'{name}/{attrs_key}'] = _extract_dataarray_zattrs(encoded)
        metadata[f'{name}/{array_meta_key}'] = _extract_zarray(
            encoded, var_encoding, encoded.dtype
        )

    return {
        'zarr_consolidated_format': zarr_consolidated_format,
        'metadata': metadata,
    }
def _get_zmetadata(self):
    """Build a consolidated zmetadata dictionary for ``self._obj``.

    Also populates ``self._variables`` with the encoded variables and
    ``self._encoding`` with each variable's extracted zarr encoding as a
    side effect, so later accesses reuse the encoded forms.
    """
    metadata = {
        group_meta_key: {'zarr_format': zarr_format},
        attrs_key: self._get_zattrs(),
    }

    for name, da in self._obj.variables.items():
        # Encode the variable once and cache both it and its encoding.
        encoded = encode_zarr_variable(da)
        self._variables[name] = encoded
        self._encoding[name] = extract_zarr_variable_encoding(da)
        metadata[f'{name}/{attrs_key}'] = _extract_zattrs(encoded)
        metadata[f'{name}/{array_meta_key}'] = extract_zarray(
            encoded, self._encoding[name], encoded.dtype
        )

    return {
        'zarr_consolidated_format': zarr_consolidated_format,
        'metadata': metadata,
    }
def rechunk_dataset(
    source: xarray.Dataset,
    encoding: Mapping,
    max_mem,
    target_store,
    temp_store=None,
    executor: Union[str, Executor] = "dask",
):
    """Rechunk every variable of an xarray Dataset into a target Zarr store.

    Parameters
    ----------
    source : xarray.Dataset
        Dataset whose variables are rechunked.
    encoding : Mapping
        Per-variable encoding overrides keyed by variable name; merged into
        each variable's existing encoding before zarr encoding is applied.
    max_mem
        Memory budget forwarded to ``_setup_array_rechunk``.
    target_store
        Zarr store (or path) receiving the rechunked output.
    temp_store : optional
        Zarr store for intermediates; a fresh temporary directory is used
        when omitted.
    executor : str or Executor, default "dask"
        Execution backend; resolved via ``_get_executor`` when given as a
        string.

    Returns
    -------
    Rechunked
        Wraps the executor, prepared plan, source dataset and both groups.
    """

    def _encode_zarr_attributes(attrs):
        # Zarr attributes must be JSON-serializable; encode each value.
        return {k: encode_zarr_attr_value(v) for k, v in attrs.items()}

    if isinstance(executor, str):
        executor = _get_executor(executor)

    if temp_store:
        temp_group = zarr.group(temp_store)
    else:
        # No temp store supplied: stage intermediates in a throwaway dir.
        temp_group = zarr.group(
            tempfile.mkdtemp(".zarr", "temp_store_")
        )  # pragma: no cover
    target_group = zarr.group(target_store)
    target_group.attrs.update(_encode_zarr_attributes(source.attrs))

    copy_specs = []
    for variable in source:
        array = source[variable].copy()

        # Update the array encoding with provided parameters and apply it
        has_chunk_encoding = "chunks" in array.encoding
        array.encoding.update(encoding.get(variable, {}))
        array = encode_zarr_variable(array)

        # Determine target chunking for array and remove it prior to
        # validation/extraction ONLY if the array isn't also coming
        # from a Zarr store (otherwise blocks need to be checked for overlap)
        target_chunks = array.encoding.get("chunks")
        if not has_chunk_encoding:
            array.encoding.pop("chunks", None)

        array_encoding = extract_zarr_variable_encoding(
            array, raise_on_invalid=True, name=variable
        )

        # Default to chunking based on array shape if not explicitly provided
        default_chunks = array_encoding.pop("chunks")
        target_chunks = target_chunks or default_chunks

        # Extract array attributes along with reserved property for
        # xarray dimension names
        array_attrs = _encode_zarr_attributes(array.attrs)
        array_attrs[DIMENSION_KEY] = encode_zarr_attr_value(array.dims)

        copy_spec = _setup_array_rechunk(
            dask.array.asarray(array),
            target_chunks,
            max_mem,
            target_group,
            target_options=array_encoding,
            temp_store_or_group=temp_group,
            temp_options=array_encoding,
            name=variable,
        )
        copy_spec.write.array.attrs.update(array_attrs)  # type: ignore
        copy_specs.append(copy_spec)

    plan = executor.prepare_plan(copy_specs)
    return Rechunked(executor, plan, source, temp_group, target_group)
def _setup_rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    target_options=None,
    temp_store=None,
    temp_options=None,
):
    """Build rechunk copy specs, dispatching on the type of *source*.

    Supports four source kinds: ``xarray.Dataset``, ``zarr.hierarchy.Group``,
    ``zarr.core.Array`` and ``dask.array.Array``.

    Returns
    -------
    tuple
        ``(copy_specs, intermediate, target)`` — for Dataset/Group sources
        the last two are the temp and target zarr groups; for single-array
        sources they are the intermediate and written arrays.

    Raises
    ------
    ValueError
        If ``target_chunks`` is not a dict for Dataset/Group sources, if
        per-variable options contain ``chunks``, or if *source* is of an
        unsupported type.
    """
    # temp options default to the target options (before the or-{} below,
    # so an explicit empty dict for target still yields empty temp options).
    if temp_options is None:
        temp_options = target_options
    target_options = target_options or {}
    temp_options = temp_options or {}

    if isinstance(source, xarray.Dataset):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a dataset."
            )

        variables, attrs = encode_dataset_coordinates(source)
        attrs = _encode_zarr_attributes(attrs)

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(attrs)

        copy_specs = []
        for name, variable in variables.items():
            # This isn't strictly necessary because a shallow copy
            # also occurs in `encode_dataset_coordinates` but do it
            # anyways in case the coord encoding function changes
            variable = variable.copy()

            # Update the array encoding with provided options and apply it;
            # note that at this point the `options` may contain any valid property
            # applicable for the `encoding` parameter in Dataset.to_zarr other than "chunks"
            options = target_options.get(name, {})
            if "chunks" in options:
                raise ValueError(
                    f"Chunks must be provided in ``target_chunks`` rather than options (variable={name})"
                )
            variable.encoding.update(options)
            variable = encode_zarr_variable(variable)

            # Extract the array encoding to get a default chunking, a step
            # which will also ensure that the target chunking is compatible
            # with the current chunking (only necessary for on-disk arrays)
            variable_encoding = extract_zarr_variable_encoding(
                variable, raise_on_invalid=False, name=name)
            variable_chunks = target_chunks.get(name, variable_encoding["chunks"])

            # Restrict options to only those that are specific to zarr and
            # not managed internally
            options = {k: v for k, v in options.items() if k in ZARR_OPTIONS}
            _validate_options(options)

            # Extract array attributes along with reserved property for
            # xarray dimension names
            variable_attrs = _encode_zarr_attributes(variable.attrs)
            variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(
                variable.dims)

            copy_spec = _setup_array_rechunk(
                dask.array.asarray(variable),
                variable_chunks,
                max_mem,
                target_group,
                target_options=options,
                temp_store_or_group=temp_group,
                temp_options=options,
                name=name,
            )
            copy_spec.write.array.attrs.update(variable_attrs)  # type: ignore
            copy_specs.append(copy_spec)
        return copy_specs, temp_group, target_group

    elif isinstance(source, zarr.hierarchy.Group):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a group."
            )

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(source.attrs)

        copy_specs = []
        # Only arrays named in target_chunks are rechunked; per-array
        # options are looked up from the (possibly empty) option dicts.
        for array_name, array_target_chunks in target_chunks.items():
            copy_spec = _setup_array_rechunk(
                source[array_name],
                array_target_chunks,
                max_mem,
                target_group,
                target_options=target_options.get(array_name),
                temp_store_or_group=temp_group,
                temp_options=temp_options.get(array_name),
                name=array_name,
            )
            copy_specs.append(copy_spec)
        return copy_specs, temp_group, target_group

    elif isinstance(source, (zarr.core.Array, dask.array.Array)):
        # Single-array case: options apply directly (not keyed by name).
        copy_spec = _setup_array_rechunk(
            source,
            target_chunks,
            max_mem,
            target_store,
            target_options=target_options,
            temp_store_or_group=temp_store,
            temp_options=temp_options,
        )
        intermediate = copy_spec.intermediate.array
        target = copy_spec.write.array
        return [copy_spec], intermediate, target

    else:
        raise ValueError(
            f"Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset (not {type(source)})."
        )