Пример #1
0
def create_zvariables(dataset):
    """Create a mapping of variable name to zarr-encoded variable.

    Parameters
    ----------
    dataset : xarray.Dataset
        Dataset whose variables should be encoded for zarr.

    Returns
    -------
    dict
        Maps each variable name in ``dataset.variables`` to the result of
        ``encode_zarr_variable`` applied to that variable.
    """
    # Dict comprehension replaces the manual accumulate-in-a-loop pattern.
    return {
        key: encode_zarr_variable(da) for key, da in dataset.variables.items()
    }
Пример #2
0
def create_zmetadata(dataset):
    """Build a consolidated zmetadata dictionary for ``dataset``."""
    # Group-level entries first: zarr format marker and dataset attributes.
    metadata = {
        group_meta_key: {'zarr_format': zarr_format},
        attrs_key: _extract_dataset_zattrs(dataset),
    }

    # Then one attrs entry and one array-metadata entry per variable.
    for name, variable in dataset.variables.items():
        encoded = encode_zarr_variable(variable)
        var_encoding = extract_zarr_variable_encoding(variable)
        metadata[f'{name}/{attrs_key}'] = _extract_dataarray_zattrs(encoded)
        metadata[f'{name}/{array_meta_key}'] = _extract_zarray(
            encoded, var_encoding, encoded.dtype)

    return {
        'zarr_consolidated_format': zarr_consolidated_format,
        'metadata': metadata,
    }
Пример #3
0
    def _get_zmetadata(self):
        """Assemble the consolidated zmetadata mapping for this object."""
        # Group-level entries: zarr format marker and group attributes.
        metadata = {
            group_meta_key: {'zarr_format': zarr_format},
            attrs_key: self._get_zattrs(),
        }

        for name, variable in self._obj.variables.items():
            # Encode the variable and cache both the encoded form and its
            # zarr encoding on the instance for later reuse.
            encoded = encode_zarr_variable(variable)
            self._variables[name] = encoded
            encoding = extract_zarr_variable_encoding(variable)
            self._encoding[name] = encoding
            metadata[f'{name}/{attrs_key}'] = _extract_zattrs(encoded)
            metadata[f'{name}/{array_meta_key}'] = extract_zarray(
                encoded, encoding, encoded.dtype)

        return {
            'zarr_consolidated_format': zarr_consolidated_format,
            'metadata': metadata,
        }
Пример #4
0
    def _get_zmetadata(self):
        """Build the consolidated zmetadata dictionary for this object."""
        metadata = {}
        metadata[group_meta_key] = {"zarr_format": zarr_format}
        metadata[attrs_key] = self._get_zattrs()

        for name, variable in self._obj.variables.items():
            # Encode each variable, remembering the encoded form and its
            # extracted zarr encoding on the instance.
            encoded = encode_zarr_variable(variable)
            self._variables[name] = encoded
            self._encoding[name] = _extract_zarr_variable_encoding(variable)
            metadata[f"{name}/{attrs_key}"] = extract_zattrs(encoded)
            # NOTE(review): the dtype is read from the *original* variable's
            # encoding dict, so a missing "dtype" key raises KeyError —
            # confirm this is the intended contract with callers.
            metadata[f"{name}/{array_meta_key}"] = extract_zarray(
                encoded, self._encoding.get(name, {}),
                variable.encoding["dtype"])

        return {
            "zarr_consolidated_format": zarr_consolidated_format,
            "metadata": metadata,
        }
Пример #5
0
def rechunk_dataset(
    source: xarray.Dataset,
    encoding: Mapping,
    max_mem,
    target_store,
    temp_store=None,
    executor: Union[str, Executor] = "dask",
):
    """Rechunk every variable of ``source`` into a target zarr group.

    Parameters
    ----------
    source : xarray.Dataset
        Dataset whose variables are to be rechunked.
    encoding : Mapping
        Per-variable encoding overrides keyed by variable name; each entry
        is merged into the variable's existing encoding before zarr encoding.
    max_mem
        Memory budget forwarded to ``_setup_array_rechunk``.
    target_store
        Store (or path) for the rechunked output group.
    temp_store : optional
        Store for intermediate data; when omitted, a throwaway local
        ``*.zarr`` temp directory is created instead.
    executor : str or Executor, default "dask"
        Execution backend; a string is resolved via ``_get_executor``.

    Returns
    -------
    Rechunked
        Wraps the executor, the prepared plan, the source dataset, and the
        temp and target zarr groups.
    """
    def _encode_zarr_attributes(attrs):
        # Make every attribute value zarr-serializable.
        return {k: encode_zarr_attr_value(v) for k, v in attrs.items()}

    if isinstance(executor, str):
        executor = _get_executor(executor)
    if temp_store:
        temp_group = zarr.group(temp_store)
    else:
        # No temp store supplied: use a fresh local temp directory.
        temp_group = zarr.group(
            tempfile.mkdtemp(".zarr", "temp_store_")
        )  # pragma: no cover
    target_group = zarr.group(target_store)
    target_group.attrs.update(_encode_zarr_attributes(source.attrs))

    copy_specs = []
    for variable in source:
        # Copy so encoding updates don't mutate the caller's dataset.
        array = source[variable].copy()

        # Update the array encoding with provided parameters and apply it
        has_chunk_encoding = "chunks" in array.encoding
        array.encoding.update(encoding.get(variable, {}))
        array = encode_zarr_variable(array)

        # Determine target chunking for array and remove it prior to
        # validation/extraction ONLY if the array isn't also coming
        # from a Zarr store (otherwise blocks need to be checked for overlap)
        target_chunks = array.encoding.get("chunks")
        if not has_chunk_encoding:
            array.encoding.pop("chunks", None)
        array_encoding = extract_zarr_variable_encoding(
            array, raise_on_invalid=True, name=variable
        )

        # Default to chunking based on array shape if not explicitly provided
        default_chunks = array_encoding.pop("chunks")
        target_chunks = target_chunks or default_chunks

        # Extract array attributes along with reserved property for
        # xarray dimension names
        array_attrs = _encode_zarr_attributes(array.attrs)
        array_attrs[DIMENSION_KEY] = encode_zarr_attr_value(array.dims)

        copy_spec = _setup_array_rechunk(
            dask.array.asarray(array),
            target_chunks,
            max_mem,
            target_group,
            target_options=array_encoding,
            temp_store_or_group=temp_group,
            temp_options=array_encoding,
            name=variable,
        )
        # Re-attach the encoded attrs (incl. dimension names) to the output.
        copy_spec.write.array.attrs.update(array_attrs)  # type: ignore
        copy_specs.append(copy_spec)
    plan = executor.prepare_plan(copy_specs)
    return Rechunked(executor, plan, source, temp_group, target_group)
Пример #6
0
def _setup_rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    target_options=None,
    temp_store=None,
    temp_options=None,
):
    """Prepare copy specs for rechunking ``source`` into ``target_store``.

    Dispatches on the type of ``source``:

    * ``xarray.Dataset`` — encodes coordinates/variables and builds one
      copy spec per variable; ``target_chunks`` must be a dict.
    * ``zarr.hierarchy.Group`` — builds one copy spec per entry in
      ``target_chunks`` (which must be a dict of array name -> chunks).
    * ``zarr.core.Array`` / ``dask.array.Array`` — builds a single copy
      spec for the array itself.

    Returns
    -------
    tuple
        ``(copy_specs, temp, target)`` where ``temp``/``target`` are zarr
        groups for the Dataset/Group cases, or the intermediate/target
        arrays for the single-array case.

    Raises
    ------
    ValueError
        If ``target_chunks`` is not a dict where one is required, if a
        Dataset variable's options contain "chunks", or if ``source`` is
        an unsupported type.
    """
    # temp options default to the target options when not given separately.
    if temp_options is None:
        temp_options = target_options
    target_options = target_options or {}
    temp_options = temp_options or {}

    if isinstance(source, xarray.Dataset):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a dataset."
            )

        variables, attrs = encode_dataset_coordinates(source)
        attrs = _encode_zarr_attributes(attrs)

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(attrs)

        copy_specs = []
        for name, variable in variables.items():
            # This isn't strictly necessary because a shallow copy
            # also occurs in `encode_dataset_coordinates` but do it
            # anyways in case the coord encoding function changes
            variable = variable.copy()

            # Update the array encoding with provided options and apply it;
            # note that at this point the `options` may contain any valid property
            # applicable for the `encoding` parameter in Dataset.to_zarr other than "chunks"
            options = target_options.get(name, {})
            if "chunks" in options:
                raise ValueError(
                    f"Chunks must be provided in ``target_chunks`` rather than options (variable={name})"
                )
            variable.encoding.update(options)
            variable = encode_zarr_variable(variable)

            # Extract the array encoding to get a default chunking, a step
            # which will also ensure that the target chunking is compatible
            # with the current chunking (only necessary for on-disk arrays)
            variable_encoding = extract_zarr_variable_encoding(
                variable, raise_on_invalid=False, name=name)
            variable_chunks = target_chunks.get(name,
                                                variable_encoding["chunks"])

            # Restrict options to only those that are specific to zarr and
            # not managed internally
            options = {k: v for k, v in options.items() if k in ZARR_OPTIONS}
            _validate_options(options)

            # Extract array attributes along with reserved property for
            # xarray dimension names
            variable_attrs = _encode_zarr_attributes(variable.attrs)
            variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(
                variable.dims)

            copy_spec = _setup_array_rechunk(
                dask.array.asarray(variable),
                variable_chunks,
                max_mem,
                target_group,
                target_options=options,
                temp_store_or_group=temp_group,
                temp_options=options,
                name=name,
            )
            # Re-attach encoded attrs (incl. dimension names) to the output.
            copy_spec.write.array.attrs.update(variable_attrs)  # type: ignore
            copy_specs.append(copy_spec)

        return copy_specs, temp_group, target_group

    elif isinstance(source, zarr.hierarchy.Group):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a group."
            )

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(source.attrs)

        copy_specs = []
        # Only arrays named in target_chunks are rechunked.
        for array_name, array_target_chunks in target_chunks.items():
            copy_spec = _setup_array_rechunk(
                source[array_name],
                array_target_chunks,
                max_mem,
                target_group,
                target_options=target_options.get(array_name),
                temp_store_or_group=temp_group,
                temp_options=temp_options.get(array_name),
                name=array_name,
            )
            copy_specs.append(copy_spec)

        return copy_specs, temp_group, target_group

    elif isinstance(source, (zarr.core.Array, dask.array.Array)):

        # Single-array case: options/stores are passed through unkeyed.
        copy_spec = _setup_array_rechunk(
            source,
            target_chunks,
            max_mem,
            target_store,
            target_options=target_options,
            temp_store_or_group=temp_store,
            temp_options=temp_options,
        )
        intermediate = copy_spec.intermediate.array
        target = copy_spec.write.array
        return [copy_spec], intermediate, target

    else:
        raise ValueError(
            f"Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset (not {type(source)})."
        )