Exemplo n.º 1
0
def _extract_dataset_zattrs(dataset: xr.Dataset):
    """Build the ``.zattrs`` dictionary from a Dataset's global attributes.

    Every attribute value is passed through ``encode_zarr_attr_value``. The
    entry keyed by ``DATASET_ID_ATTR_KEY`` (an xpublish internal attribute)
    is removed from the result if present.
    """
    encoded = {
        name: encode_zarr_attr_value(value)
        for name, value in dataset.attrs.items()
    }

    # Drop the xpublish internal attribute; a missing key is not an error.
    encoded.pop(DATASET_ID_ATTR_KEY, None)
    return encoded
Exemplo n.º 2
0
def _extract_dataarray_zattrs(da):
    """Build the ``.zattrs`` dictionary for a single DataArray.

    The result holds every entry of ``da.attrs`` encoded with
    ``encode_zarr_attr_value``, plus the dimension names stored as a list
    under ``DIMENSION_KEY``. Any ``_FillValue`` entry is removed.
    """
    encoded = {
        name: encode_zarr_attr_value(value)
        for name, value in da.attrs.items()
    }
    encoded[DIMENSION_KEY] = list(da.dims)

    # `_FillValue` does not belong in `.zattrs`; it is meant for the
    # `fill_value` section of `.zarray` instead.
    encoded.pop('_FillValue', None)
    return encoded
Exemplo n.º 3
0
 def _encode_zarr_attributes(attrs):
     """Return a new dict with each value of ``attrs`` Zarr-encoded.

     Keys are kept as they are; values go through ``encode_zarr_attr_value``.
     """
     encoded = {}
     for key, value in attrs.items():
         encoded[key] = encode_zarr_attr_value(value)
     return encoded
Exemplo n.º 4
0
def rechunk_dataset(
    source: xarray.Dataset,
    encoding: Mapping,
    max_mem,
    target_store,
    temp_store=None,
    executor: Union[str, Executor] = "dask",
):
    """Set up a rechunking plan that copies a Dataset into a Zarr group.

    The dataset-level attributes are written to the target group right away.
    For every variable obtained by iterating ``source`` a copy spec is built
    with ``_setup_array_rechunk`` and the variable's encoded attributes
    (including the dimension names under ``DIMENSION_KEY``) are attached to
    the array that spec writes to.

    Parameters
    ----------
    source
        Dataset whose variables are rechunked.
    encoding
        Mapping from variable name to encoding entries that are merged into
        that variable's ``encoding`` before it is encoded. A ``"chunks"``
        entry, if present after merging, is used as the target chunking.
    max_mem
        Forwarded unchanged to ``_setup_array_rechunk``.
    target_store
        Passed to ``zarr.group`` to open/create the target group.
    temp_store
        Passed to ``zarr.group`` to open/create the temporary group. When
        falsy, a directory created with ``tempfile.mkdtemp`` is used instead.
    executor
        Either an executor object or a string that is resolved through
        ``_get_executor``.

    Returns
    -------
    Rechunked
        Constructed from the executor, the plan returned by
        ``executor.prepare_plan``, the source dataset, the temp group and
        the target group.
    """
    def _encode_zarr_attributes(attrs):
        # Encode every attribute value so it can be stored in Zarr attrs
        return {k: encode_zarr_attr_value(v) for k, v in attrs.items()}

    # Resolve an executor given by name into an executor object
    if isinstance(executor, str):
        executor = _get_executor(executor)
    if temp_store:
        temp_group = zarr.group(temp_store)
    else:
        # No temp store given: back the temp group with a fresh directory
        # named "temp_store_<random>.zarr"
        temp_group = zarr.group(
            tempfile.mkdtemp(".zarr", "temp_store_")
        )  # pragma: no cover
    target_group = zarr.group(target_store)
    target_group.attrs.update(_encode_zarr_attributes(source.attrs))

    copy_specs = []
    for variable in source:
        # Work on a copy so the encoding changes below are made on `array`
        # rather than on the object held by `source`
        array = source[variable].copy()

        # Update the array encoding with provided parameters and apply it
        # (`has_chunk_encoding` must be captured BEFORE the update so it
        # reflects only the encoding the variable arrived with)
        has_chunk_encoding = "chunks" in array.encoding
        array.encoding.update(encoding.get(variable, {}))
        array = encode_zarr_variable(array)

        # Determine target chunking for array and remove it prior to
        # validation/extraction ONLY if the array isn't also coming
        # from a Zarr store (otherwise blocks need to be checked for overlap)
        target_chunks = array.encoding.get("chunks")
        if not has_chunk_encoding:
            array.encoding.pop("chunks", None)
        array_encoding = extract_zarr_variable_encoding(
            array, raise_on_invalid=True, name=variable
        )

        # Default to chunking based on array shape if not explicitly provided
        # ("chunks" is popped so it is not passed on inside `array_encoding`)
        default_chunks = array_encoding.pop("chunks")
        target_chunks = target_chunks or default_chunks

        # Extract array attributes along with reserved property for
        # xarray dimension names
        array_attrs = _encode_zarr_attributes(array.attrs)
        array_attrs[DIMENSION_KEY] = encode_zarr_attr_value(array.dims)

        # The same extracted encoding is used as options for both the
        # target array and the temporary array
        copy_spec = _setup_array_rechunk(
            dask.array.asarray(array),
            target_chunks,
            max_mem,
            target_group,
            target_options=array_encoding,
            temp_store_or_group=temp_group,
            temp_options=array_encoding,
            name=variable,
        )
        # Attach the encoded attributes to the array the spec writes to
        copy_spec.write.array.attrs.update(array_attrs)  # type: ignore
        copy_specs.append(copy_spec)
    plan = executor.prepare_plan(copy_specs)
    return Rechunked(executor, plan, source, temp_group, target_group)
Exemplo n.º 5
0
def _setup_rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    target_options=None,
    temp_store=None,
    temp_options=None,
):
    """Build the copy specs needed to rechunk ``source``.

    The work is dispatched on the type of ``source``:

    * ``xarray.Dataset`` -- one copy spec per variable returned by
      ``encode_dataset_coordinates``; dataset and variable attributes are
      encoded and written to the target group / target arrays.
    * ``zarr.hierarchy.Group`` -- one copy spec per entry of
      ``target_chunks``; the group attributes are copied to the target group.
    * ``zarr.core.Array`` or ``dask.array.Array`` -- a single copy spec.

    Parameters
    ----------
    source
        The object to rechunk (see the supported types above).
    target_chunks
        Must be a dict for Dataset and Group sources (keyed by variable /
        array name). For array sources it is forwarded as is to
        ``_setup_array_rechunk``.
    max_mem
        Forwarded unchanged to ``_setup_array_rechunk``.
    target_store
        Passed to ``zarr.group`` for Dataset and Group sources; forwarded
        directly to ``_setup_array_rechunk`` for array sources.
    target_options
        Options for the target; ``None`` is treated as an empty dict. For
        Dataset and Group sources it is looked up per variable / array name.
    temp_store
        Passed to ``zarr.group`` for Dataset and Group sources when truthy;
        forwarded directly to ``_setup_array_rechunk`` for array sources.
    temp_options
        Options for the temporary store; defaults to ``target_options`` when
        ``None``.

    Returns
    -------
    tuple
        ``(copy_specs, temp_group, target_group)`` for Dataset and Group
        sources, where ``temp_group`` is ``None`` if no ``temp_store`` was
        given; ``([copy_spec], intermediate_array, target_array)`` for array
        sources.

    Raises
    ------
    ValueError
        If ``target_chunks`` is not a dict for a Dataset or Group source, if
        a Dataset variable's options contain ``"chunks"``, or if ``source``
        is of an unsupported type.
    """
    # Temp options fall back to the target options; afterwards both are
    # normalized so that `None` becomes an empty dict
    if temp_options is None:
        temp_options = target_options
    target_options = target_options or {}
    temp_options = temp_options or {}

    if isinstance(source, xarray.Dataset):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a dataset."
            )

        variables, attrs = encode_dataset_coordinates(source)
        attrs = _encode_zarr_attributes(attrs)

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        target_group.attrs.update(attrs)

        copy_specs = []
        for name, variable in variables.items():
            # This isn't strictly necessary because a shallow copy
            # also occurs in `encode_dataset_coordinates` but do it
            # anyways in case the coord encoding function changes
            variable = variable.copy()

            # Update the array encoding with provided options and apply it;
            # note that at this point the `options` may contain any valid property
            # applicable for the `encoding` parameter in Dataset.to_zarr other than "chunks"
            options = target_options.get(name, {})
            if "chunks" in options:
                raise ValueError(
                    f"Chunks must be provided in ``target_chunks`` rather than options (variable={name})"
                )
            variable.encoding.update(options)
            variable = encode_zarr_variable(variable)

            # Extract the array encoding to get a default chunking, a step
            # which will also ensure that the target chunking is compatible
            # with the current chunking (only necessary for on-disk arrays)
            variable_encoding = extract_zarr_variable_encoding(
                variable, raise_on_invalid=False, name=name)
            # An explicit entry in `target_chunks` wins over the extracted default
            variable_chunks = target_chunks.get(name,
                                                variable_encoding["chunks"])

            # Restrict options to only those that are specific to zarr and
            # not managed internally
            options = {k: v for k, v in options.items() if k in ZARR_OPTIONS}
            _validate_options(options)

            # Extract array attributes along with reserved property for
            # xarray dimension names
            variable_attrs = _encode_zarr_attributes(variable.attrs)
            variable_attrs[DIMENSION_KEY] = encode_zarr_attr_value(
                variable.dims)

            # NOTE(review): the filtered target `options` are passed as both
            # target and temp options here; `temp_options` is not read in
            # this branch -- confirm this is intended
            copy_spec = _setup_array_rechunk(
                dask.array.asarray(variable),
                variable_chunks,
                max_mem,
                target_group,
                target_options=options,
                temp_store_or_group=temp_group,
                temp_options=options,
                name=name,
            )
            # Attach the encoded attributes to the array the spec writes to
            copy_spec.write.array.attrs.update(variable_attrs)  # type: ignore
            copy_specs.append(copy_spec)

        return copy_specs, temp_group, target_group

    elif isinstance(source, zarr.hierarchy.Group):
        if not isinstance(target_chunks, dict):
            raise ValueError(
                "You must specify ``target-chunks`` as a dict when rechunking a group."
            )

        if temp_store:
            temp_group = zarr.group(temp_store)
        else:
            temp_group = None
        target_group = zarr.group(target_store)
        # Group attributes are copied over without any additional encoding
        target_group.attrs.update(source.attrs)

        copy_specs = []
        # Only the arrays named in `target_chunks` are set up for rechunking
        for array_name, array_target_chunks in target_chunks.items():
            copy_spec = _setup_array_rechunk(
                source[array_name],
                array_target_chunks,
                max_mem,
                target_group,
                target_options=target_options.get(array_name),
                temp_store_or_group=temp_group,
                temp_options=temp_options.get(array_name),
                name=array_name,
            )
            copy_specs.append(copy_spec)

        return copy_specs, temp_group, target_group

    elif isinstance(source, (zarr.core.Array, dask.array.Array)):

        # Single array: the stores are forwarded as given (no group is
        # opened here) and the options apply to the array directly
        copy_spec = _setup_array_rechunk(
            source,
            target_chunks,
            max_mem,
            target_store,
            target_options=target_options,
            temp_store_or_group=temp_store,
            temp_options=temp_options,
        )
        intermediate = copy_spec.intermediate.array
        target = copy_spec.write.array
        return [copy_spec], intermediate, target

    else:
        raise ValueError(
            f"Source must be a Zarr Array, Zarr Group, Dask Array or Xarray Dataset (not {type(source)})."
        )
Exemplo n.º 6
0
 def _get_zattrs(self):
     """Return the attrs of the wrapped object as a zattrs dictionary.

     Each value of ``self._obj.attrs`` is encoded with
     ``encode_zarr_attr_value``; keys are kept unchanged.
     """
     return {
         name: encode_zarr_attr_value(value)
         for name, value in self._obj.attrs.items()
     }
Exemplo n.º 7
0
def _extract_dataset_zattrs(dataset: xr.Dataset):
    """Build the ``.zattrs`` dictionary from a Dataset's global attributes.

    Each value of ``dataset.attrs`` is encoded with
    ``encode_zarr_attr_value``; keys are kept unchanged.
    """
    return {
        name: encode_zarr_attr_value(value)
        for name, value in dataset.attrs.items()
    }