Пример #1
0
def chunk(cube, output, format=None, params=None, chunks=None):
    """
    (Re-)chunk xcube dataset.
    Applies the external chunking given by CHUNKS to all variables of CUBE
    and writes the re-chunked result to OUTPUT.
    """
    # NOTE(review): the docstring above presumably doubles as CLI help text,
    # so implementation notes are kept in comments instead.

    # Parse the requested chunk sizes and reject anything that is not a
    # positive integer before any dataset is opened.
    chunk_sizes = None
    if chunks:
        chunk_sizes = parse_cli_kwargs(chunks, metavar="CHUNKS")
        sizes_ok = all(isinstance(size, int) and size > 0
                       for size in chunk_sizes.values())
        if not sizes_ok:
            raise click.ClickException("Invalid value for CHUNKS, "
                                       f"chunk sizes must be positive integers: {chunks}")

    # Extra keyword arguments forwarded verbatim to the dataset writer.
    write_kwargs = parse_cli_kwargs(params, metavar="PARAMS") if params else dict()

    from xcube.util.dsio import guess_dataset_format
    # An explicitly given format wins; otherwise it is derived from OUTPUT.
    format_name = format or guess_dataset_format(output)

    from xcube.api import open_dataset, chunk_dataset, write_dataset

    with open_dataset(input_path=cube) as ds:
        # Every requested chunk key must name a dimension of the opened dataset.
        for dim_name in chunk_sizes or ():
            if dim_name not in ds.dims:
                raise click.ClickException("Invalid value for CHUNKS, "
                                           f"{dim_name!r} is not the name of any dimension: {chunks}")

        rechunked = chunk_dataset(ds, chunk_sizes=chunk_sizes, format_name=format_name)
        write_dataset(rechunked, output_path=output, format_name=format_name, **write_kwargs)
Пример #2
0
def _prune(input_path: str = None, dry_run: bool = False, monitor=None):
    """
    Delete the block files of all empty chunks of a cube stored in Zarr format.

    :param input_path: Path of the cube. Its format is guessed from the path
        and must be Zarr.
    :param dry_run: Forwarded to ``_delete_block_file()``; presumably no file
        is actually removed when set (confirm against that helper).
    :param monitor: Optional callable that receives progress messages as a
        single string argument. If omitted, messages are discarded.
    :raise click.ClickException: if the format guessed from *input_path*
        is not Zarr.
    """
    from xcube.api import open_cube
    from xcube.util.dsio import guess_dataset_format

    if monitor is None:
        # The parameter defaults to None, but it is called unconditionally
        # below; fall back to a no-op so the default does not raise TypeError.
        def monitor(*_args, **_kwargs):
            pass

    input_format = guess_dataset_format(input_path)
    if input_format != FORMAT_NAME_ZARR:
        raise click.ClickException("input must be a cube in ZARR format")

    monitor(f'Opening cube from {input_path!r}...')
    with open_cube(input_path) as cube:
        monitor('Identifying empty blocks...')
        empty_chunks = get_empty_dataset_chunks(cube)

    # Deletion happens after the cube has been closed again.
    num_deleted = 0
    for var_name, chunk_indices in empty_chunks.items():
        monitor(
            f'Deleting {len(chunk_indices)} empty block file(s) for variable {var_name!r}...'
        )
        for chunk_index in chunk_indices:
            # Only count deletions that the helper reports as successful.
            if _delete_block_file(input_path, var_name, chunk_index, dry_run, monitor):
                num_deleted += 1

    monitor(f'Done, {num_deleted} block file(s) deleted.')
Пример #3
0
def read_dataset(input_path: str,
                 format_name: str = None,
                 is_cube: bool = False,
                 **kwargs) -> xr.Dataset:
    """
    Read a dataset from *input_path*.
    If *format_name* is not provided it will be guessed from *input_path*.

    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param is_cube: Whether the dataset read from *input_path* is checked
        with ``assert_cube()`` for being a data cube.
    :param kwargs: format-specific keyword arguments passed on to the reader
    :return: dataset object
    :raise ValueError: if no format can be determined or no reader is
        registered for the format
    """
    # An explicit format takes precedence over guessing from the path.
    effective_format = format_name or guess_dataset_format(input_path)
    if effective_format is None:
        raise ValueError("Unknown input format")

    # Look up an I/O implementation that supports reading this format.
    reader = find_dataset_io(effective_format, modes=["r"])
    if reader is None:
        raise ValueError(
            f"Unknown input format {effective_format!r} for {input_path}")

    dataset = reader.read(input_path, **kwargs)
    if is_cube:
        assert_cube(dataset)
    return dataset
Пример #4
0
def write_dataset(dataset: xr.Dataset,
                  output_path: str,
                  format_name: str = None,
                  **kwargs) -> xr.Dataset:
    """
    Write *dataset* to *output_path*.
    If *format_name* is not provided it will be guessed from *output_path*.

    :param dataset: Dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments passed on to the writer
    :return: the input dataset
    :raise ValueError: if no format can be determined or no writer is
        registered for the format
    """
    # An explicit format takes precedence over guessing from the path.
    effective_format = format_name or guess_dataset_format(output_path)
    if effective_format is None:
        raise ValueError("Unknown output format")

    # Look up an I/O implementation that supports writing this format.
    writer = find_dataset_io(effective_format, modes=["w"])
    if writer is None:
        raise ValueError(
            f"Unknown output format {effective_format!r} for {output_path}")

    writer.write(dataset, output_path, **kwargs)
    # Hand the unchanged input back to the caller.
    return dataset
Пример #5
0
def vars2dim(cube, var_name, dim_name, output=None, format=None):
    """
    Convert cube variables into new dimension.
    Moves all variables of <cube> into a single new variable <var-name>
    with a new dimension <dim-name> and writes the results to <output>.
    """
    # NOTE(review): the docstring above presumably doubles as CLI help text,
    # so implementation notes are kept in comments instead.

    import os
    from xcube.util.dsio import guess_dataset_format
    from xcube.api import open_dataset, vars_to_dim, write_dataset

    if not output:
        # Default output: next to the input, same extension,
        # with "-vars2dim" appended to the file name stem.
        parent_dir, file_name = os.path.split(cube)
        stem, extension = os.path.splitext(file_name)
        output = os.path.join(parent_dir, stem + '-vars2dim' + extension)

    # An explicitly given format wins; otherwise it is derived from the output path.
    format_name = format or guess_dataset_format(output)

    with open_dataset(input_path=cube) as ds:
        write_dataset(vars_to_dim(ds, dim_name=dim_name, var_name=var_name),
                      output_path=output,
                      format_name=format_name)
Пример #6
0
def _resample_in_time(input_path: str = None,
                      variables: Sequence[str] = None,
                      metadata: Dict[str, Any] = None,
                      output_path: str = DEFAULT_OUTPUT_PATH,
                      output_format: str = None,
                      methods: Sequence[str] = (DEFAULT_RESAMPLING_METHOD, ),
                      frequency: str = DEFAULT_RESAMPLING_FREQUENCY,
                      offset: str = None,
                      base: str = DEFAULT_RESAMPLING_BASE,
                      interp_kind: str = DEFAULT_INTERPOLATION_KIND,
                      tolerance: str = None,
                      dry_run: bool = False,
                      monitor=None):
    """
    Open a cube, resample it in time and write the result to *output_path*.

    :param input_path: Path of the cube to open.
    :param variables: Passed to ``resample_in_time()`` as ``var_names``.
    :param metadata: Passed to ``resample_in_time()`` as ``metadata``.
    :param output_path: Path the resampled cube is written to.
    :param output_format: Output format; guessed from *output_path* if not given.
    :param methods: Passed to ``resample_in_time()`` as ``method``.
    :param frequency: Passed to ``resample_in_time()`` as ``frequency``.
    :param offset: Passed to ``resample_in_time()`` as ``offset``.
    :param base: Passed to ``resample_in_time()`` as ``base``.
    :param interp_kind: Passed to ``resample_in_time()`` as ``interp_kind``.
    :param tolerance: Passed to ``resample_in_time()`` as ``tolerance``.
    :param dry_run: If set, the resampled cube is not written.
    :param monitor: Optional callable that receives progress messages as a
        single string argument. If omitted, messages are discarded.
    """
    from xcube.api import open_cube
    from xcube.api.readwrite import write_cube
    from xcube.api.resample import resample_in_time
    from xcube.util.dsio import guess_dataset_format

    if monitor is None:
        # The parameter defaults to None, but it is called unconditionally
        # below; fall back to a no-op so the default does not raise TypeError.
        def monitor(*_args, **_kwargs):
            pass

    if not output_format:
        output_format = guess_dataset_format(output_path)

    monitor(f'Opening cube from {input_path!r}...')
    with open_cube(input_path) as ds:

        monitor('Resampling...')
        agg_ds = resample_in_time(ds,
                                  frequency=frequency,
                                  method=methods,
                                  offset=offset,
                                  base=base,
                                  interp_kind=interp_kind,
                                  tolerance=tolerance,
                                  var_names=variables,
                                  metadata=metadata)

        # The message is reported in dry-run mode too; only the write is skipped.
        monitor(f'Writing resampled cube to {output_path!r}...')
        if not dry_run:
            write_cube(agg_ds, output_path, output_format, cube_asserted=True)

        monitor('Done.')
Пример #7
0
def apply(output: str,
          script: str,
          input: str,
          params: str,
          vars: str,
          dask: str,
          format: str,
          dtype: str):
    """
    Apply a function to data cubes.
    The function is used to transform N chunks of equal shape to a new chunk of same shape.
    N is the number of variables from all data cubes.

    Uses the Python program <script> to transform data cubes
    given by <input> into a new data cube given by <output>.

    The <script> must define a function ``apply(*variables, **params)`` where variables
    are numpy arrays (chunks) in the order given by <vars> or given by the variables returned by
    an optional ``init()`` function that may be defined in <script>.
    If neither <vars> nor an ``init()`` function is defined, all variables are passed in arbitrary order.

    The optional ``init(*cubes, **params)`` function can be used to validate the data cubes,
    extract the desired variables in desired order and to provide some extra processing parameters passed to
    the ``apply()`` function. The ``init()`` argument *cubes* are the ``xarray.Dataset`` objects
    according to <input> and *params* are according to <params>. The return value of ``init()`` is
    a tuple (*variables*, *new_params*) where *variables* is a list of ``xarray.DataArray`` objects and
    *new_params* are newly computed parameters passed to ``apply()``.
    """
    # NOTE(review): the docstring above presumably doubles as CLI help text,
    # so parameter notes are kept here instead:
    #   output - path the resulting dataset is written to
    #   script - path of the Python file defining ``apply()`` and optionally ``init()``
    #   input  - iterable of input cube paths (iterated path by path below)
    #   params - keyword arguments in CLI syntax, parsed by ``parse_cli_kwargs``
    #   vars   - comma-separated names of the variables to keep, or empty for all
    #   dask   - passed to ``xarray.apply_ufunc`` as ``dask``
    #   format - output format; guessed from ``output`` if empty
    #   dtype  - output dtype, only used if ``dask == "parallelized"``
    # Raises click.ClickException if the script lacks a callable ``apply``,
    # defines a non-callable ``init``, or the output format has no writer.

    input_paths = input
    output_path = output
    apply_function_name = "apply"
    init_function_name = "init"

    with open(script, "r") as fp:
        code = fp.read()

    # NOTE(security): this executes arbitrary code from <script> with the
    # privileges of the current process; only run scripts you trust.
    # NOTE(review): globals and locals are separate dicts here, so names the
    # script defines at top level land in ``locals_dict`` and are not
    # resolvable as globals from inside the script's own functions.
    locals_dict = dict()
    exec(code, globals(), locals_dict)

    var_names = [name.strip() for name in vars.split(",")] if vars else None

    init_function = locals_dict.get(init_function_name)
    if init_function is not None and not callable(init_function):
        raise click.ClickException(f"{init_function_name!r} in {script} is not a callable")

    apply_function = locals_dict.get(apply_function_name)
    if apply_function is None:
        raise click.ClickException(f"missing function {apply_function_name!r} in {script}")
    if not callable(apply_function):
        # Report the function *name*, consistent with the ``init`` message above.
        raise click.ClickException(f"{apply_function_name!r} in {script} is not a callable")

    from xcube.api import read_cube
    from xcube.util.cliutil import parse_cli_kwargs
    from xcube.util.dsio import guess_dataset_format, find_dataset_io

    kwargs = parse_cli_kwargs(params, "<params>")

    # TODO (forman): make sure the coords and chunking of all input cubes are compatible
    input_cubes = [read_cube(input_path=input_path) for input_path in input_paths]

    if var_names:
        # Keep only the requested variables in each cube.
        wanted = set(var_names)
        input_cubes = [input_cube.drop(labels=set(input_cube.data_vars).difference(wanted))
                       for input_cube in input_cubes]

    import xarray as xr
    if init_function:
        variables, params = init_function(*input_cubes, **kwargs)
    else:
        variables, params = xr.merge(input_cubes).data_vars.values(), kwargs

    # Forward the (possibly init()-computed) parameters to ``apply()``;
    # without ``kwargs=params`` they would be silently dropped.
    output_variable = xr.apply_ufunc(apply_function,
                                     *variables,
                                     kwargs=params,
                                     dask=dask,
                                     output_dtypes=[dtype] if dask == "parallelized" else None)

    format = format or guess_dataset_format(output_path)
    dataset_io = find_dataset_io(format, {"w"})
    if dataset_io is None:
        # Fail with a readable message rather than an AttributeError on None.
        raise click.ClickException(f"Unknown output format {format!r} for {output_path}")
    dataset_io.write(xr.Dataset(dict(output=output_variable)), output_path)