Example No. 1
def write_dataset(dataset: xr.Dataset,
                  output_path: str,
                  format_name: str = None,
                  **kwargs) -> xr.Dataset:
    """
    Write dataset to *output_path*.
    If *format_name* is not provided, it will be guessed from *output_path*.

    :param dataset: Dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments
    :return: the input dataset
    """
    format_name = format_name if format_name else guess_dataset_format(
        output_path)
    if format_name is None:
        raise ValueError("Unknown output format")
    dataset_io = find_dataset_io(format_name, modes=["w"])
    if dataset_io is None:
        raise ValueError(
            f"Unknown output format {format_name!r} for {output_path}")

    dataset_io.write(dataset, output_path, **kwargs)

    return dataset
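
A minimal usage sketch (the tiny dataset below is illustrative; any xr.Dataset works, and the format is guessed from the file suffix):

import numpy as np
import xarray as xr

# Illustrative toy dataset; any xr.Dataset would do here.
ds = xr.Dataset({"sst": (("time", "lat", "lon"), np.zeros((1, 2, 2)))})

write_dataset(ds, "cube.zarr")                       # format guessed from ".zarr"
write_dataset(ds, "cube.nc", format_name="netcdf4")  # format forced explicitly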
Example No. 2
def read_dataset(input_path: str,
                 format_name: str = None,
                 is_cube: bool = False,
                 **kwargs) -> xr.Dataset:
    """
    Read dataset from *input_path*.
    If *format_name* is not provided, it will be guessed from *input_path*.

    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param is_cube: Whether a ValueError will be raised if the dataset read from *input_path* is not a data cube.
    :param kwargs: format-specific keyword arguments
    :return: dataset object
    """
    format_name = format_name if format_name else guess_dataset_format(
        input_path)
    if format_name is None:
        raise ValueError("Unknown input format")
    dataset_io = find_dataset_io(format_name, modes=["r"])
    if dataset_io is None:
        raise ValueError(
            f"Unknown input format {format_name!r} for {input_path}")
    dataset = dataset_io.read(input_path, **kwargs)
    if is_cube:
        assert_cube(dataset)
    return dataset
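
A minimal usage sketch, assuming a cube was previously written to "cube.zarr" (paths are illustrative):

# Format guessed from the ".zarr" suffix; is_cube=True validates the result.
dataset = read_dataset("cube.zarr", is_cube=True)

# Force a specific reader instead of guessing from the path.
dataset = read_dataset("cube.nc", format_name="netcdf4")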
Example No. 3
    def test_find_by_name(self):
        ds_io = find_dataset_io('netcdf4')
        self.assertIsInstance(ds_io, Netcdf4DatasetIO)

        ds_io = find_dataset_io('zarr', modes=['a'])
        self.assertIsInstance(ds_io, ZarrDatasetIO)
        ds_io = find_dataset_io('zarr', modes=['w'])
        self.assertIsInstance(ds_io, ZarrDatasetIO)
        ds_io = find_dataset_io('zarr', modes=['r'])
        self.assertIsInstance(ds_io, ZarrDatasetIO)

        ds_io = find_dataset_io('mem')
        self.assertIsInstance(ds_io, MemDatasetIO)

        ds_io = find_dataset_io('bibo', default=MemDatasetIO())
        self.assertIsInstance(ds_io, MemDatasetIO)
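
The last assertion exercises the *default* fallback: an unknown name ('bibo') yields the supplied default instead of None. A rough sketch of such a name-and-modes lookup, assuming a plain registry list where each IO object exposes ``name`` and ``modes`` attributes (illustrative, not the actual xcube registry code):

def find_io_sketch(registry, format_name, modes=None, default=None):
    requested = set(modes or [])
    for ds_io in registry:
        # Match by format name and require every requested access mode.
        if ds_io.name == format_name and requested.issubset(ds_io.modes):
            return ds_io
    return default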
Example No. 4
    def test_find_by_ext(self):
        ds_io = find_dataset_io('nc')
        self.assertIsInstance(ds_io, Netcdf4DatasetIO)
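
Here the lookup succeeds via the file extension 'nc' rather than the format name. A plausible sketch of extension-based format guessing, assuming a fixed suffix-to-format mapping (illustrative only, not the real guess_dataset_format):

_EXT_TO_FORMAT = {"nc": "netcdf4", "zarr": "zarr"}  # assumed mapping

def guess_format_sketch(path: str):
    # Use the last path suffix; None signals an unknown format.
    ext = path.rsplit(".", 1)[-1] if "." in path else None
    return _EXT_TO_FORMAT.get(ext)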
Example No. 5
def apply(output: str,
          script: str,
          input: str,
          params: str,
          vars: str,
          dask: str,
          format: str,
          dtype: str):
    """
    Apply a function to data cubes.
    The function is used to transform N chunks of equal shape into a new chunk of
    the same shape, where N is the number of variables from all data cubes.

    Uses the Python program <script> to transform the data cubes
    given by <input> into a new data cube given by <output>.

    The <script> must define a function ``apply(*variables, **params)`` where *variables*
    are numpy arrays (chunks) in the order given by <vars>, or in the order of the variables
    returned by an optional ``init()`` function that may be defined in <script>.
    If neither <vars> nor an ``init()`` function is given, all variables are passed in arbitrary order.

    The optional ``init(*cubes, **params)`` function can be used to validate the data cubes,
    to extract the desired variables in the desired order, and to provide extra processing
    parameters passed on to the ``apply()`` function. Within ``init()``, *cubes* are the
    ``xarray.Dataset`` objects corresponding to <input> and *params* are the parameters given
    by <params>. The return value of ``init()`` is a tuple (*variables*, *new_params*), where
    *variables* is a list of ``xarray.DataArray`` objects and *new_params* are newly computed
    parameters passed on to ``apply()``.
    """

    input_paths = input  # <input> may be given multiple times, hence a sequence of paths
    output_path = output
    apply_function_name = "apply"
    init_function_name = "init"

    with open(script, "r") as fp:
        code = fp.read()

    locals_dict = dict()
    # Execute the user script, collecting its top-level definitions in locals_dict.
    exec(code, globals(), locals_dict)

    var_names = list(map(lambda s: s.strip(), vars.split(","))) if vars else None

    init_function = locals_dict.get(init_function_name)
    if init_function is not None and not callable(init_function):
        raise click.ClickException(f"{init_function_name!r} in {script} is not a callable")

    apply_function = locals_dict.get(apply_function_name)
    if apply_function is None:
        raise click.ClickException(f"missing function {apply_function_name!r} in {script}")
    if not callable(apply_function):
        raise click.ClickException(f"{apply_function!r} in {script} is not a callable")

    from xcube.api import read_cube
    from xcube.util.cliutil import parse_cli_kwargs
    from xcube.util.dsio import guess_dataset_format, find_dataset_io

    kwargs = parse_cli_kwargs(params, "<params>")
    input_cube_0 = None
    input_cubes = []
    for input_path in input_paths:
        input_cube = read_cube(input_path=input_path)
        if input_cube_0 is None:
            input_cube_0 = input_cube
        else:
            # TODO (forman): make sure input_cube's and input_cube_0's coords and chunking are compatible
            pass
        input_cubes.append(input_cube)

    if var_names:
        # Keep only the requested variables in each input cube.
        input_cubes = [input_cube.drop(labels=set(input_cube.data_vars).difference(set(var_names)))
                       for input_cube in input_cubes]

    import xarray as xr
    if init_function:
        # Let the user script select the variables and refine the parameters.
        variables, params = init_function(*input_cubes, **kwargs)
    else:
        # Default: all variables of the merged cubes, in arbitrary order.
        variables, params = xr.merge(input_cubes).data_vars.values(), kwargs

    # Pass the (possibly init()-refined) parameters through to apply().
    output_variable = xr.apply_ufunc(apply_function,
                                     *variables,
                                     kwargs=params,
                                     dask=dask,
                                     output_dtypes=[dtype] if dask == "parallelized" else None)

    format = format or guess_dataset_format(output_path)
    dataset_io = find_dataset_io(format, {"w"})
    if dataset_io is None:
        raise click.ClickException(f"unknown output format {format!r} for {output_path}")
    dataset_io.write(xr.Dataset(dict(output=output_variable)), output_path)
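
For illustration, a minimal user script satisfying the contract described in the docstring above (the variable names and the computation are made up):

# my_script.py -- hypothetical <script> for the apply command.

def init(*cubes, **params):
    # Pick two variables from the first cube, in a fixed order.
    cube = cubes[0]
    variables = [cube["sst"], cube["tsm"]]  # assumed variable names
    return variables, params

def apply(sst, tsm, factor=1.0, **params):
    # Chunks arrive as numpy arrays of equal shape; return one of the same shape.
    return factor * (sst + tsm) / 2.0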