Example No. 1
 def test_dry_run(self):
     result = self.invoke_cli(['prune', self.TEST_CUBE, "--dry-run"])
     self.assertEqual(0, result.exit_code)
     self.assertEqual(
         "Opening cube from 'test.zarr'...\n"
         "Identifying empty blocks...\n"
         "Deleting 24 empty block file(s) for variable 'precipitation'...\n"
         "Deleting 24 empty block file(s) for variable 'temperature'...\n"
         "Done, 48 block file(s) deleted.\n", result.stdout)
     expected_file_names = sorted([
         '.zarray', '.zattrs', '0.0.0', '0.0.1', '0.0.2', '0.0.3', '0.1.0',
         '0.1.1', '0.1.2', '0.1.3', '1.0.0', '1.0.1', '1.0.2', '1.0.3',
         '1.1.0', '1.1.1', '1.1.2', '1.1.3', '2.0.0', '2.0.1', '2.0.2',
         '2.0.3', '2.1.0', '2.1.1', '2.1.2', '2.1.3'
     ])
     self.assertEqual(expected_file_names,
                      sorted(os.listdir('test.zarr/precipitation')))
     self.assertEqual(expected_file_names,
                      sorted(os.listdir('test.zarr/temperature')))
     ds = xr.open_zarr('test.zarr')
     assert_cube(ds)
     self.assertIn('precipitation', ds)
     self.assertEqual((3, 180, 360), ds.precipitation.shape)
     self.assertEqual(('time', 'lat', 'lon'), ds.precipitation.dims)
     self.assertIn('temperature', ds)
     self.assertEqual((3, 180, 360), ds.temperature.shape)
     self.assertEqual(('time', 'lat', 'lon'), ds.temperature.dims)
Example No. 2
def open_dataset(input_path: str,
                 format_name: str = None,
                 is_cube: bool = False,
                 **kwargs) -> xr.Dataset:
    """
    Open a dataset from *input_path*.
    If *format* is not provided it will be guessed from *output_path*.
    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param is_cube: Whether a ValueError will be raised, if the dataset read from *input_path* is not a xcube dataset.
    :param kwargs: format-specific keyword arguments
    :return: dataset object
    """
    format_name = format_name if format_name else guess_dataset_format(
        input_path)
    if format_name is None:
        raise ValueError("Unknown input format")
    dataset_io = find_dataset_io(format_name, modes=["r"])
    if dataset_io is None:
        raise ValueError(
            f"Unknown input format {format_name!r} for {input_path}")
    dataset = dataset_io.read(input_path, **kwargs)
    if is_cube:
        assert_cube(dataset)
    return dataset
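A minimal usage sketch for the open_dataset function above; the 'demo.zarr' path and the xcube.core.dsio import path are assumptions and may differ between xcube versions:

# Sketch: open a Zarr dataset and verify it is a valid xcube dataset.
# Assumption: open_dataset is importable from xcube.core.dsio.
from xcube.core.dsio import open_dataset

ds = open_dataset('demo.zarr', format_name='zarr', is_cube=True)
print(ds.data_vars)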
Example No. 3
def open_ml_dataset_from_local_fs(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    data_format = dataset_descriptor.get('Format', guess_cube_format(path))

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path))
            return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path)

    raise ServiceConfigError(
        f"Illegal data format {data_format!r} for dataset {ds_id}")
Example No. 4
def vars_to_dim(cube: xr.Dataset,
                dim_name: str = 'var',
                var_name='data',
                cube_asserted: bool = False):
    """
    Convert data variables into a dimension.

    :param cube: The xcube dataset.
    :param dim_name: The name of the new dimension and coordinate variable. Defaults to 'var'.
    :param var_name: The name of the new, single data variable. Defaults to 'data'.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new xcube dataset with data variables turned into a new dimension.
    """

    if not cube_asserted:
        assert_cube(cube)

    if var_name == dim_name:
        raise ValueError("var_name must be different from dim_name")

    data_var_names = list(cube.data_vars)
    if not data_var_names:
        raise ValueError("cube must not be empty")

    da = xr.concat([cube[data_var_name] for data_var_name in data_var_names],
                   dim_name)
    new_coord_var = xr.DataArray(data_var_names, dims=[dim_name])
    da = da.assign_coords(**{dim_name: new_coord_var})

    return xr.Dataset(dict(**{var_name: da}))
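A short usage sketch for vars_to_dim; the import paths are assumptions (recent xcube versions expose new_cube in xcube.core.new and vars_to_dim in xcube.core.vars2dim):

# Sketch: stack two data variables into a new 'var' dimension.
from xcube.core.new import new_cube
from xcube.core.vars2dim import vars_to_dim

cube = new_cube(variables=dict(precipitation=0.5, temperature=275.2))
stacked = vars_to_dim(cube, dim_name='var', var_name='data')
# 'data' now has dimensions ('var', 'time', 'lat', 'lon'), and the 'var'
# coordinate holds the original variable names.
print(stacked['data'].dims, list(stacked.coords['var'].values))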
Example No. 5
 def test_no_dry_run(self):
     result = self.invoke_cli(['prune', self.TEST_CUBE, "-vv"])
     self.assertEqual(0, result.exit_code)
     self.assertEqual(
         (
             "Opening dataset from 'test.zarr'...\n"
             "Identifying empty chunks...\n"
             "Found empty chunks in variable 'precipitation', "
             "deleting block files...\n"
             "Deleted 24 block file(s) for variable 'precipitation'.\n"
             "Found empty chunks in variable 'temperature', "
             "deleting block files...\n"
             "Deleted 24 block file(s) for variable 'temperature'.\n"
             "Done, 48 block file(s) deleted total.\n"
         ),
         result.stdout)
     expected_file_names = sorted(['.zarray', '.zattrs'])
     self.assertEqual(expected_file_names,
                      sorted(os.listdir('test.zarr/precipitation')))
     self.assertEqual(expected_file_names,
                      sorted(os.listdir('test.zarr/temperature')))
     ds = xr.open_zarr('test.zarr')
     assert_cube(ds)
     self.assertIn('precipitation', ds)
     self.assertEqual((3, 180, 360), ds.precipitation.shape)
     self.assertEqual(('time', 'lat', 'lon'), ds.precipitation.dims)
     self.assertIn('temperature', ds)
     self.assertEqual((3, 180, 360), ds.temperature.shape)
     self.assertEqual(('time', 'lat', 'lon'), ds.temperature.dims)
Example No. 6
    def test_rectify_multiple_comma_separated_vars(self):
        """Test that rectify selects the desired variables when
        multiple --var options, some with multiple comma-separated
        variable names as an argument, are passed."""

        # For now, specify the image geometry explicitly with --size, --point,
        # and --res to avoid triggering an "invalid y_min" ValueError when
        # ImageGeom tries to determine it automatically. Once Issue #303 has
        # been fixed, these options can be omitted.

        result = self.invoke_cli([
            'rectify', '--size', '508,253', '--point', '-179.5,-89.5', '--res',
            '0.7071067811865475', '--var', 'precipitation,temperature',
            '--var', 'soil_moisture', TEST_ZARR_DIR
        ])
        self.assertEqual(0, result.exit_code)
        self.assertEqual(
            'Opening dataset from \'test.zarr\'...\n'
            'Rectifying...\n'
            'Writing rectified dataset to \'out.zarr\'...\n'
            'Done.\n', result.stdout)
        self.assertTrue(os.path.isdir('out.zarr'))
        ds = xr.open_zarr('out.zarr')
        assert_cube(ds)
        self.assertIn('precipitation', ds)
        self.assertIn('temperature', ds)
        self.assertIn('soil_moisture', ds)
Example No. 7
 def test_with_vars(self):
     result = self.invoke_cli(['resample', TEST_ZARR_DIR, '--vars', 'temperature,precipitation'])
     self.assertEqual(0, result.exit_code)
     self.assertTrue(os.path.isdir('out.zarr'))
     ds = xr.open_zarr('out.zarr')
     assert_cube(ds)
     self.assertIn('precipitation_mean', ds)
     self.assertIn('temperature_mean', ds)
     self.assertNotIn('soil_moisture_mean', ds)
Example No. 8
def get_cube_values_for_points(cube: xr.Dataset,
                               points: Union[xr.Dataset, pd.DataFrame, Mapping[str, Any]],
                               var_names: Sequence[str] = None,
                               include_coords: bool = False,
                               include_bounds: bool = False,
                               include_indexes: bool = False,
                               index_name_pattern: str = DEFAULT_INDEX_NAME_PATTERN,
                               include_refs: bool = False,
                               ref_name_pattern: str = DEFAULT_REF_NAME_PATTERN,
                               method: str = DEFAULT_INTERP_POINT_METHOD,
                               cube_asserted: bool = False) -> xr.Dataset:
    """
    Extract values from *cube* variables at given coordinates in *points*.

    :param cube: The cube dataset.
    :param points: Dictionary that maps dimension name to coordinate arrays.
    :param var_names: An optional list of names of data variables in *cube* whose values shall be extracted.
    :param include_coords: Whether to include the cube coordinates for each point in return value.
    :param include_bounds: Whether to include the cube coordinate boundaries (if any) for each point in return value.
    :param include_indexes: Whether to include computed indexes into the cube for each point in return value.
    :param index_name_pattern: A naming pattern for the computed index columns.
           Must include "{name}" which will be replaced by the index's dimension name.
    :param include_refs: Whether to include point (reference) values in return value.
    :param ref_name_pattern: A naming pattern for the computed point data columns.
           Must include "{name}" which will be replaced by the point's attribute name.
    :param method: "nearest" or "linear".
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new data frame whose columns are values from *cube* variables at given *points*.
    """
    if not cube_asserted:
        assert_cube(cube)

    point_indexes = get_cube_point_indexes(cube,
                                           points,
                                           index_name_pattern=index_name_pattern,
                                           index_dtype=np.int64 if method == POINT_INTERP_METHOD_NEAREST else np.float64,
                                           cube_asserted=True)

    cube_values = get_cube_values_for_indexes(cube,
                                              point_indexes,
                                              include_coords,
                                              include_bounds,
                                              data_var_names=var_names,
                                              index_name_pattern=index_name_pattern,
                                              method=method,
                                              cube_asserted=True)

    if include_indexes:
        cube_values.update(point_indexes)

    if include_refs:
        point_refs = xr.Dataset({ref_name_pattern.format(name=name): xr.DataArray(points[name], dims=[INDEX_DIM_NAME])
                                 for name in points.keys()})
        cube_values.update(point_refs)

    return cube_values
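A usage sketch for get_cube_values_for_points; the import path, the point coordinates, and the 2010 dates (matching new_cube's default time range) are assumptions:

# Sketch: extract nearest-neighbour values at three points.
import pandas as pd
from xcube.core.new import new_cube
from xcube.core.extract import get_cube_values_for_points

cube = new_cube(variables=dict(precipitation=0.5))
points = pd.DataFrame(dict(
    time=pd.to_datetime(['2010-01-02', '2010-01-03', '2010-01-04']),
    lat=[10.0, 20.0, 30.0],
    lon=[-10.0, 0.0, 10.0]))
values = get_cube_values_for_points(cube, points, include_indexes=True)
print(values.precipitation.values)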
Example No. 9
 def test_assert_cube_illegal_coord_var(self):
     cube = new_cube(variables=dict(precipitation=0.5))
     cube = cube.assign_coords(lat=xr.DataArray(np.outer(cube.lat, np.ones(cube.lon.size)),
                                                dims=("y", "x")),
                               lon=xr.DataArray(np.outer(np.ones(cube.lat.size), cube.lon),
                                                dims=("y", "x")))
     with self.assertRaises(ValueError) as cm:
         assert_cube(cube)
     self.assertEqual("Dataset is not a valid xcube dataset, because:\n"
                      "- missing spatial x,y coordinate variables.",
                      f"{cm.exception}")
Example No. 10
def get_cube_point_indexes(
        cube: xr.Dataset,
        points: PointsLike,
        dim_name_mapping: Mapping[str, str] = None,
        index_name_pattern: str = DEFAULT_INDEX_NAME_PATTERN,
        index_dtype=np.float64,
        cube_asserted: bool = False) -> xr.Dataset:
    """
    Get indexes of given point coordinates *points* into the given *cube*.

    :param cube: The cube dataset.
    :param points: A mapping from column names to column data arrays, which
        must all have the same length.
    :param dim_name_mapping: A mapping from dimension names in *cube* to
        column names in *points*.
    :param index_name_pattern: A naming pattern for the computed
        index columns. Must include "{name}" which will be replaced by
        the dimension name.
    :param index_dtype: Numpy data type for the indexes. If it is a
        floating point type (default), the indexes will contain
        fractions, which may be used for interpolation.
        For out-of-range coordinates in *points*, indexes will be -1
        if *index_dtype* is an integer type, and NaN
        if *index_dtype* is a floating point type.
    :param cube_asserted: If False, *cube* will be verified, otherwise
        it is expected to be a valid cube.
    :return: A dataset containing the index columns.
    """
    if not cube_asserted:
        assert_cube(cube)

    dim_name_mapping = dim_name_mapping if dim_name_mapping is not None else {}
    dim_names = _get_cube_data_var_dims(cube)
    col_names = [
        dim_name_mapping.get(str(dim_name), dim_name) for dim_name in dim_names
    ]

    points, _ = _normalize_series(points,
                                  col_names,
                                  force_dataset=False,
                                  param_name="points")

    indexes = []
    for dim_name, col_name in zip(dim_names, col_names):
        col = points[col_name]
        coord_indexes = get_dataset_indexes(cube,
                                            str(dim_name),
                                            col,
                                            index_dtype=index_dtype)
        indexes.append((index_name_pattern.format(name=dim_name),
                        xr.DataArray(coord_indexes, dims=[INDEX_DIM_NAME])))

    return xr.Dataset(dict(indexes))
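A usage sketch for get_cube_point_indexes; the import paths and the point coordinates are assumptions:

# Sketch: compute fractional indexes of point coordinates into the cube.
import numpy as np
from xcube.core.new import new_cube
from xcube.core.extract import get_cube_point_indexes

cube = new_cube(variables=dict(precipitation=0.5))
points = dict(time=np.array(['2010-01-02'], dtype='datetime64[ns]'),
              lat=np.array([52.5]),
              lon=np.array([13.4]))
indexes = get_cube_point_indexes(cube, points)
# One index column per cube dimension, e.g. 'time_index', 'lat_index', 'lon_index'.
print(list(indexes.data_vars))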
Example No. 11
 def test_downsample_with_multiple_methods(self):
     result = self.invoke_cli([
         'resample', '--variables', 'temperature', '-F', 'all', '-M',
         'mean', '-M', 'count', '-M', 'prod', TEST_ZARR_DIR
     ])
     self.assertEqual(0, result.exit_code)
     self.assertTrue(os.path.isdir('out.zarr'))
     ds = xr.open_zarr('out.zarr')
     assert_cube(ds)
     self.assertIn('temperature_mean', ds)
     self.assertIn('temperature_count', ds)
     self.assertIn('temperature_prod', ds)
Example No. 12
 def test_all_defaults(self):
     result = self.invoke_cli(['resample', TEST_ZARR_DIR])
     self.assertEqual(0, result.exit_code)
     self.assertEqual("Opening cube from 'test.zarr'...\n"
                      "Resampling...\n"
                      "Writing resampled cube to 'out.zarr'...\n"
                      "Done.\n",
                      result.stdout)
     self.assertTrue(os.path.isdir('out.zarr'))
     ds = xr.open_zarr('out.zarr')
     assert_cube(ds)
     self.assertIn('precipitation_mean', ds)
     self.assertIn('temperature_mean', ds)
     self.assertIn('soil_moisture_mean', ds)
Example No. 13
 def _assert_result_ok(self, result, level_chunks: List[Tuple], output_path: str, message_regex: str):
     self.assertEqual(0, result.exit_code)
     self.assertRegex(result.stdout, message_regex)
     self.assertTrue(os.path.isdir(output_path))
     level_datasets = read_levels(output_path)
     level = 0
     for level_dataset in level_datasets:
         assert_cube(level_dataset)
         self.assertEqual({'precipitation', 'soil_moisture', 'temperature'},
                          set(level_dataset.data_vars.keys()))
         for var_name, var in level_dataset.data_vars.items():
             var_chunks = level_chunks[level]
             self.assertEqual(var_chunks, var.chunks, f'{var_name} at level {level}')
         level += 1
Example No. 14
 def test_upsample_with_multiple_methods(self):
     result = self.invoke_cli(['resample',
                               '--variables', 'temperature',
                               '-F', '12H',
                               '-T', '6H',
                               # '-K', 'quadratic',
                               # '-M', 'interpolate',
                               '-M', 'nearest',
                               TEST_ZARR_DIR])
     self.assertEqual(0, result.exit_code)
     self.assertTrue(os.path.isdir('out.zarr'))
     ds = xr.open_zarr('out.zarr')
     assert_cube(ds)
     # self.assertIn('temperature_interpolate', ds)
     self.assertIn('temperature_nearest', ds)
Example No. 15
 def test_assert_cube_illegal_data_var(self):
     cube = new_cube(variables=dict(precipitation=0.5))
     shape = cube.dims["lat"], cube.dims["lon"]
     cube["chl"] = xr.DataArray(np.random.rand(*shape),
                                dims=("lat", "lon"),
                                coords=dict(lat=cube.lat, lon=cube.lon))
     with self.assertRaises(ValueError) as cm:
         assert_cube(cube)
     self.assertEqual("Dataset is not a valid xcube dataset, because:\n"
                      "- dimensions of data variable 'chl' must be"
                      " ('time', ..., 'lat', 'lon'), but were ('lat', 'lon') for 'chl';\n"
                      "- dimensions of all data variables must be same,"
                      " but found ('time', 'lat', 'lon') for 'precipitation'"
                      " and ('lat', 'lon') for 'chl'.",
                      f"{cm.exception}")
Example No. 16
    def _get_dataset_lazily(self, index: int,
                            parameters: Dict[str, Any]) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param parameters: keyword arguments passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with self._s3_file_system.open(level_path, "r") as fp:
                level_path = fp.read()
                # if level_path is a relative path, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)
        store = s3fs.S3Map(root=level_path,
                           s3=self._s3_file_system,
                           check=False)
        max_size = self.get_chunk_cache_capacity(index)
        if max_size:
            store = zarr.LRUStoreCache(store, max_size=max_size)
        with measure_time(
                tag=f"opened remote dataset {level_path} for level {index}"):
            consolidated = self._s3_file_system.exists(
                f'{level_path}/.zmetadata')
            return assert_cube(xr.open_zarr(store,
                                            consolidated=consolidated,
                                            **parameters),
                               name=level_path)
Example No. 17
 def test_assert_cube_illegal_coord_bounds_var(self):
     cube = new_cube(variables=dict(precipitation=0.5))
     lat_bnds = np.zeros((cube.time.size, cube.lat.size, 2))
     lon_bnds = np.zeros((cube.time.size, cube.lon.size, 2), dtype=np.float16)
     lat_bnds[:, :, :] = cube.lat_bnds
     lon_bnds[:, :, :] = cube.lon_bnds
     cube = cube.assign_coords(lat_bnds=xr.DataArray(lat_bnds, dims=("time", "lat", "bnds")),
                               lon_bnds=xr.DataArray(lon_bnds, dims=("time", "lon", "bnds")))
     with self.assertRaises(ValueError) as cm:
         assert_cube(cube)
     self.assertEqual("Dataset is not a valid xcube dataset, because:\n"
                      "- bounds coordinate variable 'lon_bnds' must have dimensions ('lon', <bounds_dim>);\n"
                      "- shape of bounds coordinate variable 'lon_bnds' must be (360, 2) but was (5, 360, 2);\n"
                      "- type of bounds coordinate variable 'lon_bnds' must be dtype('float64') but was dtype('float16');\n"
                      "- bounds coordinate variable 'lat_bnds' must have dimensions ('lat', <bounds_dim>);\n"
                      "- shape of bounds coordinate variable 'lat_bnds' must be (180, 2) but was (5, 180, 2).",
                      f"{cm.exception}")
Example No. 18
def write_cube(cube: xr.Dataset,
               output_path: str,
               format_name: str = None,
               cube_asserted: bool = False,
               **kwargs) -> xr.Dataset:
    """
    Write an xcube dataset to *output_path*.
    If *format_name* is not provided, it will be guessed from *output_path*.
    :param cube: xcube dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param kwargs: format-specific keyword arguments
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: xcube dataset *cube*
    """
    if not cube_asserted:
        assert_cube(cube)
    return write_dataset(cube, output_path, format_name=format_name, **kwargs)
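A usage sketch for write_cube; the import paths and the output path are assumptions:

# Sketch: verify a cube and write it to a Zarr directory.
from xcube.core.new import new_cube
from xcube.core.dsio import write_cube

cube = new_cube(variables=dict(precipitation=0.5))
write_cube(cube, 'out.zarr', format_name='zarr')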
Example No. 19
def cubify_dataset(ds: xr.Dataset) -> xr.Dataset:
    """
    Normalize the geo- and time-coding upon opening the given
    dataset w.r.t. a common (CF-compatible) convention.

    Will throw a value error if the dataset could not not be
    converted to a cube.
    """
    ds = normalize_dataset(ds)
    return assert_cube(ds)
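A usage sketch, assuming cubify_dataset as defined above is in scope and that 'input.nc' is an arbitrary CF-style NetCDF file:

# Sketch: normalize a dataset and verify the result is a valid xcube dataset.
import xarray as xr

ds = xr.open_dataset('input.nc')
cube = cubify_dataset(ds)  # raises ValueError if the dataset cannot be made a cube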
Example No. 20
def open_ml_dataset_from_local_fs(path: str,
                                  data_format: str = None,
                                  ds_id: str = None,
                                  exception_type: type = ValueError,
                                  **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path, **kwargs))
            return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path, **kwargs))
            return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path,
                                                ds_id=ds_id,
                                                zarr_kwargs=kwargs)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}'
    )
Example No. 21
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        client_kwargs: Mapping[str,
                                                               Any] = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    endpoint_url, root = split_bucket_url(path)
    if endpoint_url:
        kwargs['endpoint_url'] = endpoint_url
        path = root

    client_kwargs = dict(client_kwargs or {})
    for arg_name in ['endpoint_url', 'region_name']:
        if arg_name in kwargs:
            client_kwargs[arg_name] = kwargs.pop(arg_name)

    obs_file_system = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(cached_store, consolidated=consolidated,
                             **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                obs_file_system,
                path,
                zarr_kwargs=kwargs,
                ds_id=ds_id,
                exception_type=exception_type)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}'
    )
Example No. 22
    def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
        """
        Read the dataset for the level at given *index*.

        :param index: the level index
        :param zarr_kwargs: kwargs passed to xr.open_zarr()
        :return: the dataset for the level at *index*.
        """
        ext, level_path = self._level_paths[index]
        if ext == ".link":
            with open(level_path, "r") as fp:
                level_path = fp.read()
                # if level_path is a relative path, resolve it against the levels directory
                if not os.path.isabs(level_path):
                    base_dir = os.path.dirname(self._dir_path)
                    level_path = os.path.join(base_dir, level_path)
        with measure_time(
                tag=f"opened local dataset {level_path} for level {index}"):
            return assert_cube(xr.open_zarr(level_path, **zarr_kwargs),
                               name=level_path)
Example No. 23
 def _get_dataset_lazily(self, index: int, **kwargs) -> xr.Dataset:
     input_datasets = [
         self._input_ml_dataset_getter(ds_id).get_dataset(index)
         for ds_id in self._input_ml_dataset_ids
     ]
     try:
         with measure_time(
                 tag=
                 f"computed in-memory dataset {self._ds_id!r} at level {index}"
         ):
             computed_value = self._callable_obj(*input_datasets, **kwargs)
     except Exception as e:
         raise self._exception_type(
             f"Failed to compute in-memory dataset {self._ds_id!r} at level {index} "
             f"from function {self._callable_name!r}: {e}") from e
     if not isinstance(computed_value, xr.Dataset):
         raise self._exception_type(
             f"Failed to compute in-memory dataset {self._ds_id!r} at level {index} "
             f"from function {self._callable_name!r}: "
             f"expected an xarray.Dataset but got {type(computed_value)}")
     return assert_cube(computed_value, name=self._ds_id)
Example No. 24
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        s3_kwargs: Mapping[str, Any] = None,
                                        s3_client_kwargs: Mapping[str,
                                                                  Any] = None,
                                        chunk_cache_capacity: int = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    s3, root = parse_s3_fs_and_root(path,
                                    s3_kwargs=s3_kwargs,
                                    s3_client_kwargs=s3_client_kwargs,
                                    mode='r')

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=root, s3=s3, check=False)
        if chunk_cache_capacity:
            store = zarr.LRUStoreCache(store, max_size=chunk_cache_capacity)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = s3.exists(f'{root}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(store, consolidated=consolidated, **kwargs))
        return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                s3,
                root,
                zarr_kwargs=kwargs,
                ds_id=ds_id,
                chunk_cache_capacity=chunk_cache_capacity,
                exception_type=exception_type)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}'
    )
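A sketch call, assuming open_ml_dataset_from_object_storage as defined above is in scope; the bucket path, endpoint URL, and anonymous access are assumptions:

# Sketch: open a multi-level Zarr cube from an S3 bucket.
ml_ds = open_ml_dataset_from_object_storage(
    'my-bucket/demo.zarr',
    data_format='zarr',
    s3_kwargs=dict(anon=True),
    s3_client_kwargs=dict(endpoint_url='https://s3.eu-central-1.amazonaws.com'))
base_ds = ml_ds.get_dataset(0)  # level 0 is the full-resolution dataset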
Example No. 25
def open_ml_dataset_from_object_storage(
        ctx: ServiceContext,
        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(
            f"Missing 'path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True,
                                        client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2**28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(
                xr.open_zarr(cached_store, consolidated=consolidated))
        return BaseMultiLevelDataset(ds)

    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(
                ds_id,
                obs_file_system,
                path,
                exception_type=ServiceConfigError)
Example No. 26
def get_cube_values_for_indexes(
        cube: xr.Dataset,
        indexes: Union[xr.Dataset, pd.DataFrame, Mapping[str, Any]],
        include_coords: bool = False,
        include_bounds: bool = False,
        data_var_names: Sequence[str] = None,
        index_name_pattern: str = DEFAULT_INDEX_NAME_PATTERN,
        method: str = DEFAULT_INTERP_POINT_METHOD,
        cube_asserted: bool = False) -> xr.Dataset:
    """
    Get values from the *cube* at given *indexes*.

    :param cube: A cube dataset.
    :param indexes: A mapping from column names to index and fraction arrays for all cube dimensions.
    :param include_coords: Whether to include the cube coordinates for each point in return value.
    :param include_bounds: Whether to include the cube coordinate boundaries (if any) for each point in return value.
    :param data_var_names: An optional list of names of data variables in *cube* whose values shall be extracted.
    :param index_name_pattern: A naming pattern for the computed index columns.
           Must include "{name}" which will be replaced by the dimension name.
    :param method: "nearest" or "linear".
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new data frame whose columns are values from *cube* variables at given *indexes*.
    """
    if not cube_asserted:
        assert_cube(cube)

    if method not in {POINT_INTERP_METHOD_NEAREST, POINT_INTERP_METHOD_LINEAR}:
        raise ValueError(f"invalid method {method!r}")
    if method != POINT_INTERP_METHOD_NEAREST:
        raise NotImplementedError(f"method {method!r} not yet implemented")

    all_data_var_names = tuple(cube.data_vars.keys())
    if len(all_data_var_names) == 0:
        raise ValueError("cube is empty")

    if data_var_names is not None:
        if len(data_var_names) == 0:
            return xr.Dataset(
                coords=indexes.coords if hasattr(indexes, "coords") else None)
        for var_name in data_var_names:
            if var_name not in cube.data_vars:
                raise ValueError(f"variable {var_name!r} not found in cube")
    else:
        data_var_names = all_data_var_names

    dim_names = cube[data_var_names[0]].dims
    num_dims = len(dim_names)
    index_names = [
        index_name_pattern.format(name=dim_name) for dim_name in dim_names
    ]
    num_points = _validate_points(indexes, index_names, param_name="indexes")
    indexes = _normalize_points(indexes)

    cube = xr.Dataset(
        {var_name: cube[var_name]
         for var_name in data_var_names},
        coords=cube.coords)

    new_bounds_vars = {}
    bounds_var_names = _get_coord_bounds_var_names(cube)
    drop_coords = None
    if bounds_var_names:
        if include_bounds:
            # Flatten any coordinate bounds variables
            for var_name, bnds_var_name in bounds_var_names.items():
                bnds_var = cube[bnds_var_name]
                new_bounds_vars[f"{var_name}_lower"] = bnds_var[:, 0]
                new_bounds_vars[f"{var_name}_upper"] = bnds_var[:, 1]
            cube = cube.assign_coords(**new_bounds_vars)
        cube = cube.drop_vars(bounds_var_names.values())
        if not include_coords:
            drop_coords = set(cube.coords).difference(new_bounds_vars.keys())
    else:
        if not include_coords:
            drop_coords = set(cube.coords)

    # Generate a validation condition so we can filter out invalid rows
    # (an index is invalid if it is -1 for integer dtypes or NaN for float dtypes)
    is_valid_point = None
    for index_name in index_names:
        col = indexes[index_name]
        condition = (col >= 0 if np.issubdtype(col.dtype, np.integer)
                     else np.logical_not(np.isnan(col)))
        if is_valid_point is None:
            is_valid_point = condition
        else:
            is_valid_point = np.logical_and(is_valid_point, condition)

    num_valid_points = np.count_nonzero(is_valid_point)
    if num_valid_points == num_points:
        # All indexes valid
        cube_selector = {
            dim_names[i]: indexes[index_names[i]]
            for i in range(num_dims)
        }
        cube_values = cube.isel(cube_selector)
    elif num_valid_points == 0:
        # All indexes are invalid
        new_bounds_vars = {}
        for var_name in cube.variables:
            new_bounds_vars[var_name] = _empty_points_var(
                cube[var_name], num_points)
        cube_values = xr.Dataset(new_bounds_vars)
    else:
        # Some invalid indexes
        idx = np.arange(num_points)
        good_idx = idx[is_valid_point.values]
        idx_dim_name = indexes[index_names[0]].dims[0]
        good_indexes = indexes.isel({idx_dim_name: good_idx})

        cube_selector = {
            dim_names[i]: good_indexes[index_names[i]]
            for i in range(num_dims)
        }
        cube_values = cube.isel(cube_selector)

        new_bounds_vars = {}
        for var_name in cube.variables:
            var = cube_values[var_name]
            new_var = _empty_points_var(var, num_points)
            new_var[good_idx] = var
            new_bounds_vars[var_name] = new_var

        cube_values = xr.Dataset(new_bounds_vars)

    if drop_coords:
        cube_values = cube_values.drop_vars(drop_coords)

    return cube_values
Example No. 27
 def test_assert_cube_without_bounds(self):
     cube = new_cube(variables=dict(precipitation=0.5), drop_bounds=True)
     self.assertIs(cube, assert_cube(cube))
Example No. 28
 def test_assert_cube_ok(self):
     cube = new_cube(variables=dict(precipitation=0.5))
     self.assertIs(cube, assert_cube(cube))
Example No. 29
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid GeoJSON object, a valid WKT string,
    a sequence of box coordinates (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers an area,
    i.e. is not a point, the function aggregates the variables to compute a mean value and if desired,
    the number of valid observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64 or an ISO datetime string.

    Returns a time-series dataset whose data variables have a time dimension but no longer have spatial dimensions,
    hence the resulting dataset's variables will only have N-2 dimensions.
    A global attribute ``max_number_of_observations`` will be set to the maximum number of observations
    that could have been made in each time step.
    If the given *geometry* does not overlap the cube's boundaries, or if no output variables remain,
    the function returns ``None``.

    :param cube: The xcube dataset
    :param geometry: Optional geometry
    :param var_names: Optional sequence of names of variables to be included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param include_count: Whether to include the number of valid observations for each time step.
           Ignored if geometry is a point.
    :param include_stdev: Whether to include standard deviation for each time step.
           Ignored if geometry is a point.
    :param use_groupby: Use group-by operation. May increase or decrease runtime performance and/or memory consumption.
    :param cube_asserted:  If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new dataset with time-series for each variable.
    """

    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y, method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset,
                                           geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop('__mask__')
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    ds_count = None
    ds_stdev = None
    if use_groupby:
        time_group = dataset.groupby('time')
        ds_mean = time_group.mean(skipna=True, dim=xr.ALL_DIMS)
        if include_count:
            ds_count = time_group.count(dim=xr.ALL_DIMS)
        if include_stdev:
            ds_stdev = time_group.std(skipna=True, dim=xr.ALL_DIMS)
    else:
        ds_mean = dataset.mean(dim=('lat', 'lon'), skipna=True)
        if include_count:
            ds_count = dataset.count(dim=('lat', 'lon'))
        if include_stdev:
            ds_stdev = dataset.std(dim=('lat', 'lon'), skipna=True)

    if ds_count is not None:
        ds_count = ds_count.rename(
            name_dict=dict({v: f"{v}_count"
                            for v in ds_count.data_vars}))

    if ds_stdev is not None:
        ds_stdev = ds_stdev.rename(
            name_dict=dict({v: f"{v}_stdev"
                            for v in ds_stdev.data_vars}))

    if ds_count is not None and ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev, ds_count])
    elif ds_count is not None:
        ts_dataset = xr.merge([ds_mean, ds_count])
    elif ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev])
    else:
        ts_dataset = ds_mean

    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset
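A usage sketch for get_time_series; the import paths and the bounding box coordinates are assumptions:

# Sketch: mean time series over a bounding box (x1, y1, x2, y2).
from xcube.core.new import new_cube
from xcube.core.timeseries import get_time_series

cube = new_cube(variables=dict(precipitation=0.5))
ts = get_time_series(cube,
                     geometry=(0.0, 40.0, 20.0, 60.0),
                     var_names=['precipitation'],
                     include_count=True)
print(ts.precipitation.values, ts.attrs['max_number_of_observations'])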
Example No. 30
def resample_in_time(dataset: xr.Dataset,
                     frequency: str,
                     method: Union[str, Sequence[str]],
                     offset=None,
                     base: int = 0,
                     tolerance=None,
                     interp_kind=None,
                     time_chunk_size=None,
                     var_names: Sequence[str] = None,
                     metadata: Dict[str, Any] = None,
                     cube_asserted: bool = False) -> xr.Dataset:
    """
    Resample a dataset in the time dimension.

    The argument *method* may be one or a sequence of
    ``'all'``, ``'any'``,
    ``'argmax'``, ``'argmin'``, ``'count'``,
    ``'first'``, ``'last'``,
    ``'max'``, ``'min'``, ``'mean'``, ``'median'``,
    ``'percentile_<p>'``,
    ``'std'``, ``'sum'``, ``'var'``.

    In ``'percentile_<p>'``, ``'<p>'`` is a placeholder that must be
    replaced by an integer percentage value, e.g. ``'percentile_90'``
    is the 90%-percentile.

    *Important note:* As of xarray 0.14 and dask 2.8, the
    methods ``'median'`` and ``'percentile_<p>'`` cannot be
    used if the variables in *cube* comprise chunked dask arrays.
    In this case, use the ``compute()`` or ``load()`` method
    to convert dask arrays into numpy arrays.

    :param dataset: The xcube dataset.
    :param frequency: Temporal aggregation frequency.
        Use format "<count><offset>" where <offset> is one of
        'H', 'D', 'W', 'M', 'Q', 'Y'.
    :param method: Resampling method or sequence of
        resampling methods.
    :param offset: Offset used to adjust the resampled time labels.
        Uses same syntax as *frequency*.
    :param base: For frequencies that evenly subdivide 1 day,
        the "origin" of the aggregated intervals. For example,
        for '24H' frequency, base could range from 0 through 23.
    :param time_chunk_size: If not None, the chunk size to be
        used for the "time" dimension.
    :param var_names: Variable names to include.
    :param tolerance: Time tolerance for selective
        upsampling methods. Defaults to *frequency*.
    :param interp_kind: Kind of interpolation
        if *method* is 'interpolation'.
    :param metadata: Output metadata.
    :param cube_asserted: If False, *dataset* will be verified,
        otherwise it is expected to be a valid cube.
    :return: A new xcube dataset resampled in time.
    """
    if not cube_asserted:
        assert_cube(dataset)

    if frequency == 'all':
        time_gap = np.array(dataset.time[-1]) - np.array(dataset.time[0])
        days = int((np.timedelta64(time_gap, 'D') / np.timedelta64(1, 'D')) +
                   1)
        frequency = f'{days}D'

    if var_names:
        dataset = select_variables_subset(dataset, var_names)

    resampler = dataset.resample(skipna=True,
                                 closed='left',
                                 label='left',
                                 time=frequency,
                                 loffset=offset,
                                 base=base)

    if isinstance(method, str):
        methods = [method]
    else:
        methods = list(method)

    percentile_prefix = 'percentile_'

    resampled_cubes = []
    for method in methods:
        method_args = []
        method_postfix = method
        if method.startswith(percentile_prefix):
            p = int(method[len(percentile_prefix):])
            q = p / 100.0
            method_args = [q]
            method_postfix = f'p{p}'
            method = 'quantile'
        resampling_method = getattr(resampler, method)
        method_kwargs = get_method_kwargs(method, frequency, interp_kind,
                                          tolerance)
        resampled_cube = resampling_method(*method_args, **method_kwargs)
        resampled_cube = resampled_cube.rename({
            var_name: f'{var_name}_{method_postfix}'
            for var_name in resampled_cube.data_vars
        })
        resampled_cubes.append(resampled_cube)

    if len(resampled_cubes) == 1:
        resampled_cube = resampled_cubes[0]
    else:
        resampled_cube = xr.merge(resampled_cubes)

    # TODO: add time_bnds to resampled_ds
    time_coverage_start = '%s' % dataset.time[0]
    time_coverage_end = '%s' % dataset.time[-1]

    resampled_cube.attrs.update(metadata or {})
    # TODO: add other time_coverage_ attributes
    resampled_cube.attrs.update(time_coverage_start=time_coverage_start,
                                time_coverage_end=time_coverage_end)

    schema = CubeSchema.new(dataset)
    chunk_sizes = {
        schema.dims[i]: schema.chunks[i]
        for i in range(schema.ndim)
    }

    if isinstance(time_chunk_size, int) and time_chunk_size >= 0:
        chunk_sizes['time'] = time_chunk_size

    return resampled_cube.chunk(chunk_sizes)
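A usage sketch, assuming resample_in_time as defined above is in scope; the cube is chunked up front so that the CubeSchema chunk handling above has chunk sizes to work with, and the chunk sizes themselves are arbitrary:

# Sketch: weekly mean resampling of a chunked cube.
from xcube.core.new import new_cube

cube = new_cube(variables=dict(temperature=275.2)).chunk(dict(time=1, lat=90, lon=90))
weekly = resample_in_time(cube, frequency='1W', method='mean')
# Data variables are renamed with the method suffix, e.g. 'temperature_mean'.
print(list(weekly.data_vars))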