def test_dry_run(self):
    result = self.invoke_cli(['prune', self.TEST_CUBE, "--dry-run"])
    self.assertEqual(0, result.exit_code)
    self.assertEqual(
        "Opening cube from 'test.zarr'...\n"
        "Identifying empty blocks...\n"
        "Deleting 24 empty block file(s) for variable 'precipitation'...\n"
        "Deleting 24 empty block file(s) for variable 'temperature'...\n"
        "Done, 48 block file(s) deleted.\n",
        result.stdout)
    expected_file_names = sorted([
        '.zarray', '.zattrs',
        '0.0.0', '0.0.1', '0.0.2', '0.0.3',
        '0.1.0', '0.1.1', '0.1.2', '0.1.3',
        '1.0.0', '1.0.1', '1.0.2', '1.0.3',
        '1.1.0', '1.1.1', '1.1.2', '1.1.3',
        '2.0.0', '2.0.1', '2.0.2', '2.0.3',
        '2.1.0', '2.1.1', '2.1.2', '2.1.3'
    ])
    self.assertEqual(expected_file_names,
                     sorted(os.listdir('test.zarr/precipitation')))
    self.assertEqual(expected_file_names,
                     sorted(os.listdir('test.zarr/temperature')))
    ds = xr.open_zarr('test.zarr')
    assert_cube(ds)
    self.assertIn('precipitation', ds)
    self.assertEqual((3, 180, 360), ds.precipitation.shape)
    self.assertEqual(('time', 'lat', 'lon'), ds.precipitation.dims)
    self.assertIn('temperature', ds)
    self.assertEqual((3, 180, 360), ds.temperature.shape)
    self.assertEqual(('time', 'lat', 'lon'), ds.temperature.dims)
def open_dataset(input_path: str,
                 format_name: str = None,
                 is_cube: bool = False,
                 **kwargs) -> xr.Dataset:
    """
    Open a dataset from *input_path*.
    If *format_name* is not provided, it will be guessed from *input_path*.

    :param input_path: input path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param is_cube: If True, a ValueError will be raised if the dataset
        read from *input_path* is not a valid xcube dataset.
    :param kwargs: format-specific keyword arguments
    :return: dataset object
    """
    format_name = format_name if format_name else guess_dataset_format(input_path)
    if format_name is None:
        raise ValueError("Unknown input format")
    dataset_io = find_dataset_io(format_name, modes=["r"])
    if dataset_io is None:
        raise ValueError(f"Unknown input format {format_name!r} for {input_path}")
    dataset = dataset_io.read(input_path, **kwargs)
    if is_cube:
        assert_cube(dataset)
    return dataset
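# Usage sketch for open_dataset() above; 'demo.zarr' is an illustrative
# path. With is_cube=True the opened dataset is validated via assert_cube().
ds = open_dataset('demo.zarr', format_name='zarr', is_cube=True)
print(list(ds.data_vars))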
def open_ml_dataset_from_local_fs(ctx: ServiceContext,
                                  dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(f"Missing 'Path' entry in dataset descriptor {ds_id}")
    if not os.path.isabs(path):
        path = os.path.join(ctx.base_dir, path)

    data_format = dataset_descriptor.get('Format', guess_cube_format(path))

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path))
            return BaseMultiLevelDataset(ds)
    if data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path))
            return BaseMultiLevelDataset(ds)
    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path)

    raise ServiceConfigError(f"Illegal data format {data_format!r} for dataset {ds_id}")
def vars_to_dim(cube: xr.Dataset,
                dim_name: str = 'var',
                var_name: str = 'data',
                cube_asserted: bool = False) -> xr.Dataset:
    """
    Convert data variables into a dimension.

    :param cube: The xcube dataset.
    :param dim_name: The name of the new dimension and coordinate variable.
        Defaults to 'var'.
    :param var_name: The name of the new, single data variable.
        Defaults to 'data'.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is
        expected to be a valid cube.
    :return: A new xcube dataset with data variables turned into a new
        dimension.
    """
    if not cube_asserted:
        assert_cube(cube)

    if var_name == dim_name:
        raise ValueError("var_name must be different from dim_name")

    data_var_names = [data_var_name for data_var_name in cube.data_vars]
    if not data_var_names:
        raise ValueError("cube must not be empty")

    da = xr.concat([cube[data_var_name] for data_var_name in data_var_names], dim_name)
    new_coord_var = xr.DataArray(data_var_names, dims=[dim_name])
    da = da.assign_coords(**{dim_name: new_coord_var})

    return xr.Dataset({var_name: da})
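# Usage sketch for vars_to_dim(), built on new_cube() as used in the tests
# above (the variable values are illustrative constants). Two data variables
# become a single 'data' variable with a new leading 'var' dimension.
demo_cube = new_cube(variables=dict(precipitation=0.4, temperature=275.2))
stacked = vars_to_dim(demo_cube, dim_name='var', var_name='data')
assert list(stacked.data_vars) == ['data']
assert list(stacked['var'].values) == ['precipitation', 'temperature']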
def test_no_dry_run(self):
    result = self.invoke_cli(['prune', self.TEST_CUBE, "-vv"])
    self.assertEqual(0, result.exit_code)
    self.assertEqual(
        "Opening dataset from 'test.zarr'...\n"
        "Identifying empty chunks...\n"
        "Found empty chunks in variable 'precipitation', "
        "deleting block files...\n"
        "Deleted 24 block file(s) for variable 'precipitation'.\n"
        "Found empty chunks in variable 'temperature', "
        "deleting block files...\n"
        "Deleted 24 block file(s) for variable 'temperature'.\n"
        "Done, 48 block file(s) deleted total.\n",
        result.stdout)
    expected_file_names = sorted(['.zarray', '.zattrs'])
    self.assertEqual(expected_file_names,
                     sorted(os.listdir('test.zarr/precipitation')))
    self.assertEqual(expected_file_names,
                     sorted(os.listdir('test.zarr/temperature')))
    ds = xr.open_zarr('test.zarr')
    assert_cube(ds)
    self.assertIn('precipitation', ds)
    self.assertEqual((3, 180, 360), ds.precipitation.shape)
    self.assertEqual(('time', 'lat', 'lon'), ds.precipitation.dims)
    self.assertIn('temperature', ds)
    self.assertEqual((3, 180, 360), ds.temperature.shape)
    self.assertEqual(('time', 'lat', 'lon'), ds.temperature.dims)
def test_rectify_multiple_comma_separated_vars(self):
    """Test that rectify selects the desired variables when multiple
    --var options, some with multiple comma-separated variable names as
    an argument, are passed."""

    # For now, specify the image geometry explicitly with --size, --point,
    # and --res to avoid triggering an "invalid y_min" ValueError when
    # ImageGeom tries to determine it automatically. Once Issue #303 has
    # been fixed, these options can be omitted.
    result = self.invoke_cli([
        'rectify',
        '--size', '508,253',
        '--point', '-179.5,-89.5',
        '--res', '0.7071067811865475',
        '--var', 'precipitation,temperature',
        '--var', 'soil_moisture',
        TEST_ZARR_DIR
    ])
    self.assertEqual(0, result.exit_code)
    self.assertEqual("Opening dataset from 'test.zarr'...\n"
                     "Rectifying...\n"
                     "Writing rectified dataset to 'out.zarr'...\n"
                     "Done.\n",
                     result.stdout)
    self.assertTrue(os.path.isdir('out.zarr'))
    ds = xr.open_zarr('out.zarr')
    assert_cube(ds)
    self.assertIn('precipitation', ds)
    self.assertIn('temperature', ds)
    self.assertIn('soil_moisture', ds)
def test_with_vars(self):
    result = self.invoke_cli(['resample', TEST_ZARR_DIR,
                              '--vars', 'temperature,precipitation'])
    self.assertEqual(0, result.exit_code)
    self.assertTrue(os.path.isdir('out.zarr'))
    ds = xr.open_zarr('out.zarr')
    assert_cube(ds)
    self.assertIn('precipitation_mean', ds)
    self.assertIn('temperature_mean', ds)
    self.assertNotIn('soil_moisture_mean', ds)
def get_cube_values_for_points(cube: xr.Dataset,
                               points: Union[xr.Dataset, pd.DataFrame, Mapping[str, Any]],
                               var_names: Sequence[str] = None,
                               include_coords: bool = False,
                               include_bounds: bool = False,
                               include_indexes: bool = False,
                               index_name_pattern: str = DEFAULT_INDEX_NAME_PATTERN,
                               include_refs: bool = False,
                               ref_name_pattern: str = DEFAULT_REF_NAME_PATTERN,
                               method: str = DEFAULT_INTERP_POINT_METHOD,
                               cube_asserted: bool = False) -> xr.Dataset:
    """
    Extract values from *cube* variables at given coordinates in *points*.

    :param cube: The cube dataset.
    :param points: A mapping from dimension names to coordinate arrays; may
        also be given as an xarray.Dataset or a pandas.DataFrame.
    :param var_names: An optional list of names of data variables in *cube*
        whose values shall be extracted.
    :param include_coords: Whether to include the cube coordinates for each
        point in return value.
    :param include_bounds: Whether to include the cube coordinate boundaries
        (if any) for each point in return value.
    :param include_indexes: Whether to include computed indexes into the cube
        for each point in return value.
    :param index_name_pattern: A naming pattern for the computed index
        columns. Must include "{name}" which will be replaced by the indexed
        dimension's name.
    :param include_refs: Whether to include point (reference) values in
        return value.
    :param ref_name_pattern: A naming pattern for the computed point data
        columns. Must include "{name}" which will be replaced by the point's
        attribute name.
    :param method: "nearest" or "linear".
    :param cube_asserted: If False, *cube* will be verified, otherwise it is
        expected to be a valid cube.
    :return: A new data frame whose columns are values from *cube* variables
        at given *points*.
    """
    if not cube_asserted:
        assert_cube(cube)

    point_indexes = get_cube_point_indexes(
        cube,
        points,
        index_name_pattern=index_name_pattern,
        index_dtype=np.int64 if method == POINT_INTERP_METHOD_NEAREST else np.float64,
        cube_asserted=True)

    cube_values = get_cube_values_for_indexes(
        cube,
        point_indexes,
        include_coords,
        include_bounds,
        data_var_names=var_names,
        index_name_pattern=index_name_pattern,
        method=method,
        cube_asserted=True)

    if include_indexes:
        cube_values.update(point_indexes)

    if include_refs:
        point_refs = xr.Dataset(
            {ref_name_pattern.format(name=name): xr.DataArray(points[name], dims=[INDEX_DIM_NAME])
             for name in points.keys()})
        cube_values.update(point_refs)

    return cube_values
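# Usage sketch for get_cube_values_for_points(). The point coordinates are
# illustrative and assume the new_cube() defaults implied by the tests above
# (daily time steps starting 2010-01-01 on a global 1-degree grid).
demo_cube = new_cube(variables=dict(precipitation=0.4))
demo_points = dict(time=np.array(['2010-01-02'], dtype='datetime64[ns]'),
                   lat=np.array([20.0]),
                   lon=np.array([10.0]))
point_values = get_cube_values_for_points(demo_cube, demo_points,
                                          include_refs=True)
print(point_values.precipitation.values)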
def test_assert_cube_illegal_coord_var(self):
    cube = new_cube(variables=dict(precipitation=0.5))
    cube = cube.assign_coords(
        lat=xr.DataArray(np.outer(cube.lat, np.ones(cube.lon.size)),
                         dims=("y", "x")),
        lon=xr.DataArray(np.outer(np.ones(cube.lat.size), cube.lon),
                         dims=("y", "x")))
    with self.assertRaises(ValueError) as cm:
        assert_cube(cube)
    self.assertEqual("Dataset is not a valid xcube dataset, because:\n"
                     "- missing spatial x,y coordinate variables.",
                     f"{cm.exception}")
def get_cube_point_indexes(cube: xr.Dataset,
                           points: PointsLike,
                           dim_name_mapping: Mapping[str, str] = None,
                           index_name_pattern: str = DEFAULT_INDEX_NAME_PATTERN,
                           index_dtype=np.float64,
                           cube_asserted: bool = False) -> xr.Dataset:
    """
    Get indexes of given point coordinates *points* into the given *cube*.

    :param cube: The cube dataset.
    :param points: A mapping from column names to column data arrays, which
        must all have the same length.
    :param dim_name_mapping: A mapping from dimension names in *cube* to
        column names in *points*.
    :param index_name_pattern: A naming pattern for the computed index
        columns. Must include "{name}" which will be replaced by the
        dimension name.
    :param index_dtype: Numpy data type for the indexes. If it is a floating
        point type (default), then *indexes* will contain fractions, which
        may be used for interpolation. For out-of-range coordinates in
        *points*, indexes will be -1 if *index_dtype* is an integer type,
        and NaN if *index_dtype* is a floating point type.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is
        expected to be a valid cube.
    :return: A dataset containing the index columns.
    """
    if not cube_asserted:
        assert_cube(cube)

    dim_name_mapping = dim_name_mapping if dim_name_mapping is not None else {}
    dim_names = _get_cube_data_var_dims(cube)
    col_names = [dim_name_mapping.get(str(dim_name), dim_name)
                 for dim_name in dim_names]

    points, _ = _normalize_series(points, col_names,
                                  force_dataset=False,
                                  param_name="points")

    indexes = []
    for dim_name, col_name in zip(dim_names, col_names):
        col = points[col_name]
        coord_indexes = get_dataset_indexes(cube, str(dim_name), col,
                                            index_dtype=index_dtype)
        indexes.append((index_name_pattern.format(name=dim_name),
                        xr.DataArray(coord_indexes, dims=[INDEX_DIM_NAME])))

    return xr.Dataset(dict(indexes))
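# Usage sketch for get_cube_point_indexes(). With the default float64
# index_dtype, the out-of-range latitude below yields a NaN index that
# callers can filter on. All coordinate values are illustrative.
demo_cube = new_cube(variables=dict(precipitation=0.4))
demo_points = dict(
    time=np.array(['2010-01-02', '2010-01-03'], dtype='datetime64[ns]'),
    lat=np.array([20.0, -999.0]),  # second latitude is out of range
    lon=np.array([10.0, 10.0]))
demo_indexes = get_cube_point_indexes(demo_cube, demo_points)
print(demo_indexes)  # index columns named via index_name_pattern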
def test_downsample_with_multiple_methods(self):
    result = self.invoke_cli(['resample',
                              '--variables', 'temperature',
                              '-F', 'all',
                              '-M', 'mean',
                              '-M', 'count',
                              '-M', 'prod',
                              TEST_ZARR_DIR])
    self.assertEqual(0, result.exit_code)
    self.assertTrue(os.path.isdir('out.zarr'))
    ds = xr.open_zarr('out.zarr')
    assert_cube(ds)
    self.assertIn('temperature_mean', ds)
    self.assertIn('temperature_count', ds)
    self.assertIn('temperature_prod', ds)
def test_all_defaults(self):
    result = self.invoke_cli(['resample', TEST_ZARR_DIR])
    self.assertEqual(0, result.exit_code)
    self.assertEqual("Opening cube from 'test.zarr'...\n"
                     "Resampling...\n"
                     "Writing resampled cube to 'out.zarr'...\n"
                     "Done.\n",
                     result.stdout)
    self.assertTrue(os.path.isdir('out.zarr'))
    ds = xr.open_zarr('out.zarr')
    assert_cube(ds)
    self.assertIn('precipitation_mean', ds)
    self.assertIn('temperature_mean', ds)
    self.assertIn('soil_moisture_mean', ds)
def _assert_result_ok(self, result, level_chunks: List[Tuple],
                      output_path: str, message_regex: str):
    self.assertEqual(0, result.exit_code)
    self.assertRegex(result.stdout, message_regex)
    self.assertTrue(os.path.isdir(output_path))
    level_datasets = read_levels(output_path)
    for level, level_dataset in enumerate(level_datasets):
        assert_cube(level_dataset)
        self.assertEqual({'precipitation', 'soil_moisture', 'temperature'},
                         set(level_dataset.data_vars.keys()))
        var_chunks = level_chunks[level]
        for var_name, var in level_dataset.data_vars.items():
            self.assertEqual(var_chunks, var.chunks,
                             f'{var_name} at level {level}')
def test_upsample_with_multiple_methods(self):
    result = self.invoke_cli(['resample',
                              '--variables', 'temperature',
                              '-F', '12H',
                              '-T', '6H',
                              # '-K', 'quadratic',
                              # '-M', 'interpolate',
                              '-M', 'nearest',
                              TEST_ZARR_DIR])
    self.assertEqual(0, result.exit_code)
    self.assertTrue(os.path.isdir('out.zarr'))
    ds = xr.open_zarr('out.zarr')
    assert_cube(ds)
    # self.assertIn('temperature_interpolate', ds)
    self.assertIn('temperature_nearest', ds)
def test_assert_cube_illegal_data_var(self):
    cube = new_cube(variables=dict(precipitation=0.5))
    shape = cube.dims["lat"], cube.dims["lon"]
    cube["chl"] = xr.DataArray(np.random.rand(*shape),
                               dims=("lat", "lon"),
                               coords=dict(lat=cube.lat, lon=cube.lon))
    with self.assertRaises(ValueError) as cm:
        assert_cube(cube)
    self.assertEqual("Dataset is not a valid xcube dataset, because:\n"
                     "- dimensions of data variable 'chl' must be"
                     " ('time', ..., 'lat', 'lon'), but were ('lat', 'lon') for 'chl';\n"
                     "- dimensions of all data variables must be same,"
                     " but found ('time', 'lat', 'lon') for 'precipitation'"
                     " and ('lat', 'lon') for 'chl'.",
                     f"{cm.exception}")
def _get_dataset_lazily(self, index: int, parameters: Dict[str, Any]) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param parameters: keyword arguments passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        # the link file must be opened for reading, not writing
        with self._s3_file_system.open(level_path, "r") as fp:
            level_path = fp.read()
        # if level_path is a relative path, resolve it against the levels directory
        if not os.path.isabs(level_path):
            base_dir = os.path.dirname(self._dir_path)
            level_path = os.path.join(base_dir, level_path)
    store = s3fs.S3Map(root=level_path, s3=self._s3_file_system, check=False)
    max_size = self.get_chunk_cache_capacity(index)
    if max_size:
        store = zarr.LRUStoreCache(store, max_size=max_size)
    with measure_time(tag=f"opened remote dataset {level_path} for level {index}"):
        consolidated = self._s3_file_system.exists(f'{level_path}/.zmetadata')
        return assert_cube(xr.open_zarr(store,
                                        consolidated=consolidated,
                                        **parameters),
                           name=level_path)
def test_assert_cube_illegal_coord_bounds_var(self):
    cube = new_cube(variables=dict(precipitation=0.5))
    lat_bnds = np.zeros((cube.time.size, cube.lat.size, 2))
    lon_bnds = np.zeros((cube.time.size, cube.lon.size, 2), dtype=np.float16)
    lat_bnds[:, :, :] = cube.lat_bnds
    lon_bnds[:, :, :] = cube.lon_bnds
    cube = cube.assign_coords(
        lat_bnds=xr.DataArray(lat_bnds, dims=("time", "lat", "bnds")),
        lon_bnds=xr.DataArray(lon_bnds, dims=("time", "lon", "bnds")))
    with self.assertRaises(ValueError) as cm:
        assert_cube(cube)
    self.assertEqual("Dataset is not a valid xcube dataset, because:\n"
                     "- bounds coordinate variable 'lon_bnds' must have dimensions ('lon', <bounds_dim>);\n"
                     "- shape of bounds coordinate variable 'lon_bnds' must be (360, 2) but was (5, 360, 2);\n"
                     "- type of bounds coordinate variable 'lon_bnds' must be dtype('float64') but was dtype('float16');\n"
                     "- bounds coordinate variable 'lat_bnds' must have dimensions ('lat', <bounds_dim>);\n"
                     "- shape of bounds coordinate variable 'lat_bnds' must be (180, 2) but was (5, 180, 2).",
                     f"{cm.exception}")
def write_cube(cube: xr.Dataset,
               output_path: str,
               format_name: str = None,
               cube_asserted: bool = False,
               **kwargs) -> xr.Dataset:
    """
    Write an xcube dataset to *output_path*.
    If *format_name* is not provided, it will be guessed from *output_path*.

    :param cube: xcube dataset to be written.
    :param output_path: output path
    :param format_name: format, e.g. "zarr" or "netcdf4"
    :param cube_asserted: If False, *cube* will be verified, otherwise it is
        expected to be a valid cube.
    :param kwargs: format-specific keyword arguments
    :return: the xcube dataset *cube*
    """
    if not cube_asserted:
        assert_cube(cube)
    return write_dataset(cube, output_path, format_name=format_name, **kwargs)
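# Usage sketch for write_cube(); the output path is illustrative, and the
# format is guessed from the '.zarr' extension when format_name is omitted.
demo_cube = new_cube(variables=dict(precipitation=0.4))
write_cube(demo_cube, 'demo_out.zarr')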
def cubify_dataset(ds: xr.Dataset) -> xr.Dataset:
    """
    Normalize the geo- and time-coding upon opening the given dataset
    w.r.t. a common (CF-compatible) convention.

    Will raise a ValueError if the dataset cannot be converted to a cube.
    """
    ds = normalize_dataset(ds)
    return assert_cube(ds)
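# Usage sketch for cubify_dataset(): normalize an arbitrary dataset and fail
# fast if it still is not a valid cube. The input path is illustrative.
raw_ds = xr.open_dataset('some_input.nc')
demo_cube = cubify_dataset(raw_ds)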
def open_ml_dataset_from_local_fs(path: str,
                                  data_format: str = None,
                                  ds_id: str = None,
                                  exception_type: type = ValueError,
                                  **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    if data_format == FORMAT_NAME_NETCDF4:
        with measure_time(tag=f"opened local NetCDF dataset {path}"):
            ds = assert_cube(xr.open_dataset(path, **kwargs))
            return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_ZARR:
        with measure_time(tag=f"opened local zarr dataset {path}"):
            ds = assert_cube(xr.open_zarr(path, **kwargs))
            return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened local levels dataset {path}"):
            return FileStorageMultiLevelDataset(path, ds_id=ds_id, zarr_kwargs=kwargs)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}')
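# Usage sketch for open_ml_dataset_from_local_fs() above; the '.levels' path
# is illustrative. Per _get_dataset_lazily() below, levels are indexed
# datasets; level 0 is assumed here to be the full-resolution cube.
ml_ds = open_ml_dataset_from_local_fs('demo.levels', ds_id='demo')
base_ds = ml_ds.get_dataset(0)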
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        client_kwargs: Mapping[str, Any] = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    endpoint_url, root = split_bucket_url(path)
    if endpoint_url:
        kwargs['endpoint_url'] = endpoint_url
        path = root

    client_kwargs = dict(client_kwargs or {})
    for arg_name in ['endpoint_url', 'region_name']:
        if arg_name in kwargs:
            client_kwargs[arg_name] = kwargs.pop(arg_name)

    obs_file_system = s3fs.S3FileSystem(anon=True, client_kwargs=client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(xr.open_zarr(cached_store,
                                          consolidated=consolidated,
                                          **kwargs))
            return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(obs_file_system,
                                                  path,
                                                  zarr_kwargs=kwargs,
                                                  ds_id=ds_id,
                                                  exception_type=exception_type)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}')
def _get_dataset_lazily(self, index: int, **zarr_kwargs) -> xr.Dataset:
    """
    Read the dataset for the level at given *index*.

    :param index: the level index
    :param zarr_kwargs: kwargs passed to xr.open_zarr()
    :return: the dataset for the level at *index*.
    """
    ext, level_path = self._level_paths[index]
    if ext == ".link":
        with open(level_path, "r") as fp:
            level_path = fp.read()
        # if level_path is a relative path, resolve it against the levels directory
        if not os.path.isabs(level_path):
            base_dir = os.path.dirname(self._dir_path)
            level_path = os.path.join(base_dir, level_path)
    with measure_time(tag=f"opened local dataset {level_path} for level {index}"):
        return assert_cube(xr.open_zarr(level_path, **zarr_kwargs),
                           name=level_path)
def _get_dataset_lazily(self, index: int, **kwargs) -> xr.Dataset:
    input_datasets = [self._input_ml_dataset_getter(ds_id).get_dataset(index)
                      for ds_id in self._input_ml_dataset_ids]
    try:
        with measure_time(tag=f"computed in-memory dataset {self._ds_id!r} at level {index}"):
            computed_value = self._callable_obj(*input_datasets, **kwargs)
    except Exception as e:
        raise self._exception_type(
            f"Failed to compute in-memory dataset {self._ds_id!r} at level {index} "
            f"from function {self._callable_name!r}: {e}") from e
    if not isinstance(computed_value, xr.Dataset):
        raise self._exception_type(
            f"Failed to compute in-memory dataset {self._ds_id!r} at level {index} "
            f"from function {self._callable_name!r}: "
            f"expected an xarray.Dataset but got {type(computed_value)}")
    return assert_cube(computed_value, name=self._ds_id)
def open_ml_dataset_from_object_storage(path: str,
                                        data_format: str = None,
                                        ds_id: str = None,
                                        exception_type: type = ValueError,
                                        s3_kwargs: Mapping[str, Any] = None,
                                        s3_client_kwargs: Mapping[str, Any] = None,
                                        chunk_cache_capacity: int = None,
                                        **kwargs) -> MultiLevelDataset:
    data_format = data_format or guess_ml_dataset_format(path)

    s3, root = parse_s3_fs_and_root(path,
                                    s3_kwargs=s3_kwargs,
                                    s3_client_kwargs=s3_client_kwargs,
                                    mode='r')

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=root, s3=s3, check=False)
        if chunk_cache_capacity:
            store = zarr.LRUStoreCache(store, max_size=chunk_cache_capacity)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = s3.exists(f'{root}/.zmetadata')
            ds = assert_cube(xr.open_zarr(store, consolidated=consolidated, **kwargs))
            return BaseMultiLevelDataset(ds, ds_id=ds_id)
    elif data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(s3,
                                                  root,
                                                  zarr_kwargs=kwargs,
                                                  ds_id=ds_id,
                                                  chunk_cache_capacity=chunk_cache_capacity,
                                                  exception_type=exception_type)

    raise exception_type(
        f'Unrecognized multi-level dataset format {data_format!r} for path {path!r}')
def open_ml_dataset_from_object_storage(ctx: ServiceContext,
                                        dataset_descriptor: DatasetDescriptor) -> MultiLevelDataset:
    ds_id = dataset_descriptor.get('Identifier')

    path = dataset_descriptor.get('Path')
    if not path:
        raise ServiceConfigError(f"Missing 'Path' entry in dataset descriptor {ds_id}")

    data_format = dataset_descriptor.get('Format', FORMAT_NAME_ZARR)

    s3_client_kwargs = {}
    if 'Endpoint' in dataset_descriptor:
        s3_client_kwargs['endpoint_url'] = dataset_descriptor['Endpoint']
    if 'Region' in dataset_descriptor:
        s3_client_kwargs['region_name'] = dataset_descriptor['Region']
    obs_file_system = s3fs.S3FileSystem(anon=True, client_kwargs=s3_client_kwargs)

    if data_format == FORMAT_NAME_ZARR:
        store = s3fs.S3Map(root=path, s3=obs_file_system, check=False)
        cached_store = zarr.LRUStoreCache(store, max_size=2 ** 28)
        with measure_time(tag=f"opened remote zarr dataset {path}"):
            consolidated = obs_file_system.exists(f'{path}/.zmetadata')
            ds = assert_cube(xr.open_zarr(cached_store, consolidated=consolidated))
            return BaseMultiLevelDataset(ds)
    if data_format == FORMAT_NAME_LEVELS:
        with measure_time(tag=f"opened remote levels dataset {path}"):
            return ObjectStorageMultiLevelDataset(ds_id,
                                                  obs_file_system,
                                                  path,
                                                  exception_type=ServiceConfigError)

    # fail on unknown formats, mirroring the local-FS variant above
    raise ServiceConfigError(f"Illegal data format {data_format!r} for dataset {ds_id}")
def get_cube_values_for_indexes(cube: xr.Dataset,
                                indexes: Union[xr.Dataset, pd.DataFrame, Mapping[str, Any]],
                                include_coords: bool = False,
                                include_bounds: bool = False,
                                data_var_names: Sequence[str] = None,
                                index_name_pattern: str = DEFAULT_INDEX_NAME_PATTERN,
                                method: str = DEFAULT_INTERP_POINT_METHOD,
                                cube_asserted: bool = False) -> xr.Dataset:
    """
    Get values from the *cube* at given *indexes*.

    :param cube: A cube dataset.
    :param indexes: A mapping from column names to index and fraction
        arrays for all cube dimensions.
    :param include_coords: Whether to include the cube coordinates for each
        point in return value.
    :param include_bounds: Whether to include the cube coordinate boundaries
        (if any) for each point in return value.
    :param data_var_names: An optional list of names of data variables in
        *cube* whose values shall be extracted.
    :param index_name_pattern: A naming pattern for the computed index
        columns. Must include "{name}" which will be replaced by the
        dimension name.
    :param method: "nearest" or "linear".
    :param cube_asserted: If False, *cube* will be verified, otherwise it is
        expected to be a valid cube.
    :return: A new data frame whose columns are values from *cube* variables
        at given *indexes*.
    """
    if not cube_asserted:
        assert_cube(cube)

    if method not in {POINT_INTERP_METHOD_NEAREST, POINT_INTERP_METHOD_LINEAR}:
        raise ValueError(f"invalid method {method!r}")
    if method != POINT_INTERP_METHOD_NEAREST:
        raise NotImplementedError(f"method {method!r} not yet implemented")

    all_data_var_names = tuple(cube.data_vars.keys())
    if len(all_data_var_names) == 0:
        raise ValueError("cube is empty")

    if data_var_names is not None:
        if len(data_var_names) == 0:
            return xr.Dataset(coords=indexes.coords if hasattr(indexes, "coords") else None)
        for var_name in data_var_names:
            if var_name not in cube.data_vars:
                raise ValueError(f"variable {var_name!r} not found in cube")
    else:
        data_var_names = all_data_var_names

    dim_names = cube[data_var_names[0]].dims
    num_dims = len(dim_names)
    index_names = [index_name_pattern.format(name=dim_name) for dim_name in dim_names]
    num_points = _validate_points(indexes, index_names, param_name="indexes")
    indexes = _normalize_points(indexes)

    cube = xr.Dataset({var_name: cube[var_name] for var_name in data_var_names},
                      coords=cube.coords)

    new_bounds_vars = {}
    bounds_var_names = _get_coord_bounds_var_names(cube)
    drop_coords = None
    if bounds_var_names:
        if include_bounds:
            # Flatten any coordinate bounds variables
            for var_name, bnds_var_name in bounds_var_names.items():
                bnds_var = cube[bnds_var_name]
                new_bounds_vars[f"{var_name}_lower"] = bnds_var[:, 0]
                new_bounds_vars[f"{var_name}_upper"] = bnds_var[:, 1]
            cube = cube.assign_coords(**new_bounds_vars)
        cube = cube.drop_vars(bounds_var_names.values())
        if not include_coords:
            drop_coords = set(cube.coords).difference(new_bounds_vars.keys())
    else:
        if not include_coords:
            drop_coords = set(cube.coords)

    # Generate a validation condition so we can filter out invalid rows:
    # invalid indexes are -1 for integer dtypes and NaN for float dtypes
    is_valid_point = None
    for index_name in index_names:
        col = indexes[index_name]
        if np.issubdtype(col.dtype, np.integer):
            condition = col >= 0
        else:
            condition = np.logical_not(np.isnan(col))
        if is_valid_point is None:
            is_valid_point = condition
        else:
            is_valid_point = np.logical_and(is_valid_point, condition)

    num_valid_points = np.count_nonzero(is_valid_point)
    if num_valid_points == num_points:
        # All indexes are valid
        cube_selector = {dim_names[i]: indexes[index_names[i]] for i in range(num_dims)}
        cube_values = cube.isel(cube_selector)
    elif num_valid_points == 0:
        # All indexes are invalid
        new_vars = {}
        for var_name in cube.variables:
            new_vars[var_name] = _empty_points_var(cube[var_name], num_points)
        cube_values = xr.Dataset(new_vars)
    else:
        # Some indexes are invalid
        idx = np.arange(num_points)
        good_idx = idx[is_valid_point.values]
        idx_dim_name = indexes[index_names[0]].dims[0]
        good_indexes = indexes.isel({idx_dim_name: good_idx})
        cube_selector = {dim_names[i]: good_indexes[index_names[i]] for i in range(num_dims)}
        cube_values = cube.isel(cube_selector)
        new_vars = {}
        for var_name in cube.variables:
            var = cube_values[var_name]
            new_var = _empty_points_var(var, num_points)
            new_var[good_idx] = var
            new_vars[var_name] = new_var
        cube_values = xr.Dataset(new_vars)

    if drop_coords:
        cube_values = cube_values.drop_vars(drop_coords)

    return cube_values
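# Usage sketch chaining get_cube_point_indexes() with
# get_cube_values_for_indexes(), as get_cube_values_for_points() does
# internally. Integer indexes match the "nearest" method; values are
# illustrative.
demo_cube = new_cube(variables=dict(precipitation=0.4))
demo_points = dict(time=np.array(['2010-01-02'], dtype='datetime64[ns]'),
                   lat=np.array([20.0]),
                   lon=np.array([10.0]))
demo_indexes = get_cube_point_indexes(demo_cube, demo_points,
                                      index_dtype=np.int64)
demo_values = get_cube_values_for_indexes(demo_cube, demo_indexes)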
def test_assert_cube_without_bounds(self):
    cube = new_cube(variables=dict(precipitation=0.5), drop_bounds=True)
    self.assertIs(cube, assert_cube(cube))
def test_assert_cube_ok(self):
    cube = new_cube(variables=dict(precipitation=0.5))
    self.assertIs(cube, assert_cube(cube))
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid
    GeoJSON object, a valid WKT string, a sequence of box coordinates
    (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers an
    area, i.e. is not a point, the function aggregates the variables to
    compute a mean value and, if desired, the number of valid observations
    and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64 or an
    ISO datetime string.

    Returns a time-series dataset whose data variables have a time dimension
    but no longer have spatial dimensions, hence the resulting dataset's
    variables will only have N-2 dimensions. A global attribute
    ``max_number_of_observations`` will be set to the maximum number of
    observations that could have been made in each time step. If the given
    *geometry* does not overlap the cube's boundaries, or if no output
    variables remain, the function returns ``None``.

    :param cube: The xcube dataset
    :param geometry: Optional geometry
    :param var_names: Optional sequence of names of variables to be included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param include_count: Whether to include the number of valid observations
        for each time step. Ignored if geometry is a point.
    :param include_stdev: Whether to include standard deviation for each time
        step. Ignored if geometry is a point.
    :param use_groupby: Use group-by operation. May increase or decrease
        runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is
        expected to be a valid cube.
    :return: A new dataset with time-series for each variable.
    """
    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y, method='nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset,
                                           geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop_vars('__mask__')
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    ds_count = None
    ds_stdev = None
    if use_groupby:
        time_group = dataset.groupby('time')
        ds_mean = time_group.mean(skipna=True, dim=xr.ALL_DIMS)
        if include_count:
            ds_count = time_group.count(dim=xr.ALL_DIMS)
        if include_stdev:
            ds_stdev = time_group.std(skipna=True, dim=xr.ALL_DIMS)
    else:
        ds_mean = dataset.mean(dim=('lat', 'lon'), skipna=True)
        if include_count:
            ds_count = dataset.count(dim=('lat', 'lon'))
        if include_stdev:
            ds_stdev = dataset.std(dim=('lat', 'lon'), skipna=True)

    if ds_count is not None:
        ds_count = ds_count.rename(
            name_dict={v: f"{v}_count" for v in ds_count.data_vars})

    if ds_stdev is not None:
        ds_stdev = ds_stdev.rename(
            name_dict={v: f"{v}_stdev" for v in ds_stdev.data_vars})

    if ds_count is not None and ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev, ds_count])
    elif ds_count is not None:
        ts_dataset = xr.merge([ds_mean, ds_count])
    elif ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev])
    else:
        ts_dataset = ds_mean

    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset
def resample_in_time(dataset: xr.Dataset,
                     frequency: str,
                     method: Union[str, Sequence[str]],
                     offset=None,
                     base: int = 0,
                     tolerance=None,
                     interp_kind=None,
                     time_chunk_size=None,
                     var_names: Sequence[str] = None,
                     metadata: Dict[str, Any] = None,
                     cube_asserted: bool = False) -> xr.Dataset:
    """
    Resample a dataset in the time dimension.

    The argument *method* may be one or a sequence of ``'all'``, ``'any'``,
    ``'argmax'``, ``'argmin'``, ``'count'``, ``'first'``, ``'last'``,
    ``'max'``, ``'min'``, ``'mean'``, ``'median'``, ``'percentile_<p>'``,
    ``'std'``, ``'sum'``, ``'var'``.

    In ``'percentile_<p>'``, ``'<p>'`` is a placeholder that must be replaced
    by an integer percentage value, e.g. ``'percentile_90'`` is the
    90%-percentile.

    *Important note:* As of xarray 0.14 and dask 2.8, the methods
    ``'median'`` and ``'percentile_<p>'`` cannot be used if the variables in
    *dataset* comprise chunked dask arrays. In this case, use the
    ``compute()`` or ``load()`` method to convert dask arrays into numpy
    arrays.

    :param dataset: The xcube dataset.
    :param frequency: Temporal aggregation frequency. Use format
        "<count><offset>" where <offset> is one of 'H', 'D', 'W', 'M', 'Q',
        'Y'.
    :param method: Resampling method or sequence of resampling methods.
    :param offset: Offset used to adjust the resampled time labels. Uses same
        syntax as *frequency*.
    :param base: For frequencies that evenly subdivide 1 day, the "origin" of
        the aggregated intervals. For example, for '24H' frequency, base
        could range from 0 through 23.
    :param time_chunk_size: If not None, the chunk size to be used for the
        "time" dimension.
    :param var_names: Variable names to include.
    :param tolerance: Time tolerance for selective upsampling methods.
        Defaults to *frequency*.
    :param interp_kind: Kind of interpolation if *method* is 'interpolation'.
    :param metadata: Output metadata.
    :param cube_asserted: If False, *dataset* will be verified, otherwise it
        is expected to be a valid cube.
    :return: A new xcube dataset resampled in time.
    """
    if not cube_asserted:
        assert_cube(dataset)

    if frequency == 'all':
        time_gap = np.array(dataset.time[-1]) - np.array(dataset.time[0])
        days = int((np.timedelta64(time_gap, 'D') / np.timedelta64(1, 'D')) + 1)
        frequency = f'{days}D'

    if var_names:
        dataset = select_variables_subset(dataset, var_names)

    resampler = dataset.resample(skipna=True,
                                 closed='left',
                                 label='left',
                                 time=frequency,
                                 loffset=offset,
                                 base=base)

    if isinstance(method, str):
        methods = [method]
    else:
        methods = list(method)

    percentile_prefix = 'percentile_'

    resampled_cubes = []
    for method in methods:
        method_args = []
        method_postfix = method
        if method.startswith(percentile_prefix):
            p = int(method[len(percentile_prefix):])
            q = p / 100.0
            method_args = [q]
            method_postfix = f'p{p}'
            method = 'quantile'
        resampling_method = getattr(resampler, method)
        method_kwargs = get_method_kwargs(method, frequency, interp_kind, tolerance)
        resampled_cube = resampling_method(*method_args, **method_kwargs)
        resampled_cube = resampled_cube.rename(
            {var_name: f'{var_name}_{method_postfix}'
             for var_name in resampled_cube.data_vars})
        resampled_cubes.append(resampled_cube)

    if len(resampled_cubes) == 1:
        resampled_cube = resampled_cubes[0]
    else:
        resampled_cube = xr.merge(resampled_cubes)

    # TODO: add time_bnds to resampled_cube
    time_coverage_start = '%s' % dataset.time[0]
    time_coverage_end = '%s' % dataset.time[-1]

    resampled_cube.attrs.update(metadata or {})
    # TODO: add other time_coverage_ attributes
    resampled_cube.attrs.update(time_coverage_start=time_coverage_start,
                                time_coverage_end=time_coverage_end)

    schema = CubeSchema.new(dataset)
    chunk_sizes = {schema.dims[i]: schema.chunks[i] for i in range(schema.ndim)}
    if isinstance(time_chunk_size, int) and time_chunk_size >= 0:
        chunk_sizes['time'] = time_chunk_size

    return resampled_cube.chunk(chunk_sizes)