def test_select_variables_subset_all(self):
    ds1 = create_highroc_dataset()
    # noinspection PyTypeChecker
    ds2 = select_variables_subset(ds1, None)
    self.assertIs(ds2, ds1)
    ds2 = select_variables_subset(ds1, ds1.data_vars.keys())
    self.assertIs(ds2, ds1)
def transform_cube(self,
                   cube: xr.Dataset,
                   gm: GridMapping,
                   cube_config: CubeConfig) -> TransformedCube:
    desired_var_names = cube_config.variable_names
    if desired_var_names:
        cube = select_variables_subset(cube,
                                       var_names=desired_var_names)
        cube_config = cube_config.drop_props('variable_names')

    desired_bbox = cube_config.bbox
    if desired_bbox is not None:
        # Find out whether it is possible to make a spatial subset
        # without resampling. First, the grid mapping must be regular.
        can_do_spatial_subset = False
        if gm.is_regular:
            can_do_spatial_subset = True
            # The current spatial resolution must be the desired
            # spatial resolution, otherwise spatial resampling is
            # required later, which will include the desired
            # subsetting.
            desired_res = cube_config.spatial_res
            if desired_res is not None \
                    and not (math.isclose(gm.x_res, desired_res)
                             and math.isclose(gm.y_res, desired_res)):
                can_do_spatial_subset = False
            if can_do_spatial_subset:
                # Finally, the desired CRS must be equal to the
                # current one, or they must both be geographic.
                desired_crs = cube_config.crs
                if desired_crs:
                    desired_crs = pyproj.CRS.from_string(desired_crs)
                    if desired_crs != gm.crs \
                            and not (desired_crs.is_geographic
                                     and gm.crs.is_geographic):
                        can_do_spatial_subset = False
        if can_do_spatial_subset:
            cube = select_spatial_subset(cube, xy_bbox=desired_bbox)
            # Now that we have a new cube subset, we must adjust
            # its grid mapping.
            gm = GridMapping.from_dataset(
                cube,
                crs=gm.crs,
                xy_var_names=gm.xy_var_names,
            )
            # Consume spatial properties
            cube_config = cube_config.drop_props(['bbox',
                                                  'spatial_res',
                                                  'crs'])

    desired_time_range = cube_config.time_range
    if desired_time_range:
        cube = select_temporal_subset(cube,
                                      time_range=desired_time_range)
        cube_config = cube_config.drop_props('time_range')

    return cube, gm, cube_config
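# The subset-without-resampling decision above is easy to get wrong, so
# here is a condensed, self-contained sketch of the same logic. This is
# an illustrative rewrite under assumed parameter names, not xcube's
# actual API.
import math

import pyproj


def can_subset_without_resampling(gm_is_regular: bool,
                                  gm_res: float,
                                  gm_crs: pyproj.CRS,
                                  desired_res: float = None,
                                  desired_crs: str = None) -> bool:
    """Return True if a plain spatial subset suffices, i.e. no
    resampling is needed before applying the bounding box."""
    if not gm_is_regular:
        return False
    if desired_res is not None \
            and not math.isclose(gm_res, desired_res):
        return False
    if desired_crs:
        crs = pyproj.CRS.from_string(desired_crs)
        if crs != gm_crs and not (crs.is_geographic
                                  and gm_crs.is_geographic):
            return False
    return True


# Example: EPSG:4326 (WGS 84) and EPSG:4258 (ETRS89) are different
# CRSs, but both are geographic, so a plain subset is still allowed.
assert can_subset_without_resampling(True, 0.25,
                                     pyproj.CRS.from_string('EPSG:4326'),
                                     desired_res=0.25,
                                     desired_crs='EPSG:4258')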
def select_variables_subset(self, var_names: Sequence[str] = None):
    """
    Select data variables from the underlying dataset and create
    a new dataset.

    :param var_names: The names of data variables to select.
    :return: A new dataset. It is empty if *var_names* is empty.
        It is the underlying dataset itself if *var_names* is None.
    """
    return select_variables_subset(self._dataset, var_names)
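# A brief usage sketch of the module-level select_variables_subset();
# the variable names below are invented for illustration.
import numpy as np
import xarray as xr

ds = xr.Dataset({name: xr.DataArray(np.zeros((2, 2)),
                                    dims=('lat', 'lon'))
                 for name in ('conc_chl', 'conc_tsm', 'c2rcc_flags')})

assert select_variables_subset(ds, None) is ds              # None keeps all
assert len(select_variables_subset(ds, []).data_vars) == 0  # [] keeps none
assert list(select_variables_subset(ds, ['conc_chl']).data_vars) \
       == ['conc_chl']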
def transform(ds: xr.Dataset) -> xr.Dataset:
    # 'variables', 'indexers', 'spatial_dims', 'tile_width', and
    # 'tile_height' are free variables of the enclosing scope.
    if variables:
        ds = select_variables_subset(ds, var_names=variables)
    if indexers:
        ds = ds.sel(**indexers)
    # Chunk so that each spatial tile covers exactly one chunk.
    chunk_sizes = {dim: 1 for dim in ds.dims}
    chunk_sizes[spatial_dims[0]] = tile_width
    chunk_sizes[spatial_dims[1]] = tile_height
    return ds.chunk(chunk_sizes)
def step3(input_slice):
    extra_vars = input_processor.get_extra_vars(input_slice)
    selected_variables = {var_name for var_name, _ in output_variables}
    selected_variables.update(extra_vars or set())
    return select_variables_subset(input_slice, selected_variables)
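# For context: 'input_processor' and 'output_variables' in step3() are
# free variables of the enclosing scope. The shapes below are assumed
# for illustration only and need not match the project's actual types.
output_variables = [('conc_chl', 'chl_expression'),
                    ('conc_tsm', 'tsm_expression')]


class _DummyInputProcessor:
    def get_extra_vars(self, input_slice):
        # e.g. auxiliary variables required by later processing steps
        return {'sun_zenith'}


input_processor = _DummyInputProcessor()
# With these values, step3() keeps conc_chl, conc_tsm, and sun_zenith.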
def test_select_variables_subset_some(self):
    ds1 = create_highroc_dataset()
    self.assertEqual(36, len(ds1.data_vars))
    ds2 = select_variables_subset(
        ds1, ['conc_chl', 'c2rcc_flags', 'rtoa_10'])
    self.assertEqual(3, len(ds2.data_vars))
def test_select_variables_subset_none(self):
    ds1 = create_highroc_dataset()
    ds2 = select_variables_subset(ds1, [])
    self.assertEqual(0, len(ds2.data_vars))
    ds2 = select_variables_subset(ds1, ['bibo'])
    self.assertEqual(0, len(ds2.data_vars))
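# A minimal sketch of an implementation consistent with the tests
# above. This is not necessarily xcube's actual implementation; it
# merely satisfies the demonstrated semantics (None -> same dataset,
# unknown names silently ignored, full selection returns the original).
from typing import Collection

import xarray as xr


def select_variables_subset_sketch(dataset: xr.Dataset,
                                   var_names: Collection[str] = None
                                   ) -> xr.Dataset:
    if var_names is None:
        return dataset
    names = set(var_names)
    dropped = [v for v in dataset.data_vars if v not in names]
    if not dropped:
        return dataset
    return dataset.drop_vars(dropped)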
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid
    GeoJSON object, a valid WKT string, a sequence of box coordinates
    (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers
    an area, i.e. is not a point, the function aggregates the variables
    to compute a mean value and, if desired, the number of valid
    observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64
    or an ISO datetime string.

    Returns a time-series dataset whose data variables have a time
    dimension but no longer have spatial dimensions, hence the
    resulting dataset's variables will only have N-2 dimensions.
    A global attribute ``max_number_of_observations`` will be set to
    the maximum number of observations that could have been made in
    each time step. If the given *geometry* does not overlap the
    cube's boundaries, or if no output variables remain, the function
    returns ``None``.

    :param cube: The xcube dataset.
    :param geometry: Optional geometry.
    :param var_names: Optional sequence of names of variables
        to be included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param include_count: Whether to include the number of valid
        observations for each time step. Ignored if *geometry* is
        a point.
    :param include_stdev: Whether to include the standard deviation
        for each time step. Ignored if *geometry* is a point.
    :param use_groupby: Use a group-by operation. May increase or
        decrease runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified,
        otherwise it is expected to be a valid cube.
    :return: A new dataset with a time-series for each variable.
    """
    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y,
                              method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset, geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop('__mask__')
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    ds_count = None
    ds_stdev = None
    if use_groupby:
        time_group = dataset.groupby('time')
        ds_mean = time_group.mean(skipna=True, dim=xr.ALL_DIMS)
        if include_count:
            ds_count = time_group.count(dim=xr.ALL_DIMS)
        if include_stdev:
            ds_stdev = time_group.std(skipna=True, dim=xr.ALL_DIMS)
    else:
        ds_mean = dataset.mean(dim=('lat', 'lon'), skipna=True)
        if include_count:
            ds_count = dataset.count(dim=('lat', 'lon'))
        if include_stdev:
            ds_stdev = dataset.std(dim=('lat', 'lon'), skipna=True)

    if ds_count is not None:
        ds_count = ds_count.rename(
            name_dict={v: f"{v}_count" for v in ds_count.data_vars})

    if ds_stdev is not None:
        ds_stdev = ds_stdev.rename(
            name_dict={v: f"{v}_stdev" for v in ds_stdev.data_vars})

    if ds_count is not None and ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev, ds_count])
    elif ds_count is not None:
        ts_dataset = xr.merge([ds_mean, ds_count])
    elif ds_stdev is not None:
        ts_dataset = xr.merge([ds_mean, ds_stdev])
    else:
        ts_dataset = ds_mean

    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset
def resample_in_time(dataset: xr.Dataset,
                     frequency: str,
                     method: Union[str, Sequence[str]],
                     offset=None,
                     base: int = 0,
                     tolerance=None,
                     interp_kind=None,
                     time_chunk_size=None,
                     var_names: Sequence[str] = None,
                     metadata: Dict[str, Any] = None,
                     cube_asserted: bool = False) -> xr.Dataset:
    """
    Resample a dataset in the time dimension.

    The argument *method* may be one or a sequence of ``'all'``,
    ``'any'``, ``'argmax'``, ``'argmin'``, ``'count'``, ``'first'``,
    ``'last'``, ``'max'``, ``'min'``, ``'mean'``, ``'median'``,
    ``'percentile_<p>'``, ``'std'``, ``'sum'``, ``'var'``.

    In the value ``'percentile_<p>'``, ``'<p>'`` is a placeholder that
    must be replaced by an integer percentage value, e.g.
    ``'percentile_90'`` is the 90%-percentile.

    *Important note:* As of xarray 0.14 and dask 2.8, the methods
    ``'median'`` and ``'percentile_<p>'`` cannot be used if the
    variables in *dataset* comprise chunked dask arrays. In this
    case, use the ``compute()`` or ``load()`` method to convert dask
    arrays into numpy arrays.

    :param dataset: The xcube dataset.
    :param frequency: Temporal aggregation frequency. Use format
        "<count><offset>" where <offset> is one of 'H', 'D', 'W',
        'M', 'Q', 'Y'.
    :param method: Resampling method or sequence of resampling
        methods.
    :param offset: Offset used to adjust the resampled time labels.
        Uses the same syntax as *frequency*.
    :param base: For frequencies that evenly subdivide 1 day, the
        "origin" of the aggregated intervals. For example, for '24H'
        frequency, base could range from 0 through 23.
    :param time_chunk_size: If not None, the chunk size to be used
        for the "time" dimension.
    :param var_names: Variable names to include.
    :param tolerance: Time tolerance for selective upsampling
        methods. Defaults to *frequency*.
    :param interp_kind: Kind of interpolation if *method* is
        'interpolation'.
    :param metadata: Output metadata.
    :param cube_asserted: If False, *dataset* will be verified,
        otherwise it is expected to be a valid cube.
    :return: A new xcube dataset resampled in time.
    """
    if not cube_asserted:
        assert_cube(dataset)

    if frequency == 'all':
        time_gap = np.array(dataset.time[-1]) - np.array(dataset.time[0])
        days = int((np.timedelta64(time_gap, 'D')
                    / np.timedelta64(1, 'D')) + 1)
        frequency = f'{days}D'

    if var_names:
        dataset = select_variables_subset(dataset, var_names)

    resampler = dataset.resample(skipna=True,
                                 closed='left',
                                 label='left',
                                 time=frequency,
                                 loffset=offset,
                                 base=base)

    if isinstance(method, str):
        methods = [method]
    else:
        methods = list(method)

    percentile_prefix = 'percentile_'

    resampled_cubes = []
    for method in methods:
        method_args = []
        method_postfix = method
        if method.startswith(percentile_prefix):
            p = int(method[len(percentile_prefix):])
            q = p / 100.0
            method_args = [q]
            method_postfix = f'p{p}'
            method = 'quantile'
        resampling_method = getattr(resampler, method)
        method_kwargs = get_method_kwargs(method, frequency,
                                          interp_kind, tolerance)
        resampled_cube = resampling_method(*method_args,
                                           **method_kwargs)
        resampled_cube = resampled_cube.rename({
            var_name: f'{var_name}_{method_postfix}'
            for var_name in resampled_cube.data_vars
        })
        resampled_cubes.append(resampled_cube)

    if len(resampled_cubes) == 1:
        resampled_cube = resampled_cubes[0]
    else:
        resampled_cube = xr.merge(resampled_cubes)

    # TODO: add time_bnds to resampled_ds
    time_coverage_start = '%s' % dataset.time[0]
    time_coverage_end = '%s' % dataset.time[-1]

    resampled_cube.attrs.update(metadata or {})
    # TODO: add other time_coverage_ attributes
    resampled_cube.attrs.update(time_coverage_start=time_coverage_start,
                                time_coverage_end=time_coverage_end)

    schema = CubeSchema.new(dataset)
    chunk_sizes = {
        schema.dims[i]: schema.chunks[i] for i in range(schema.ndim)
    }

    if isinstance(time_chunk_size, int) and time_chunk_size >= 0:
        chunk_sizes['time'] = time_chunk_size

    return resampled_cube.chunk(chunk_sizes)
def get_time_series(cube: xr.Dataset,
                    geometry: GeometryLike = None,
                    var_names: Sequence[str] = None,
                    start_date: Date = None,
                    end_date: Date = None,
                    agg_methods: Union[str, Sequence[str],
                                       AbstractSet[str]] = AGG_MEAN,
                    include_count: bool = False,
                    include_stdev: bool = False,
                    use_groupby: bool = False,
                    cube_asserted: bool = False) -> Optional[xr.Dataset]:
    """
    Get a time series dataset from a data *cube*.

    *geometry* may be provided as a (shapely) geometry object, a valid
    GeoJSON object, a valid WKT string, a sequence of box coordinates
    (x1, y1, x2, y2), or point coordinates (x, y). If *geometry* covers
    an area, i.e. is not a point, the function aggregates the variables
    to compute a mean value and, if desired, the number of valid
    observations and the standard deviation.

    *start_date* and *end_date* may be provided as a numpy.datetime64
    or an ISO datetime string.

    Returns a time-series dataset whose data variables have a time
    dimension but no longer have spatial dimensions, hence the
    resulting dataset's variables will only have N-2 dimensions.
    A global attribute ``max_number_of_observations`` will be set to
    the maximum number of observations that could have been made in
    each time step. If the given *geometry* does not overlap the
    cube's boundaries, or if no output variables remain, the function
    returns ``None``.

    :param cube: The xcube dataset.
    :param geometry: Optional geometry.
    :param var_names: Optional sequence of names of variables
        to be included.
    :param start_date: Optional start date.
    :param end_date: Optional end date.
    :param agg_methods: Aggregation methods. May be a single string
        or a sequence of strings. Possible values are 'mean',
        'median', 'min', 'max', 'std', 'count'. Defaults to 'mean'.
        Ignored if *geometry* is a point.
    :param include_count: Deprecated. Whether to include the number
        of valid observations for each time step. Ignored if
        *geometry* is a point.
    :param include_stdev: Deprecated. Whether to include the standard
        deviation for each time step. Ignored if *geometry* is
        a point.
    :param use_groupby: Use a group-by operation. May increase or
        decrease runtime performance and/or memory consumption.
    :param cube_asserted: If False, *cube* will be verified,
        otherwise it is expected to be a valid cube.
    :return: A new dataset with a time-series for each variable.
    """
    if not cube_asserted:
        assert_cube(cube)

    geometry = convert_geometry(geometry)

    agg_methods = normalize_agg_methods(agg_methods)
    if include_count:
        warnings.warn("keyword argument 'include_count' has been "
                      "deprecated, "
                      f"use 'agg_methods=[{AGG_COUNT!r}, ...]' instead")
        agg_methods.add(AGG_COUNT)
    if include_stdev:
        warnings.warn("keyword argument 'include_stdev' has been "
                      "deprecated, "
                      f"use 'agg_methods=[{AGG_STD!r}, ...]' instead")
        agg_methods.add(AGG_STD)

    dataset = select_variables_subset(cube, var_names)
    if len(dataset.data_vars) == 0:
        return None

    if start_date is not None or end_date is not None:
        # noinspection PyTypeChecker
        dataset = dataset.sel(time=slice(start_date, end_date))

    if isinstance(geometry, shapely.geometry.Point):
        bounds = get_dataset_geometry(dataset)
        if not bounds.contains(geometry):
            return None
        dataset = dataset.sel(lon=geometry.x, lat=geometry.y,
                              method='Nearest')
        return dataset.assign_attrs(max_number_of_observations=1)

    if geometry is not None:
        dataset = mask_dataset_by_geometry(dataset, geometry,
                                           save_geometry_mask='__mask__')
        if dataset is None:
            return None
        mask = dataset['__mask__']
        max_number_of_observations = np.count_nonzero(mask)
        dataset = dataset.drop_vars(['__mask__'])
    else:
        max_number_of_observations = dataset.lat.size * dataset.lon.size

    must_load = len(agg_methods) > 1 or any(
        AGG_METHODS[agg_method] == MUST_LOAD
        for agg_method in agg_methods)
    if must_load:
        dataset.load()

    agg_datasets = []
    if use_groupby:
        time_group = dataset.groupby('time')
        for agg_method in agg_methods:
            method = getattr(time_group, agg_method)
            if agg_method == 'count':
                agg_dataset = method(dim=xr.ALL_DIMS)
            else:
                agg_dataset = method(dim=xr.ALL_DIMS, skipna=True)
            agg_datasets.append(agg_dataset)
    else:
        for agg_method in agg_methods:
            method = getattr(dataset, agg_method)
            if agg_method == 'count':
                agg_dataset = method(dim=('lat', 'lon'))
            else:
                agg_dataset = method(dim=('lat', 'lon'), skipna=True)
            agg_datasets.append(agg_dataset)

    agg_datasets = [
        agg_dataset.rename(
            name_dict={v: f"{v}_{agg_method}"
                       for v in agg_dataset.data_vars})
        for agg_method, agg_dataset in zip(agg_methods, agg_datasets)
    ]

    ts_dataset = xr.merge(agg_datasets)
    ts_dataset = ts_dataset.assign_attrs(
        max_number_of_observations=max_number_of_observations)

    return ts_dataset