Example #1
    def test_resample_f_all(self):
        resampled_cube = resample_in_time(self.input_cube, 'all', ['min', 'max'])
        self.assertIsNot(resampled_cube, self.input_cube)
        self.assertIn('time', resampled_cube)
        self.assertIn('temperature_min', resampled_cube)
        self.assertIn('temperature_max', resampled_cube)
        self.assertIn('precipitation_min', resampled_cube)
        self.assertIn('precipitation_max', resampled_cube)
        self.assertEqual(('time',), resampled_cube.time.dims)
        self.assertEqual(('time', 'lat', 'lon'), resampled_cube.temperature_min.dims)
        self.assertEqual(('time', 'lat', 'lon'), resampled_cube.temperature_max.dims)
        self.assertEqual(('time', 'lat', 'lon'), resampled_cube.precipitation_min.dims)
        self.assertEqual(('time', 'lat', 'lon'), resampled_cube.precipitation_max.dims)
        self.assertEqual((1,), resampled_cube.time.shape)
        self.assertEqual((1, 180, 360), resampled_cube.temperature_min.shape)
        self.assertEqual((1, 180, 360), resampled_cube.temperature_max.shape)
        self.assertEqual((1, 180, 360), resampled_cube.precipitation_min.shape)
        self.assertEqual((1, 180, 360), resampled_cube.precipitation_max.shape)
        np.testing.assert_allclose(resampled_cube.temperature_min.values[..., 0, 0],
                                   np.array([272.0]))
        np.testing.assert_allclose(resampled_cube.temperature_max.values[..., 0, 0],
                                   np.array([274.9]))
        np.testing.assert_allclose(resampled_cube.precipitation_min.values[..., 0, 0],
                                   np.array([114.2]))
        np.testing.assert_allclose(resampled_cube.precipitation_max.values[..., 0, 0],
                                   np.array([120.0]))

        schema = CubeSchema.new(resampled_cube)
        self.assertEqual(3, schema.ndim)
        self.assertEqual(('time', 'lat', 'lon'), schema.dims)
        self.assertEqual((1, 180, 360), schema.shape)
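
The tests above rely on a self.input_cube fixture that is not shown. Below is a minimal sketch of a compatible cube built with plain xarray; the number of time steps, the chunking, and the value ranges are assumptions chosen to fit the assertions (the exact min/max values will not reproduce with random data):

import numpy as np
import pandas as pd
import xarray as xr

num_times = 66  # assumed; '2D' resampling in Example #2 then yields 33 steps

input_cube = xr.Dataset(
    dict(temperature=(('time', 'lat', 'lon'),
                      272.0 + 3.0 * np.random.rand(num_times, 180, 360)),
         precipitation=(('time', 'lat', 'lon'),
                        114.0 + 6.0 * np.random.rand(num_times, 180, 360))),
    coords=dict(time=pd.date_range('2017-07-01', periods=num_times, freq='D'),
                lat=np.linspace(-89.5, 89.5, 180),
                lon=np.linspace(-179.5, 179.5, 360)))
input_cube = input_cube.chunk(dict(time=1, lat=90, lon=180))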
Example #2
    def test_resample_in_time_with_time_chunk_size(self):
        resampled_cube = resample_in_time(self.input_cube, '2D', ['min', 'max'], time_chunk_size=5)
        schema = CubeSchema.new(resampled_cube)
        self.assertEqual(3, schema.ndim)
        self.assertEqual(('time', 'lat', 'lon'), schema.dims)
        self.assertEqual((33, 180, 360), schema.shape)
        self.assertEqual((5, 90, 180), schema.chunks)
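
schema.chunks above reports one chunk size per dimension. The same information can be read straight off any dask-backed variable without CubeSchema, as in this small helper sketch (the variable name 'temperature_min' is taken from Example #1):

import xarray as xr

def first_chunk_sizes(dataset: xr.Dataset, var_name: str) -> tuple:
    # xarray exposes dask chunking as a tuple of per-dimension
    # chunk-size tuples, or None for an unchunked variable
    chunks = dataset[var_name].chunks
    if chunks is None:
        raise ValueError(f'variable {var_name!r} is not chunked')
    return tuple(sizes[0] for sizes in chunks)

# e.g. first_chunk_sizes(resampled_cube, 'temperature_min') == (5, 90, 180)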
Example #3
    def test_without_inputs(self):
        calls = []

        def my_cube_func(input_params: Dict[str, Any] = None,
                         dim_coords: Dict[str, np.ndarray] = None,
                         dim_ranges: Dict[str, Tuple[int, int]] = None) -> CubeFuncOutput:
            nonlocal calls
            calls.append((input_params, dim_coords, dim_ranges))
            lon_range = dim_ranges['lon']
            lat_range = dim_ranges['lat']
            time_range = dim_ranges['time']
            n_lon = lon_range[1] - lon_range[0]
            n_lat = lat_range[1] - lat_range[0]
            n_time = time_range[1] - time_range[0]
            fill_value = input_params['fill_value']
            return np.full((n_time, n_lat, n_lon),
                           fill_value,
                           dtype=np.float64)

        output_cube = compute_cube(my_cube_func,
                                   input_cube_schema=CubeSchema.new(self.cube),
                                   input_params=dict(fill_value=0.74))

        self.assertIsInstance(output_cube, xr.Dataset)
        self.assertIn('output', output_cube.data_vars)
        output_var = output_cube.output
        self.assertEqual(0, len(calls))
        self.assertEqual(('time', 'lat', 'lon'), output_var.dims)
        self.assertEqual((6, 180, 360), output_var.shape)

        values = output_var.values
        self.assertEqual(2 * 2 * 4, len(calls))
        self.assertEqual((6, 180, 360), values.shape)
        self.assertAlmostEqual(0.74, values[0, 0, 0])
        self.assertAlmostEqual(0.74, values[-1, -1, -1])
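
The two assertions on len(calls) hinge on compute_cube being lazy: building the dask graph invokes the function zero times, and reading .values invokes it once per chunk. A sketch of the same pattern with dask.array directly, assuming a (3, 90, 90) chunking of a (6, 180, 360) array:

import dask.array as da
import numpy as np
import xarray as xr

calls = []

def fill(block):
    # dask may probe the function with zero-size arrays to infer
    # metadata, so only real chunk invocations are recorded
    if block.size:
        calls.append(block.shape)
    return np.full(block.shape, 0.74)

data = da.zeros((6, 180, 360), chunks=(3, 90, 90))
lazy = xr.DataArray(da.map_blocks(fill, data, dtype=np.float64),
                    dims=('time', 'lat', 'lon'))
assert len(calls) == 0           # building the graph computes nothing
values = lazy.values             # materializing runs fill once per chunk
assert len(calls) == 2 * 2 * 4   # (6/3) x (180/90) x (360/90) chunks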
Example #4
def compute_dataset(cube_func: CubeFunc,
                    *input_cubes: xr.Dataset,
                    input_cube_schema: CubeSchema = None,
                    input_var_names: Sequence[str] = None,
                    input_params: Dict[str, Any] = None,
                    output_var_name: str = 'output',
                    output_var_dims: AbstractSet[str] = None,
                    output_var_dtype: Any = np.float64,
                    output_var_attrs: Dict[str, Any] = None,
                    vectorize: bool = None,
                    cube_asserted: bool = False) -> xr.Dataset:
    """
    Compute a new output dataset with a single variable named *output_var_name*
    from variables named *input_var_names* contained in zero, one, or more
    input data cubes in *input_cubes* using a cube factory function *cube_func*.

    *cube_func* is called concurrently for each of the chunks of the input variables.
    It is expected to return a chunk block which is of type ``np.ndarray``.

    If *input_cubes* is not empty, *cube_func* receives variables as specified by *input_var_names*.
    If *input_cubes* is empty, *input_var_names* must be empty too, and *input_cube_schema*
    must be given, so that a new cube can be created.

    The full signature of *cube_func* is::

        def cube_func(*input_vars: np.ndarray,
                      input_params: Dict[str, Any] = None,
                      dim_coords: Dict[str, np.ndarray] = None,
                      dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
            pass

    The arguments are:

    * ``input_vars``: the variables according to the given *input_var_names*;
    * ``input_params``: this call's *input_params*, a mapping from parameter name to value;
    * ``dim_coords``: a mapping from dimension names to the current chunk's coordinate arrays;
    * ``dim_ranges``: a mapping from dimension names to the current chunk's index ranges.

    Only the ``input_vars`` argument is mandatory. The keyword arguments
    ``input_params``, ``dim_coords``, ``dim_ranges`` need not be present at all.

    *output_var_dims* may be given in the case where the dimensions of the
    output variable differ from those of the inputs. Input dimensions not
    contained in *output_var_dims* are treated as core dimensions: the inputs
    are rechunked so that each such dimension forms a single chunk, and
    *cube_func* receives and consumes these dimensions in full.

    :param cube_func: The cube factory function.
    :param input_cubes: An optional sequence of input cube datasets; must be provided if *input_cube_schema* is not.
    :param input_cube_schema: An optional input cube schema; must be provided if *input_cubes* is not.
    :param input_var_names: A sequence of names of variables that are looked up
        in *input_cubes* and passed to *cube_func*.
    :param input_params: Optional dictionary with processing parameters passed to *cube_func*.
    :param output_var_name: Optional name of the output variable, defaults to ``'output'``.
    :param output_var_dims: Optional set of names of the output dimensions,
        used in the case *cube_func* reduces dimensions.
    :param output_var_dtype: Optional numpy datatype of the output variable, defaults to ``numpy.float64``.
    :param output_var_attrs: Optional metadata attributes for the output variable.
    :param vectorize: Whether all *input_cubes* have the same variables which are concatenated and passed as vectors
        to *cube_func*. Not implemented yet.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new dataset that contains the computed output variable.
    """
    if vectorize is not None:
        # TODO: support vectorize = all cubes have same variables and cube_func
        #       receives variables as vectors (with extra dim)
        raise NotImplementedError('vectorize is not supported yet')

    if not cube_asserted:
        for cube in input_cubes:
            assert_cube(cube)

    # Check compatibility of inputs
    if input_cubes:
        input_cube_schema = CubeSchema.new(input_cubes[0])
        for cube in input_cubes:
            if not cube_asserted:
                assert_cube(cube)
            if cube is not input_cubes[0]:
                # noinspection PyUnusedLocal
                other_schema = CubeSchema.new(cube)
                # TODO (forman): broadcast all cubes to same shape, rechunk to same chunks
    elif input_cube_schema is None:
        raise ValueError('input_cube_schema must be given')

    output_var_name = output_var_name or 'output'

    # Collect named input variables, raise if not found
    input_var_names = input_var_names or []
    input_vars = []
    for var_name in input_var_names:
        input_var = None
        for cube in input_cubes:
            if var_name in cube.data_vars:
                input_var = cube[var_name]
                break
        if input_var is None:
            raise ValueError(
                f'variable {var_name!r} not found in any of cubes')
        input_vars.append(input_var)

    # Find out, if cube_func uses any of _PREDEFINED_KEYWORDS
    has_input_params, has_dim_coords, has_dim_ranges = _inspect_cube_func(
        cube_func, input_var_names)

    def cube_func_wrapper(index_chunk, *input_var_chunks):
        nonlocal input_cube_schema, input_var_names, input_params, input_vars
        nonlocal has_input_params, has_dim_coords, has_dim_ranges

        # Note, xarray.apply_ufunc does a test call with empty input arrays,
        # so index_chunk.size == 0 is a valid case
        empty_call = index_chunk.size == 0

        # TODO: when output_var_dims is given, index_chunk must be reordered
        #   as core dimensions are moved to the end of index_chunk and input_var_chunks
        if not empty_call:
            index_chunk = index_chunk.ravel()

        if index_chunk.size < 2 * input_cube_schema.ndim:
            if not empty_call:
                warnings.warn(
                    f"unexpected index_chunk of size {index_chunk.size} received!"
                )
                return None

        dim_ranges = None
        if has_dim_ranges or has_dim_coords:
            dim_ranges = {}
            for i in range(input_cube_schema.ndim):
                dim_name = input_cube_schema.dims[i]
                if not empty_call:
                    start = int(index_chunk[2 * i + 0])
                    end = int(index_chunk[2 * i + 1])
                    dim_ranges[dim_name] = start, end
                else:
                    dim_ranges[dim_name] = ()

        dim_coords = None
        if has_dim_coords:
            dim_coords = {}
            for coord_var_name, coord_var in input_cube_schema.coords.items():
                coord_slices = [slice(None)] * coord_var.ndim
                for i in range(input_cube_schema.ndim):
                    dim_name = input_cube_schema.dims[i]
                    if dim_name in coord_var.dims:
                        j = coord_var.dims.index(dim_name)
                        coord_slices[j] = slice(*dim_ranges[dim_name])
                dim_coords[coord_var_name] = coord_var[tuple(
                    coord_slices)].values

        kwargs = {}
        if has_input_params:
            kwargs['input_params'] = input_params
        if has_dim_ranges:
            kwargs['dim_ranges'] = dim_ranges
        if has_dim_coords:
            kwargs['dim_coords'] = dim_coords

        return cube_func(*input_var_chunks, **kwargs)

    index_var = _gen_index_var(input_cube_schema)

    all_input_vars = [index_var] + input_vars

    input_core_dims = None
    if output_var_dims:
        input_core_dims = []
        has_warned = False
        for i in range(len(all_input_vars)):
            input_var = all_input_vars[i]
            var_core_dims = [
                dim for dim in input_var.dims if dim not in output_var_dims
            ]
            must_rechunk = False
            if var_core_dims and input_var.chunks:
                for var_core_dim in var_core_dims:
                    dim_index = input_var.dims.index(var_core_dim)
                    dim_chunk_size = input_var.chunks[dim_index][0]
                    dim_shape_size = input_var.shape[dim_index]
                    if dim_chunk_size != dim_shape_size:
                        must_rechunk = True
                        break
            if must_rechunk:
                if not has_warned:
                    warnings.warn(
                        f'Input variables must not be chunked in dimension(s): {", ".join(var_core_dims)}.\n'
                        f'Rechunking applies, which may drastically decrease runtime performance '
                        f'and increase memory usage.')
                    has_warned = True
                all_input_vars[i] = input_var.chunk(
                    {var_core_dim: -1
                     for var_core_dim in var_core_dims})
            input_core_dims.append(var_core_dims)

    output_var = xr.apply_ufunc(cube_func_wrapper,
                                *all_input_vars,
                                dask='parallelized',
                                input_core_dims=input_core_dims,
                                output_dtypes=[output_var_dtype])
    if output_var_attrs:
        output_var.attrs.update(output_var_attrs)
    return xr.Dataset({output_var_name: output_var},
                      coords=input_cube_schema.coords)
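
A minimal usage sketch for compute_dataset, reusing the illustrative input_cube constructed after Example #1. The function and parameter names are assumptions, and cube_asserted=True skips xcube's cube verification, which a hand-built dataset may not pass:

from typing import Any, Dict
import numpy as np

def scale(temperature: np.ndarray,
          input_params: Dict[str, Any] = None) -> np.ndarray:
    # called once per chunk of 'temperature'
    return input_params['factor'] * temperature

scaled = compute_dataset(scale, input_cube,
                         input_var_names=['temperature'],
                         input_params=dict(factor=2.0),
                         output_var_name='temperature_x2',
                         cube_asserted=True)
assert 'temperature_x2' in scaled.data_vars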
Example #5
def compute_cube(cube_func: CubeFunc,
                 *input_cubes: xr.Dataset,
                 input_cube_schema: CubeSchema = None,
                 input_var_names: Sequence[str] = None,
                 input_params: Dict[str, Any] = None,
                 output_var_name: str = 'output',
                 output_var_dtype: Any = np.float64,
                 output_var_attrs: Dict[str, Any] = None,
                 vectorize: bool = None,
                 cube_asserted: bool = False) -> xr.Dataset:
    """
    Compute a new output data cube with a single variable named *output_var_name*
    from variables named *input_var_names* contained in zero, one, or more
    input data cubes in *input_cubes* using a cube factory function *cube_func*.

    *cube_func* is called concurrently for each of the chunks of the input variables.
    It is expected to return a chunk block which is of type ``np.ndarray``.

    If *input_cubes* is not empty, *cube_func* receives variables as specified by *input_var_names*.
    If *input_cubes* is empty, *input_var_names* must be empty too, and *input_cube_schema*
    must be given, so that a new cube can be created.

    The full signature of *cube_func* is::

        def cube_func(*input_vars: np.ndarray,
                      input_params: Dict[str, Any] = None,
                      dim_coords: Dict[str, np.ndarray] = None,
                      dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
            pass

    The arguments are:

    * ``input_vars``: the variables according to the given *input_var_names*;
    * ``input_params``: this call's *input_params*, a mapping from parameter name to value;
    * ``dim_coords``: a mapping from dimension names to the current chunk's coordinate arrays;
    * ``dim_ranges``: a mapping from dimension names to the current chunk's index ranges.

    Only the ``input_vars`` argument is mandatory. The keyword arguments
    ``input_params``, ``dim_coords``, ``dim_ranges`` need not be present at all.

    :param cube_func: The cube factory function.
    :param input_cubes: An optional sequence of input cube datasets; must be provided if *input_cube_schema* is not.
    :param input_cube_schema: An optional input cube schema; must be provided if *input_cubes* is not.
    :param input_var_names: A sequence of names of variables that are looked up
        in *input_cubes* and passed to *cube_func*.
    :param input_params: Optional dictionary with processing parameters passed to *cube_func*.
    :param output_var_name: Optional name of the output variable, defaults to ``'output'``.
    :param output_var_dtype: Optional numpy datatype of the output variable, defaults to ``numpy.float64``.
    :param output_var_attrs: Optional metadata attributes for the output variable.
    :param vectorize: Whether all *input_cubes* have the same variables which are concatenated and passed as vectors
        to *cube_func*. Not implemented yet.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new dataset that contains the computed output variable.
    """
    if vectorize is not None:
        raise NotImplementedError('vectorize is not supported yet')

    if not cube_asserted:
        for cube in input_cubes:
            assert_cube(cube)

    if input_cubes:
        input_cube_schema = CubeSchema.new(input_cubes[0])
        for cube in input_cubes:
            if not cube_asserted:
                assert_cube(cube)
            if cube is not input_cubes[0]:
                # noinspection PyUnusedLocal
                other_schema = CubeSchema.new(cube)
                # TODO (forman): broadcast all cubes to same shape, rechunk to same chunks
    elif input_cube_schema is None:
        raise ValueError('input_cube_schema must be given')

    if output_var_name is None:
        output_var_name = 'output'

    input_var_names = input_var_names or []
    input_vars = []
    for var_name in input_var_names:
        var = None
        for cube in input_cubes:
            if var_name in cube.data_vars:
                var = cube[var_name]
                break
        if var is None:
            raise ValueError(f'variable {var_name!r} not found in any of cubes')
        input_vars.append(var)

    has_input_params, has_dim_coords, has_dim_ranges = _inspect_cube_func(cube_func, input_var_names)

    def cube_func_wrapper(index_chunk, *input_var_chunks):
        nonlocal input_cube_schema, input_var_names, input_params, input_vars
        nonlocal has_input_params, has_dim_coords, has_dim_ranges

        index_chunk = index_chunk.ravel()

        if index_chunk.size < 2 * input_cube_schema.ndim:
            warnings.warn(f"weird index_chunk of size {index_chunk.size} received!")
            return

        dim_ranges = None
        if has_dim_ranges or has_dim_coords:
            dim_ranges = {}
            for i in range(input_cube_schema.ndim):
                dim_name = input_cube_schema.dims[i]
                start = int(index_chunk[2 * i + 0])
                end = int(index_chunk[2 * i + 1])
                dim_ranges[dim_name] = start, end

        dim_coords = None
        if has_dim_coords:
            dim_coords = {}
            for coord_var_name, coord_var in input_cube_schema.coords.items():
                coord_slices = [slice(None)] * coord_var.ndim
                for i in range(input_cube_schema.ndim):
                    dim_name = input_cube_schema.dims[i]
                    if dim_name in coord_var.dims:
                        j = coord_var.dims.index(dim_name)
                        coord_slices[j] = slice(*dim_ranges[dim_name])
                dim_coords[coord_var_name] = coord_var[tuple(coord_slices)].values

        kwargs = {}
        if has_input_params:
            kwargs['input_params'] = input_params
        if has_dim_ranges:
            kwargs['dim_ranges'] = dim_ranges
        if has_dim_coords:
            kwargs['dim_coords'] = dim_coords

        return cube_func(*input_var_chunks, **kwargs)

    index_var = _gen_index_var(input_cube_schema)

    output_var = xr.apply_ufunc(cube_func_wrapper,
                                index_var,
                                *input_vars,
                                dask='parallelized',
                                output_dtypes=[output_var_dtype])
    if output_var_attrs:
        output_var.attrs.update(output_var_attrs)
    return xr.Dataset({output_var_name: output_var}, coords=input_cube_schema.coords)
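
compute_cube also works without input variables, as Example #3 shows. The sketch below additionally uses the optional dim_coords and dim_ranges keywords to derive values from the current chunk's coordinates; it reuses the illustrative input_cube from above, and the latitude weighting is purely an example:

from typing import Dict, Tuple
import numpy as np

def lat_weights(dim_coords: Dict[str, np.ndarray] = None,
                dim_ranges: Dict[str, Tuple[int, int]] = None) -> np.ndarray:
    # cosine-of-latitude weights for the current chunk, broadcast
    # over the chunk's time and lon extents
    lat = np.deg2rad(dim_coords['lat'])
    n_time = dim_ranges['time'][1] - dim_ranges['time'][0]
    n_lon = dim_ranges['lon'][1] - dim_ranges['lon'][0]
    weights = np.cos(lat)[np.newaxis, :, np.newaxis]
    return np.broadcast_to(weights, (n_time, lat.size, n_lon)).copy()

weights_cube = compute_cube(lat_weights,
                            input_cube_schema=CubeSchema.new(input_cube),
                            output_var_name='lat_weight')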
Example #6
def resample_in_time(cube: xr.Dataset,
                     frequency: str,
                     method: Union[str, Sequence[str]],
                     offset=None,
                     base: int = 0,
                     tolerance=None,
                     interp_kind=None,
                     time_chunk_size=None,
                     var_names: Sequence[str] = None,
                     metadata: Dict[str, Any] = None,
                     cube_asserted: bool = False) -> xr.Dataset:
    """
    Resample a xcube dataset in the time dimension.

    :param cube: The xcube dataset.
    :param frequency: Temporal aggregation frequency. Use format "<count><offset>"
        where <offset> is one of 'H', 'D', 'W', 'M', 'Q', 'Y'.
    :param method: Resampling method or sequence of resampling methods.
    :param offset: Offset used to adjust the resampled time labels.
        Uses same syntax as *frequency*.
    :param base: For frequencies that evenly subdivide 1 day, the "origin" of the
        aggregated intervals. For example, for '24H' frequency, base could range from 0 through 23.
    :param tolerance: Time tolerance for selective upsampling methods. Defaults to *frequency*.
    :param interp_kind: Kind of interpolation if *method* is 'interpolation'.
    :param time_chunk_size: If not None, the chunk size to be used for the "time" dimension.
    :param var_names: Variable names to include.
    :param metadata: Output metadata.
    :param cube_asserted: If False, *cube* will be verified, otherwise it is expected to be a valid cube.
    :return: A new xcube dataset resampled in time.
    """
    if not cube_asserted:
        assert_cube(cube)

    if var_names:
        cube = select_vars(cube, var_names)

    resampler = cube.resample(skipna=True,
                              closed='left',
                              label='left',
                              keep_attrs=True,
                              time=frequency,
                              loffset=offset,
                              base=base)

    if isinstance(method, str):
        methods = [method]
    else:
        methods = list(method)

    resampled_cubes = []
    for method in methods:
        resampling_method = getattr(resampler, method)
        kwargs = get_method_kwargs(method, frequency, interp_kind, tolerance)
        resampled_cube = resampling_method(**kwargs)
        resampled_cube = resampled_cube.rename(
            {var_name: f'{var_name}_{method}' for var_name in resampled_cube.data_vars})
        resampled_cubes.append(resampled_cube)

    if len(resampled_cubes) == 1:
        resampled_cube = resampled_cubes[0]
    else:
        resampled_cube = xr.merge(resampled_cubes)

    # TODO: add time_bnds to resampled_ds
    # use the scalar coordinate values, not the DataArray reprs, as attribute strings
    time_coverage_start = str(cube.time[0].values)
    time_coverage_end = str(cube.time[-1].values)

    resampled_cube.attrs.update(metadata or {})
    # TODO: add other time_coverage_ attributes
    resampled_cube.attrs.update(time_coverage_start=time_coverage_start,
                                time_coverage_end=time_coverage_end)

    schema = CubeSchema.new(cube)
    chunk_sizes = {schema.dims[i]: schema.chunks[i] for i in range(schema.ndim)}

    if isinstance(time_chunk_size, int) and time_chunk_size >= 0:
        chunk_sizes['time'] = time_chunk_size

    return resampled_cube.chunk(chunk_sizes)
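
A minimal usage sketch for resample_in_time; daily_cube below is an illustrative hand-built dataset, so cube_asserted=True skips xcube's cube verification, which such a dataset may not pass:

import numpy as np
import pandas as pd
import xarray as xr

daily_cube = xr.Dataset(
    dict(temperature=(('time', 'lat', 'lon'),
                      np.random.rand(31, 180, 360))),
    coords=dict(time=pd.date_range('2020-01-01', periods=31, freq='D'),
                lat=np.linspace(-89.5, 89.5, 180),
                lon=np.linspace(-179.5, 179.5, 360)))
daily_cube = daily_cube.chunk(dict(time=8, lat=90, lon=90))

# weekly minima and maxima; output variables are renamed to
# temperature_min and temperature_max
weekly = resample_in_time(daily_cube, '1W', ['min', 'max'],
                          cube_asserted=True)

# monthly medians with single-step time chunks
monthly = resample_in_time(daily_cube, '1M', 'median',
                           time_chunk_size=1, cube_asserted=True)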