Example #1
def pearson_correlation_scalar(ds_x: DatasetLike.TYPE,
                               ds_y: DatasetLike.TYPE,
                               var_x: VarName.TYPE,
                               var_y: VarName.TYPE,
                               monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two data variables and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: Data frame {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if array_x.dims != array_y.dims:
        raise ValidationError('Both datasets should feature the same'
                              ' dimensionality. Currently provided ds_x[var_x] '
                              f'has {array_x.dims}, provided ds_y[var_y]'
                              f' has {array_y.dims}')

    for dim in array_x.dims:
        if len(array_x[dim]) != len(array_y[dim]):
            raise ValidationError('All dimensions of both provided data variables'
                                  f' must be the same length. Currently {dim} of ds_x[var_x]'
                                  f' has {len(array_x[dim])} values, while'
                                  f' {dim} of ds_y[var_y] has {len(array_y[dim])} values.'
                                  ' You may want to try to coregister the datasets beforehand.')

    n_vals = 1
    for dim in array_x.dims:
        n_vals = n_vals * len(array_x[dim])

    if n_vals < 3:
        raise ValidationError('There should be no less than 3 values in both data variables'
                              f' to perform the correlation. Currently there are {n_vals} values')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.stack(z=array_x.dims), array_y.stack(z=array_y.dims))

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
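
A minimal usage sketch for the function above, assuming (as the test in Example #4 below suggests) that DatasetLike.convert accepts plain xarray datasets; the variable names and values are made up:

import numpy as np
import pandas as pd
import xarray as xr

# Two synthetic 1-D variables sharing a single 'time' dimension
time = pd.date_range('2000-01-01', periods=10)
ds_a = xr.Dataset({'tas': ('time', np.linspace(0.0, 1.0, 10))}, coords={'time': time})
ds_b = xr.Dataset({'pr': ('time', np.linspace(1.0, 0.0, 10))}, coords={'time': time})

# Returns a one-row data frame with 'corr_coef' and 'p_value' columns;
# for these perfectly anti-correlated inputs corr_coef is -1
result = pearson_correlation_scalar(ds_a, ds_b, 'tas', 'pr')
print(result)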
Example #2
def pearson_correlation_scalar(
        ds_x: DatasetLike.TYPE,
        ds_y: DatasetLike.TYPE,
        var_x: VarName.TYPE,
        var_y: VarName.TYPE,
        monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Performs a simple correlation analysis on two timeseries and returns
    a correlation coefficient and the corresponding p_value.

    Positive correlation implies that as x grows, so does y. Negative
    correlation implies that as x increases, y decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: {'corr_coef': correlation coefficient, 'p_value': probability value}
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    if len(array_x.dims) != 1 or len(array_y.dims) != 1:
        raise ValidationError('To calculate simple correlation, both provided'
                              ' datasets should be simple 1d timeseries. To'
                              ' create a map of correlation coefficients, use'
                              ' pearson_correlation operation instead.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError(
            'The length of the time dimension differs between'
            ' the given datasets. Can not perform the calculation'
            ', please review operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError(
            'The length of the time dimension should not be less'
            ' than three to run the calculation.')

    with monitor.observing("Calculate Pearson correlation"):
        cc, pv = pearsonr(array_x.values, array_y.values)

    return pd.DataFrame({'corr_coef': [cc], 'p_value': [pv]})
Example #3
def merge(ds_1: DatasetLike.TYPE,
          ds_2: DatasetLike.TYPE,
          ds_3: DatasetLike.TYPE = None,
          ds_4: DatasetLike.TYPE = None,
          join: str = 'outer',
          compat: str = 'no_conflicts') -> xr.Dataset:
    """
    Merge up to four datasets to produce a new dataset with combined variables from each input dataset.

    This is a wrapper for the ``xarray.merge()`` function.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.merge.html#xarray.Dataset.merge

    The *compat* argument indicates how to compare variables of the same name for potential conflicts:

    * "broadcast_equals": all values must be equal when variables are broadcast
      against each other to ensure common dimensions.
    * "equals": all values and dimensions must be the same.
    * "identical": all values, dimensions and attributes must be the same.
    * "no_conflicts": only values which are not null in both datasets must be equal.
      The returned dataset then contains the combination of all non-null values.

    :param ds_1: The first input dataset.
    :param ds_2: The second input dataset.
    :param ds_3: An optional 3rd input dataset.
    :param ds_4: An optional 4th input dataset.
    :param join: How to combine objects with different indexes.
    :param compat: How to compare variables of the same name for potential conflicts.
    :return: A new dataset with combined variables from each input dataset.
    """

    ds_1 = DatasetLike.convert(ds_1)
    ds_2 = DatasetLike.convert(ds_2)
    ds_3 = DatasetLike.convert(ds_3)
    ds_4 = DatasetLike.convert(ds_4)

    datasets = []
    for ds in (ds_1, ds_2, ds_3, ds_4):
        if ds is not None:
            included = False
            for ds2 in datasets:
                if ds is ds2:
                    included = True
            if not included:
                datasets.append(ds)

    if len(datasets) == 0:
        raise ValidationError('At least two different datasets must be given')
    elif len(datasets) == 1:
        return datasets[0]
    else:
        return xr.merge(datasets, compat=compat, join=join)
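
A short usage sketch for the wrapper above; the datasets and variable names are invented:

import xarray as xr

ds_a = xr.Dataset({'sst': ('time', [280.0, 281.0])})
ds_b = xr.Dataset({'chl': ('time', [0.1, 0.2])})

# 'compat' and 'join' are forwarded to xarray.merge(); the result holds
# the combined variables of both inputs
merged = merge(ds_a, ds_b, compat='no_conflicts', join='outer')
print(list(merged.data_vars))  # ['sst', 'chl']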
Example #4
    def test_convert(self):
        self.assertEqual(DatasetLike.convert(None), None)

        data = {'time': ['2000-01-01', '2000-01-02', '2000-01-03'],
                'c1': [4, 5, 6],
                'c2': [6, 7, 8]}
        pd_ds = pd.DataFrame(data=data)
        pd_ds = pd_ds.set_index('time')
        pd_ds.index = pd.to_datetime(pd_ds.index)
        xr_ds = xr.Dataset(data_vars=data)
        self.assertIsInstance(DatasetLike.convert(xr_ds), xr.Dataset)
        self.assertIsInstance(DatasetLike.convert(pd_ds), xr.Dataset)

        with self.assertRaises(ValidationError):
            DatasetLike.convert(42)
Example #5
def select_var(ds: DatasetLike.TYPE, var: VarNamesLike.TYPE = None) -> xr.Dataset:
    """
    Filter the dataset by leaving only the desired variables in it. The original dataset
    information, including original coordinates, is preserved.

    :param ds: The dataset or dataframe from which to perform selection.
    :param var: One or more variable names to select and preserve in the dataset. \
    All of the following forms are valid: 'var_name', 'var_name1,var_name2,var_name3', \
    ['var_name1', 'var_name2']. \
    One can also use wildcards when doing the selection. E.g., choosing 'var_name*' for selection \
    will select all variables that start with 'var_name'. This can be used to select variables \
    along with their auxiliary variables, to select all uncertainty variables, and so on.
    :return: A filtered dataset
    """
    if not var:
        return ds

    ds = DatasetLike.convert(ds)

    var_names = VarNamesLike.convert(var)
    dropped_var_names = list(ds.data_vars.keys())

    for pattern in var_names:
        keep = fnmatch.filter(dropped_var_names, pattern)
        for name in keep:
            dropped_var_names.remove(name)

    return ds.drop_vars(dropped_var_names)
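
A quick sketch of the wildcard behaviour described in the docstring; the variable names are invented:

import numpy as np
import xarray as xr

ds = xr.Dataset({'sst': ('time', np.zeros(3)),
                 'sst_uncertainty': ('time', np.zeros(3)),
                 'chl': ('time', np.zeros(3))})

# 'sst*' keeps 'sst' together with its auxiliary variable and drops 'chl'
subset = select_var(ds, var='sst*')
print(sorted(subset.data_vars))  # ['sst', 'sst_uncertainty']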
Example #6
def extract_point(ds: DatasetLike.TYPE,
                  point: PointLike.TYPE,
                  indexers: DictLike.TYPE = None,
                  tolerance_default: float = 0.01) -> Dict:
    """
    Extract data at the given point location. The returned dict will contain scalar
    values for all variables for which all dimensions have been given in ``indexers``.
    For the dimensions *lon* and *lat* a nearest neighbour lookup is performed.
    All other dimensions must match exactly.

    :param ds: Dataset or dataframe to subset
    :param point: Geographic point given by longitude and latitude
    :param indexers: Optional indexers into the data arrays of *var*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps a variable's dimension names
           to constant labels, e.g. "layer=4".
    :param tolerance_default: The default longitude and latitude tolerance for the nearest neighbour lookup.
           It is only used if the resolution of the dataset cannot be deduced.
    :return: A dict with the scalar values of all variables and the variable names as keys.
    """
    ds = DatasetLike.convert(ds)
    point = PointLike.convert(point)
    indexers = DictLike.convert(indexers) or {}

    lon_lat_indexers = {'lon': point.x, 'lat': point.y}
    tolerance = _get_tolerance(ds, tolerance_default)

    variable_values = {}
    var_names = sorted(ds.data_vars.keys())
    for var_name in var_names:
        if not var_name.endswith('_bnds'):
            variable = ds.data_vars[var_name]
            effective_indexers = {}
            used_dims = {'lat', 'lon'}
            for dim_name, dim_value in indexers.items():
                if dim_name in variable.dims:
                    effective_indexers[dim_name] = dim_value
                    used_dims.add(dim_name)
            if set(variable.dims) == used_dims:
                try:
                    lon_lat_data = variable.sel(**effective_indexers)
                except KeyError:
                    # if there is no exact match for the "additional" dims, skip this variable
                    continue
                try:
                    point_data = lon_lat_data.sel(method='nearest',
                                                  tolerance=tolerance,
                                                  **lon_lat_indexers)
                except KeyError:
                    # if there is no point within the given tolerance, return an empty dict
                    return {}
                if not variable_values:
                    variable_values['lat'] = float(point_data.lat)
                    variable_values['lon'] = float(point_data.lon)
                value = to_scalar(point_data.values, ndigits=3)
                if value is not UNDEFINED:
                    variable_values[var_name] = value
    return variable_values
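
A hypothetical call sketch; it assumes PointLike accepts a "lon, lat" string and DictLike a "key=value" string, and that the module helpers (_get_tolerance, to_scalar) behave as their names suggest:

import numpy as np
import xarray as xr

# A small lon/lat/time dataset with made-up values
ds = xr.Dataset(
    {'sst': (('time', 'lat', 'lon'), np.random.rand(2, 3, 3))},
    coords={'time': [0, 1], 'lat': [0.0, 1.0, 2.0], 'lon': [10.0, 11.0, 12.0]})

# 'time' is neither lon nor lat, so it has to be pinned via 'indexers'
# for 'sst' to reduce to a scalar at the point
values = extract_point(ds, point='11.0, 1.0', indexers='time=0')
print(values)  # e.g. {'lat': 1.0, 'lon': 11.0, 'sst': ...}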
Example #7
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item == 'log':
                    # 'xu' refers to the xarray.ufuncs module (deprecated in
                    # newer xarray in favour of plain numpy ufuncs)
                    retset = xu.log(retset)
                elif item == 'log10':
                    retset = xu.log10(retset)
                elif item == 'log2':
                    retset = xu.log2(retset)
                elif item == 'log1p':
                    retset = xu.log1p(retset)
                elif item == 'exp':
                    retset = xu.exp(retset)
                else:
                    raise ValueError('Arithmetic operation {} not'
                                     ' implemented.'.format(item))

    return retset
Example #8
def ds_arithmetics(ds: DatasetLike.TYPE,
                   op: str,
                   monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do arithmetic operations on the given dataset by providing a list of
    arithmetic operations and the corresponding constant. The operations will
    be applied to the dataset in the order in which they appear in the list.
    For example:
    'log,+5,-2,/3,*2'

    Currently supported arithmetic operations:
    log,log10,log2,log1p,exp,+,-,/,*

    where:
        log - natural logarithm
        log10 - base 10 logarithm
        log2 - base 2 logarithm
        log1p - log(1+x)
        exp - the exponential

    The operations will be applied element-wise to all arrays of the dataset.

    :param ds: The dataset to which to apply arithmetic operations
    :param op: A comma separated list of arithmetic operations to apply
    :param monitor: a progress monitor.
    :return: The dataset with given arithmetic operations applied
    """
    ds = DatasetLike.convert(ds)
    retset = ds
    with monitor.starting('Calculate result', total_work=len(op.split(','))):
        for item in op.split(','):
            with monitor.child(1).observing("Calculate"):
                item = item.strip()
                if item[0] == '+':
                    retset = retset + float(item[1:])
                elif item[0] == '-':
                    retset = retset - float(item[1:])
                elif item[0] == '*':
                    retset = retset * float(item[1:])
                elif item[0] == '/':
                    retset = retset / float(item[1:])
                elif item == 'log':
                    retset = np.log(retset)
                elif item == 'log10':
                    retset = np.log10(retset)
                elif item == 'log2':
                    retset = np.log2(retset)
                elif item == 'log1p':
                    retset = np.log1p(retset)
                elif item == 'exp':
                    retset = np.exp(retset)
                else:
                    raise ValidationError('Arithmetic operation {} not'
                                          ' implemented.'.format(item))

    return retset
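
A small usage sketch that works with either version above; operations in the 'op' string apply left to right:

import numpy as np
import xarray as xr

ds = xr.Dataset({'var': ('x', np.array([1.0, 10.0, 100.0]))})

# First log10, then multiply by 2: [0, 1, 2] -> [0, 2, 4]
result = ds_arithmetics(ds, op='log10, *2')
print(result['var'].values)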
Example #9
def subset_temporal(ds: DatasetLike.TYPE,
                    time_range: TimeRangeLike.TYPE) -> xr.Dataset:
    """
    Do a temporal subset of the dataset.

    :param ds: Dataset or dataframe to subset
    :param time_range: Time range to select
    :return: Subset dataset
    """
    ds = DatasetLike.convert(ds)
    time_range = TimeRangeLike.convert(time_range)
    return adjust_temporal_attrs(subset_temporal_impl(ds, time_range))
Example #10
def subset_temporal_index(ds: DatasetLike.TYPE, time_ind_min: int,
                          time_ind_max: int) -> xr.Dataset:
    """
    Do a temporal indices based subset

    :param ds: Dataset or dataframe to subset
    :param time_ind_min: Minimum time index to select
    :param time_ind_max: Maximum time index to select
    :return: Subset dataset
    """
    ds = DatasetLike.convert(ds)
    return subset_temporal_index_impl(ds, time_ind_min, time_ind_max)
Example #11
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution. Running'
            ' the adjust_temporal_attrs operation beforehand may'
            ' help.')

    var = VarNamesLike.convert(var)

    if var:
        ds = select_var(ds, var)

    if t_resolution == 'P1D':
        return _lta_daily(ds)
    elif t_resolution == 'P1M':
        return _lta_monthly(ds, monitor)
    else:
        return _lta_general(ds, monitor)
Example #12
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. If no variables
    are given explicitly, the reduction can be restricted to variables
    featuring numeric values.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: reduction method
    :param monitor: A progress monitor
    """
    ufuncs = {
        'min': np.nanmin,
        'max': np.nanmax,
        'mean': np.nanmean,
        'median': np.nanmedian,
        'sum': np.nansum
    }

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [
            value for value in dim if value in retset[var_name].dims
        ]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
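
A minimal sketch of reducing along a single dimension; the data are invented and VarNamesLike/DimNamesLike are assumed to accept plain strings:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'tas': (('time', 'lat'), np.array([[1.0, 2.0], [3.0, 4.0]]))},
    coords={'time': [0, 1], 'lat': [10.0, 20.0]})

# Average over 'time' only; the 'lat' dimension is preserved
result = reduce(ds, var='tas', dim='time', method='mean')
print(result['tas'].values)  # [2. 3.]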
Example #13
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Long term average operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    var = VarNamesLike.convert(var)
    # Shallow
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    if t_resolution == 'P1D':
        return _lta_daily(retset, monitor)
    elif t_resolution == 'P1M':
        return _lta_monthly(retset, monitor)
    else:
        return _lta_general(retset, monitor)
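
A hypothetical fragment, assuming 'ds' is a normalized dataset whose 'time_coverage_resolution' attribute was set beforehand (e.g. by adjust_temporal_attrs):

ds.attrs['time_coverage_resolution'] = 'P1M'    # monthly input
climatology = long_term_average(ds, var='tas')  # -> monthly climatology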
Example #14
def plot(ds: DatasetLike.TYPE,
         var: VarName.TYPE,
         indexers: DictLike.TYPE = None,
         title: str = None,
         properties: DictLike.TYPE = None,
         file: str = None) -> Figure:
    """
    Create a 1D/line or 2D/image plot of a variable given by dataset *ds* and variable name *var*.

    :param ds: Dataset or Dataframe that contains the variable named by *var*.
    :param var: The name of the variable to plot
    :param indexers: Optional indexers into the data array of *var*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels, e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    var_name = VarName.convert(var)
    if not var_name:
        raise ValidationError("Missing name for 'var'")
    var = ds[var_name]

    indexers = DictLike.convert(indexers)
    properties = DictLike.convert(properties) or {}

    figure = plt.figure()
    ax = figure.add_subplot(111)

    var_data = get_var_data(var, indexers)
    var_data.plot(ax=ax, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
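
A hypothetical call sketch; 'ds' stands for a dataset holding the variable, and indexers/properties are passed as the comma-separated strings DictLike accepts:

# Plot the 'tas' series at a fixed latitude and save the figure;
# 'properties' is forwarded to matplotlib
fig = plot(ds, 'tas', indexers="lat=12.4",
           title='Air temperature', properties="color='red'",
           file='tas.png')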
Example #15
def plot(ds: xr.Dataset,
         var: VarName.TYPE,
         indexers: DictLike.TYPE = None,
         title: str = None,
         properties: DictLike.TYPE = None,
         file: str = None) -> Figure:
    """
    Create a 1D/line or 2D/image plot of a variable given by dataset *ds* and variable name *var*.

    :param ds: Dataset or Dataframe that contains the variable named by *var*.
    :param var: The name of the variable to plot
    :param indexers: Optional indexers into the data array of *var*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels, e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :param properties: optional plot properties for Python matplotlib,
           e.g. "bins=512, range=(-1.5, +1.5), label='Sea Surface Temperature'"
           For full reference refer to
           https://matplotlib.org/api/lines_api.html and
           https://matplotlib.org/devdocs/api/_as_gen/matplotlib.patches.Patch.html#matplotlib.patches.Patch
    :param file: path to a file in which to save the plot
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    var_name = VarName.convert(var)
    if not var_name:
        raise ValueError("Missing value for 'var'")
    var = ds[var_name]

    indexers = DictLike.convert(indexers)
    properties = DictLike.convert(properties) or {}

    figure = plt.figure()
    ax = figure.add_subplot(111)

    var_data = _get_var_data(var, indexers)
    var_data.plot(ax=ax, **properties)

    if title:
        ax.set_title(title)

    figure.tight_layout()

    if file:
        figure.savefig(file)

    return figure if not in_notebook() else None
Example #16
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform monthly aggregation of a daily dataset according to the given
    method.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param monitor: A progress monitor.
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a daily dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1D':
            raise ValueError(
                'Temporal aggregation operation expects a daily dataset')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    with monitor.observing("resample dataset"):
        retset = ds.resample(freq='MS',
                             dim='time',
                             keep_attrs=True,
                             how=method)

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + \
                    ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
Example #17
def sel(ds: DatasetLike.TYPE,
        point: PointLike.TYPE = None,
        time: TimeLike.TYPE = None,
        indexers: DictLike.TYPE = None,
        method: str = 'nearest') -> xr.Dataset:
    """
    Return a new dataset with each array indexed by tick labels along the specified dimension(s).

    This is a wrapper for the ``xarray.Dataset.sel()`` method.

    For documentation refer to xarray documentation at
    http://xarray.pydata.org/en/stable/generated/xarray.Dataset.sel.html#xarray.Dataset.sel

    :param ds: The dataset from which to select.
    :param point: Optional geographic point given by longitude and latitude
    :param time: Optional time
    :param indexers: Keyword arguments with names matching dimensions and values given by scalars,
           slices or arrays of tick labels. For dimensions with multi-index, the indexer may also be
           a dict-like object with keys matching index level names.
    :param method: Method to use for inexact matches:
           * None: only exact matches
           * ``pad`` / ``ffill``: propagate last valid index value forward
           * ``backfill`` / ``bfill``: propagate next valid index value backward
           * ``nearest`` (default): use nearest valid index value
    :return: A new Dataset with the same contents as this dataset, except each variable and dimension
             is indexed by the appropriate indexers. In general, each variable's data will be a view of the
             variable's data in this dataset.
    """
    ds = DatasetLike.convert(ds)
    point = PointLike.convert(point)
    time = TimeLike.convert(time)
    indexers = DictLike.convert(indexers)
    indexers = dict(indexers or {})
    if point is not None:
        indexers.setdefault('lon', point.x)
        indexers.setdefault('lat', point.y)
    if time is not None:
        indexers.setdefault('time', time)
    # Filter out non-existent coordinates
    indexers = {
        name: value
        for name, value in indexers.items() if name in ds.coords
    }
    return ds.sel(method=method, **indexers)
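
A short sketch; PointLike is assumed to accept a "lon, lat" string, and the unknown 'depth' indexer is silently dropped because it is not a coordinate of the dataset:

import numpy as np
import xarray as xr

ds = xr.Dataset(
    {'sst': (('time', 'lat', 'lon'), np.zeros((2, 2, 2)))},
    coords={'time': [0, 1], 'lat': [0.0, 1.0], 'lon': [10.0, 11.0]})

# Nearest-neighbour selection collapses lat/lon to scalars
subset = sel(ds, point='10.2, 0.9', indexers='depth=5')
print(subset['sst'].dims)  # ('time',)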
Example #18
def temporal_aggregation(ds: DatasetLike.TYPE,
                         method: str = 'mean',
                         output_resolution: str = 'month',
                         custom_resolution: str = None,
                         monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform aggregation of dataset according to the given
    method and output resolution.

    Note that the operation does not perform weighting. Depending on the
    combination of input and output resolutions, as well as aggregation
    method, the resulting dataset might yield unexpected results.

    Resolution 'month' will result in a monthly dataset with each month
    denoted by its first date. Resolution 'season' will result in a dataset
    aggregated to DJF, MAM, JJA, SON seasons, each denoted by the first
    date of the season.

    The operation also works with custom resolution strings, see:
    http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
    If ``custom_resolution`` is provided, it will override ``output_resolution``.

    Some examples:
      'QS-JUN' produces an output dataset on a quarterly resolution where the
      year ends on the 1st of June and each quarter is denoted by its first date.
      '8MS' produces an output dataset on an eight-month resolution where each
      period is denoted by its first date. Note that such periods will not be
      consistent over years.
      '8D' produces a dataset on an eight-day resolution.

    :param ds: Dataset to aggregate
    :param method: Aggregation method
    :param output_resolution: Desired temporal resolution of the output dataset
    :param custom_resolution: Custom temporal resolution, overrides output_resolution
    :param monitor: A progress monitor.
    :return: Aggregated dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError(
            'Temporal aggregation operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Try to figure out the input frequency
    try:
        in_freq = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError(
            'Could not determine temporal resolution of input dataset.'
            ' Running the adjust_temporal_attrs operation beforehand may'
            ' help.')

    if custom_resolution:
        freq = custom_resolution
    else:
        frequencies = {'month': 'MS', 'season': 'QS-DEC'}
        freq = frequencies[output_resolution]

    _validate_freq(in_freq, freq)

    with monitor.observing("resample dataset"):
        try:
            retset = getattr(resampler, method)(ds.resample(time=freq,
                                                            keep_attrs=True))
        except AttributeError:
            raise ValidationError(
                f'Provided aggregation method {method} is not valid.')

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                retset[var].attrs['cell_methods'] + \
                ' time: {} within years'.format(method)
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: {} within years'.format(
                method)

    return adjust_temporal_attrs(retset)
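
Hypothetical calls matching the docstring examples; 'ds' stands for a normalized dataset whose 'time_coverage_resolution' attribute identifies a finer resolution than the requested output:

monthly = temporal_aggregation(ds, method='mean')            # e.g. daily -> monthly
seasonal = temporal_aggregation(ds, method='mean',
                                output_resolution='season')  # DJF/MAM/JJA/SON
eight_day = temporal_aggregation(ds, method='sum',
                                 custom_resolution='8D')     # custom frequency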
Example #19
def pearson_correlation(ds_x: DatasetLike.TYPE,
                        ds_y: DatasetLike.TYPE,
                        var_x: VarName.TYPE,
                        var_y: VarName.TYPE,
                        monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Perform Pearson correlation on two datasets and produce a lon/lat map of
    correlation coefficients and the corresponding p_values.

    In case two 3D lon/lat/time datasets are provided, a pixel by pixel
    correlation will be performed. It is also possible to provide a
    combination of a 3D lon/lat/time dataset and a 1D timeseries, in which
    case the timeseries is correlated with the timeseries of each pixel,
    again producing a lat/lon map of correlation coefficients and p_values.

    The lat/lon definition of both datasets has to be the same. The length of
    the time dimension should be equal, but not necessarily have the same
    definition. E.g., it is possible to correlate different times of the same
    area.

    There are 'x' and 'y' datasets. Positive correlations imply that as x
    grows, so does y. Negative correlations imply that as x increases, y
    decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'variable' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'dependent' dataset
    :param monitor: a progress monitor.
    :return: a dataset containing a map of correlation coefficients and p_values
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    # Further validate inputs
    if array_x.dims == array_y.dims:
        if len(array_x.dims) != 3 or len(array_y.dims) != 3:
            raise ValueError(
                'A correlation coefficient map can only be produced'
                ' if both provided datasets are 3D datasets with'
                ' lon/lat/time dimensionality, or if a combination'
                ' of a 3D lon/lat/time dataset and a 1D timeseries'
                ' is provided.')

        if array_x.values.shape != array_y.values.shape:
            raise ValueError('The provided variables {} and {} do not have the'
                             ' same shape, Pearson correlation can not be'
                             ' performed. Please review operation'
                             ' documentation'.format(var_x, var_y))

        if (not ds_x['lat'].equals(ds_y['lat'])
                or not ds_x['lon'].equals(ds_y['lon'])):
            raise ValueError('When performing a pixel by pixel correlation the'
                             ' datasets have to have the same lat/lon'
                             ' definition. Consider running coregistration'
                             ' first')

    elif (((len(array_x.dims) == 3) and (len(array_y.dims) != 1))
          or ((len(array_x.dims) == 1) and (len(array_y.dims) != 3))
          or ((len(array_x.dims) != 3) and (len(array_y.dims) == 1))
          or ((len(array_x.dims) != 1) and (len(array_y.dims) == 3))):
        raise ValueError('A correlation coefficient map can only be produced'
                         ' if both provided datasets are 3D datasets with'
                         ' lon/lat/time dimensionality, or if a combination'
                         ' of a 3D lon/lat/time dataset and a 1D timeseries'
                         ' is provided.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValueError('The length of the time dimension differs between'
                         ' the given datasets. Cannot perform the calculation;'
                         ' please review the operation documentation.')

    if len(array_x['time']) < 3:
        raise ValueError('The length of the time dimension should not be less'
                         ' than three to run the calculation.')

    # Do pixel by pixel correlation
    retset = _pearsonr(array_x, array_y, monitor)
    retset.attrs['Cate_Description'] = 'Correlation between {} and {}'.format(
        var_y, var_x)

    return adjust_spatial_attrs(retset)
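
A minimal usage sketch for the pixel-by-pixel case (hypothetical data; the operation is the pearson_correlation shown in full in Example #32, and the variable names 'tas' and 'pr' are assumptions):

import numpy as np
import pandas as pd
import xarray as xr

# Two hypothetical, perfectly aligned time/lat/lon datasets (24 monthly steps).
time = pd.date_range('2000-01-01', periods=24, freq='MS')
coords = {'time': time, 'lat': [0.0, 10.0], 'lon': [0.0, 10.0]}
ds_x = xr.Dataset({'tas': (['time', 'lat', 'lon'],
                           np.random.rand(24, 2, 2))}, coords=coords)
ds_y = xr.Dataset({'pr': (['time', 'lat', 'lon'],
                          np.random.rand(24, 2, 2))}, coords=coords)

# Produces one correlation coefficient and one p_value per lat/lon pixel.
corr_map = pearson_correlation(ds_x, ds_y, var_x='tas', var_y='pr')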
Example #30
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True, the input dataset should not contain NaN values; otherwise
    all existing NaN values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset on which to perform outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wildcards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less than or equal to this will be removed/masked
    :param threshold_high: Values greater than or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Resolve the quantile fractions into absolute threshold
                # values. Use local names so the input fractions are not
                # overwritten and each variable gets its own quantiles.
                with monitor.child(1).observing("quantile low"):
                    t_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    t_high = ret_ds[var_name].quantile(threshold_high)
            else:
                t_low, t_high = threshold_low, threshold_high
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > t_low)
                                             & (arr < t_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this
                # data variable
                _mask_outliers(ret_ds, var_name, t_low, t_high)
            monitor.progress(1)

    return ret_ds
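
A short usage sketch (hypothetical data; the variable name 'sst' is an assumption):

import numpy as np
import xarray as xr

# Hypothetical series with two obvious outliers appended at the end.
values = np.concatenate([np.random.rand(100), [50.0, -50.0]])
ds = xr.Dataset({'sst': (['time'], values)})

# Quantile mode (default): values outside the 5th..95th percentile become NaN.
cleaned = detect_outliers(ds, var='sst')

# Absolute mode with a mask: the data stays intact and a flag variable is
# added instead; note the wildcard pattern selecting the variable.
flagged = detect_outliers(ds, var='s*', quantiles=False,
                          threshold_low=-10.0, threshold_high=10.0, mask=True)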
Example #31
def plot_line(ds: DatasetLike.TYPE,
              var_names: VarNamesLike.TYPE,
              fmt: str = None,
              label: DimName.TYPE = None,
              indexers: DictLike.TYPE = None,
              title: str = None,
              file: str = None) -> Figure:
    """
    Create a 1D/line plot of variable(s) given by dataset *ds* and variable name(s) *var_names*.

    :param ds: Dataset or Dataframe that contains the variable(s) named by *var_names*.
    :param var_names: The name of the variable(s) to plot
    :param fmt: optional semicolon-separated matplotlib format strings,
           e.g.
           1 variable - "b.-"
           2 variables - "b.-;r+:"
           If fewer format strings than selected variables are given, the
           formats are cycled through again from the beginning.
           For a full reference on the matplotlib plot() function, refer to
           https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
    :param file: path to a file in which to save the plot
    :param label: dimension name to be selected as the x-axis of the plot
    :param indexers: Optional indexers into the data arrays of *var_names*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels, e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    fmt_count = 0
    fmt_list = []

    if fmt:
        fmt_list = fmt.split(";")
        fmt_count = len(fmt_list)

    if not var_names:
        raise ValidationError("Missing name for 'vars'")

    figure = plt.figure()
    ax = figure.add_subplot(111)
    figure.subplots_adjust(right=0.65)

    var_names = VarNamesLike.convert(var_names)
    if not title:
        if label:
            title = ','.join(var_names) + ' over ' + label
        else:
            title = ','.join(var_names)
    if indexers:
        title = title + '\n' + ' at ' + json.dumps(indexers).strip('"')
    ax.set_title(title)

    indexers = DictLike.convert(indexers)

    ax_var = {}
    var_count = len(var_names)
    predefined_fmt = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    if label:
        ds = get_vars_data(ds, indexers, remaining_dims=[label])
    else:
        ds = get_vars_data(ds, indexers)

    for i in range(var_count):
        var_name = var_names[i]
        var = ds[var_name]
        if len(var.dims) > 1:
            raise ValidationError(f'Unable to plot because variable {var_name} has more than one dimension: {var.dims}.'
                                  f' To specify value(s) of these dimension(s), please use the indexers.')

        var_label = var_name + ' (' + var.attrs['units'] + ')' if 'units' in var.attrs else var_name
        properties_dict = {}

        if fmt is None:
            selected_fmt = predefined_fmt[i % len(predefined_fmt)]
        else:
            selected_fmt = fmt_list[i % fmt_count]

        if label:
            x_axis = var[label]
        elif 'time' in var:
            x_axis = var.time
        else:
            x_axis = []
        # the first variable uses the primary y-axis; subsequent variables get their own twinx axes
        if i == 0:
            if len(x_axis) > 0:
                ax.plot(x_axis, var, selected_fmt, **properties_dict)
            else:
                ax.plot(var, selected_fmt, **properties_dict)
            ax.set_ylabel(var_label, wrap=True)
            ax.yaxis.label.set_color(selected_fmt[0])
            ax.tick_params(axis='y', colors=selected_fmt[0])
        else:
            ax_var[var_name] = ax.twinx()
            if len(ax_var) > 1:
                ax_var[var_name].spines["right"].set_position(("axes", 1 + ((i - 1) * 0.2)))
                ax_var[var_name].set_frame_on(True)
                ax_var[var_name].patch.set_visible(False)
            if len(x_axis) > 0:
                ax_var[var_name].plot(x_axis, var, selected_fmt, **properties_dict)
            else:
                ax_var[var_name].plot(var, selected_fmt, **properties_dict)
            ax_var[var_name].set_ylabel(var_label, wrap=True)
            ax_var[var_name].yaxis.label.set_color(selected_fmt[0])
            ax_var[var_name].tick_params(axis='y', colors=selected_fmt[0])

    ax.tick_params(axis='x', rotation=45)
    if label in ds and 'long_name' in ds[label].attrs:
        ax.set_xlabel(ds[label].attrs['long_name'])
    figure.tight_layout()

    if file:
        figure.savefig(file, dpi=600)

    return figure if not in_notebook() else None
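
A usage sketch (hypothetical dataset; the variable names, indexer value, and output path are assumptions, and get_vars_data is assumed to resolve the indexers as the docstring describes):

import numpy as np
import pandas as pd
import xarray as xr

ds = xr.Dataset(
    {'tas': (['time', 'lat'], np.random.rand(12, 3)),
     'pr': (['time', 'lat'], np.random.rand(12, 3))},
    coords={'time': pd.date_range('2010-01-01', periods=12, freq='MS'),
            'lat': [0.0, 10.0, 20.0]})

# One format per variable; the indexer pins 'lat' so each variable is 1D
# over the 'time' label dimension.
fig = plot_line(ds, var_names=['tas', 'pr'], fmt='b.-;r+:',
                label='time', indexers="lat=10.0", file='lines.png')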
Example #32
def pearson_correlation(ds_x: DatasetLike.TYPE,
                        ds_y: DatasetLike.TYPE,
                        var_x: VarName.TYPE,
                        var_y: VarName.TYPE,
                        monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Do product moment `Pearson's correlation <http://www.statsoft.com/Textbook/Statistics-Glossary/P/button/p#Pearson%20Correlation>`_ analysis.

    Perform Pearson correlation on two datasets and produce a lon/lat map of
    correlation coefficients and the corresponding p_values.

    If two 3D lon/lat/time datasets are provided, a pixel-by-pixel correlation
    of the underlying timeseries is performed. Alternatively, a combination of
    a 3D lon/lat/time dataset and a 1D timeseries may be provided, in which
    case each pixel's timeseries is correlated against the 1D timeseries.

    The lat/lon definition of both datasets has to be the same. The time
    dimensions must be of equal length, but need not have the same
    definition; e.g., it is possible to correlate different time periods of
    the same area.

    There are 'x' and 'y' datasets. Positive correlations imply that as x
    grows, so does y. Negative correlations imply that as x increases, y
    decreases.

    For more information on how to interpret the results, see
    `here <http://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/regression/how-to/correlation/interpret-the-results/>`_,
    and `here <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html>`_.

    :param ds_x: The 'x' dataset
    :param ds_y: The 'y' dataset
    :param var_x: Dataset variable to use for correlation analysis in the 'x' dataset
    :param var_y: Dataset variable to use for correlation analysis in the 'y' dataset
    :param monitor: a progress monitor.
    :return: a dataset containing a map of correlation coefficients and p_values
    """
    ds_x = DatasetLike.convert(ds_x)
    ds_y = DatasetLike.convert(ds_y)
    var_x = VarName.convert(var_x)
    var_y = VarName.convert(var_y)

    array_y = ds_y[var_y]
    array_x = ds_x[var_x]

    # Further validate inputs
    if array_x.dims == array_y.dims:
        if len(array_x.dims) != 3 or len(array_y.dims) != 3:
            raise ValidationError('A correlation coefficient map can only be produced'
                                  ' if both provided datasets are 3D datasets with'
                                  ' lon/lat/time dimensionality, or if a combination'
                                  ' of a 3D lon/lat/time dataset and a 1D timeseries'
                                  ' is provided.')

        if array_x.values.shape != array_y.values.shape:
            raise ValidationError(f'The provided variables {var_x} and {var_y} do not have the'
                                  ' same shape; Pearson correlation cannot be'
                                  ' performed. Please review the operation'
                                  ' documentation.')

        if (not ds_x['lat'].equals(ds_y['lat']) or not ds_x['lon'].equals(ds_y['lon'])):
            raise ValidationError('When performing a pixel by pixel correlation the'
                                  ' datasets have to have the same lat/lon'
                                  ' definition. Consider running coregistration'
                                  ' first')

    elif (((len(array_x.dims) == 3) and (len(array_y.dims) != 1))
          or ((len(array_x.dims) == 1) and (len(array_y.dims) != 3))
          or ((len(array_x.dims) != 3) and (len(array_y.dims) == 1))
          or ((len(array_x.dims) != 1) and (len(array_y.dims) == 3))):
        raise ValidationError('A correlation coefficient map can only be produced'
                              ' if both provided datasets are 3D datasets with'
                              ' lon/lat/time dimensionality, or if a combination'
                              ' of a 3D lon/lat/time dataset and a 1D timeseries'
                              ' is provided.')

    if len(array_x['time']) != len(array_y['time']):
        raise ValidationError('The length of the time dimension differs between'
                              ' the given datasets. Cannot perform the calculation;'
                              ' please review the operation documentation.')

    if len(array_x['time']) < 3:
        raise ValidationError('The length of the time dimension should not be less'
                              ' than three to run the calculation.')

    # Do pixel by pixel correlation
    retset = _pearsonr(array_x, array_y, monitor)
    retset.attrs['Cate_Description'] = f'Correlation between {var_y} and {var_x}'

    return adjust_spatial_attrs(retset)
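
The dimensionality check above also admits a 3D map paired with a 1D timeseries. A sketch of that case (hypothetical data; _pearsonr is assumed to broadcast the series across pixels, as the check implies):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', periods=24, freq='MS')
ds_map = xr.Dataset({'tas': (['time', 'lat', 'lon'],
                             np.random.rand(24, 2, 2))},
                    coords={'time': time, 'lat': [0.0, 10.0], 'lon': [0.0, 10.0]})
ds_ts = xr.Dataset({'index': (['time'], np.random.rand(24))},
                   coords={'time': time})

# Each pixel's timeseries in ds_map is correlated against the single series.
corr_map = pearson_correlation(ds_map, ds_ts, var_x='tas', var_y='index')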
Example #33
def plot_line(ds: DatasetLike.TYPE,
              var_names: VarNamesLike.TYPE,
              fmt: str = None,
              label: DimName.TYPE = None,
              indexers: DictLike.TYPE = None,
              title: str = None,
              file: str = None) -> Figure:
    """
    Create a 1D/line plot of variable(s) given by dataset *ds* and variable name(s) *var_names*.

    :param ds: Dataset or Dataframe that contains the variable(s) named by *var_names*.
    :param var_names: The name of the variable(s) to plot
    :param fmt: optional semicolon-separated matplotlib format strings,
           e.g.
           1 variable - "b.-"
           2 variables - "b.-;r+:"
           If fewer format strings than selected variables are given, the
           formats are cycled through again from the beginning.
           For a full reference on the matplotlib plot() function, refer to
           https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
    :param file: path to a file in which to save the plot
    :param label: dimension name to be selected as the x-axis of the plot
    :param indexers: Optional indexers into the data arrays of *var_names*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels, e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    fmt_count = 0
    fmt_list = []

    if fmt:
        fmt_list = fmt.split(";")
        fmt_count = len(fmt_list)

    if not var_names:
        raise ValidationError("Missing name for 'vars'")

    figure = plt.figure()
    ax = figure.add_subplot(111)
    figure.subplots_adjust(right=0.65)

    var_names = VarNamesLike.convert(var_names)
    if not title:
        if label:
            title = ','.join(var_names) + ' over ' + label
        else:
            title = ','.join(var_names)
    if indexers:
        title = title + '\n' + ' at ' + json.dumps(indexers).strip('"')
    ax.set_title(title)

    indexers = DictLike.convert(indexers)

    ax_var = {}
    var_count = len(var_names)
    predefined_fmt = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    if label:
        ds = get_vars_data(ds, indexers, remaining_dims=[label])
    else:
        ds = get_vars_data(ds, indexers)

    for i in range(var_count):
        var_name = var_names[i]
        var = ds[var_name]
        if len(var.dims) > 1:
            raise ValidationError(
                f'Unable to plot because variable {var_name} has more than one dimension: {var.dims}.'
                f' To specify value(s) of these dimension(s), please use the indexers.'
            )

        var_label = var_name + ' (' + var.attrs['units'] + ')' if 'units' in var.attrs else var_name
        properties_dict = {}

        if fmt is None:
            selected_fmt = predefined_fmt[i % len(predefined_fmt)]
        else:
            selected_fmt = fmt_list[i % fmt_count]

        if label:
            x_axis = var[label]
        elif 'time' in var:
            x_axis = var.time
        else:
            x_axis = []
        # the first variable uses the primary y-axis; subsequent variables get their own twinx axes
        if i == 0:
            if len(x_axis) > 0:
                ax.plot(x_axis, var, selected_fmt, **properties_dict)
            else:
                ax.plot(var, selected_fmt, **properties_dict)
            ax.set_ylabel(var_label, wrap=True)
            ax.yaxis.label.set_color(selected_fmt[0])
            ax.tick_params(axis='y', colors=selected_fmt[0])
        else:
            ax_var[var_name] = ax.twinx()
            if len(ax_var) > 1:
                ax_var[var_name].spines["right"].set_position(
                    ("axes", 1 + ((i - 1) * 0.2)))
                ax_var[var_name].set_frame_on(True)
                ax_var[var_name].patch.set_visible(False)
            if len(x_axis) > 0:
                ax_var[var_name].plot(x_axis, var, selected_fmt,
                                      **properties_dict)
            else:
                ax_var[var_name].plot(var, selected_fmt, **properties_dict)
            ax_var[var_name].set_ylabel(var_label, wrap=True)
            ax_var[var_name].yaxis.label.set_color(selected_fmt[0])
            ax_var[var_name].tick_params(axis='y', colors=selected_fmt[0])

    ax.tick_params(axis='x', rotation=45)
    if label in ds and 'long_name' in ds[label].attrs:
        ax.set_xlabel(ds[label].attrs['long_name'])
    figure.tight_layout()

    if file:
        figure.savefig(file, dpi=600)

    return figure if not in_notebook() else None
Example #34
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a long term average of the given dataset by taking the mean of
    monthly values over the time range covered by the dataset. E.g., it
    averages all January values, all February values, etc., to create a
    dataset with twelve time slices, each containing the mean of the
    respective monthly values.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A monthly dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if ds.time.dtype != 'datetime64[ns]':
        raise ValueError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a monthly dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValueError(
                'Long term average operation expects a monthly dataset.'
                ' Running temporal aggregation on this dataset'
                ' beforehand may help.')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    var = VarNamesLike.convert(var)
    # Shallow copy of the input dataset
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    time_min = pd.Timestamp(ds.time.values[0])
    time_max = pd.Timestamp(ds.time.values[-1])

    total_work = 100

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS',
                                   periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
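
A usage sketch (hypothetical monthly data; the 'time_coverage_resolution' attribute is set by hand here, where normally adjust_temporal_attrs would supply it):

import numpy as np
import pandas as pd
import xarray as xr

# Three years of hypothetical monthly means.
time = pd.date_range('2000-01-01', periods=36, freq='MS')
ds = xr.Dataset({'tas': (['time'], np.random.rand(36))},
                coords={'time': time})
ds.attrs['time_coverage_resolution'] = 'P1M'

lta = long_term_average(ds)
print(lta.time.size)  # 12 -- one slice per calendar month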
Example #35
    def test_format(self):
        self.assertEqual(DatasetLike.format(None), '')

        with self.assertRaises(ValidationError):
            data = {'v1': [4, 5, 6], 'v2': [6, 7, 8]}
            DatasetLike.format(xr.Dataset(data_vars=data))
Example #37
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True, the input dataset should not contain NaN values; otherwise
    all existing NaN values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset on which to perform outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wildcards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less than or equal to this will be removed/masked
    :param threshold_high: Values greater than or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Resolve the quantile fractions into absolute threshold
                # values. Use local names so the input fractions are not
                # overwritten and each variable gets its own quantiles.
                with monitor.child(1).observing("quantile low"):
                    t_low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    t_high = ret_ds[var_name].quantile(threshold_high)
            else:
                t_low, t_high = threshold_low, threshold_high
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > t_low) & (arr < t_high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this
                # data variable
                _mask_outliers(ret_ds, var_name, t_low, t_high)
            monitor.progress(1)

    return ret_ds