Example No. 1
    def test_convert(self):
        expected = ['aa', 'b*', 'cc']
        actual = VarNamesLike.convert('aa,b*,cc')
        self.assertEqual(actual, expected)

        with self.assertRaises(ValidationError) as err:
            VarNamesLike.convert(['aa', 1, 'bb'])
        self.assertEqual(str(err.exception), 'List of variables names expected.')
        self.assertEqual(None, VarNamesLike.convert(None))
Example No. 2
def select_var(ds: DatasetLike.TYPE, var: VarNamesLike.TYPE = None) -> xr.Dataset:
    """
    Filter the dataset by keeping only the desired variables in it. The original dataset
    information, including original coordinates, is preserved.

    :param ds: The dataset or dataframe from which to perform selection.
    :param var: One or more variable names to select and preserve in the dataset. \
    All of the following forms are valid: 'var_name', 'var_name1,var_name2,var_name3', ['var_name1', 'var_name2']. \
    One can also use wildcards when doing the selection. E.g., choosing 'var_name*' for selection \
    will select all variables that start with 'var_name'. This can be used to select variables \
    along with their auxiliary variables, to select all uncertainty variables, and so on.
    :return: A filtered dataset
    """
    if not var:
        return ds

    ds = DatasetLike.convert(ds)

    var_names = VarNamesLike.convert(var)
    dropped_var_names = list(ds.data_vars.keys())

    for pattern in var_names:
        keep = fnmatch.filter(dropped_var_names, pattern)
        for name in keep:
            dropped_var_names.remove(name)

    return ds.drop(dropped_var_names)
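A minimal usage sketch of select_var (the import path and the variable names below are assumptions for illustration, not taken from the listing):

import numpy as np
import xarray as xr
from cate.ops.select import select_var  # module path assumed

ds = xr.Dataset({'sm': ('time', np.random.rand(3)),
                 'sm_uncertainty': ('time', np.random.rand(3)),
                 'flags': ('time', np.zeros(3))})
# Keep 'sm' together with its auxiliary variables via a wildcard pattern
subset = select_var(ds, var='sm*')
print(list(subset.data_vars))  # ['sm', 'sm_uncertainty']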
Example No. 3
def data_frame_subset(gdf: gpd.GeoDataFrame,
                      region_op: str = 'intersects',
                      region: PolygonLike.TYPE = None,
                      var_names: VarNamesLike.TYPE = None) -> gpd.GeoDataFrame:
    """
    Create a GeoDataFrame subset from given variables (data frame columns) and/or region.

    :param gdf: A GeoDataFrame.
    :param region_op: The geometric operation to be performed if *region* is given.
    :param region: A region polygon used to filter rows.
    :param var_names: The variables (columns) to select.
    :return: A GeoDataFrame subset.
    """

    region = PolygonLike.convert(region)

    var_names = VarNamesLike.convert(var_names)

    if not var_names and not region:
        return gdf

    if var_names:
        if 'geometry' not in var_names:
            var_names = ['geometry'] + var_names
        gdf = gdf[var_names]

    if region and region_op:
        geom_str = PolygonLike.format(region)
        gdf = data_frame_query(gdf, f'@{region_op}("{geom_str}")')

    return gdf
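A hypothetical call sketch for data_frame_subset (the import path, column names, and bounding box are invented for illustration; the "10,20,30,40" string follows the PolygonLike form used elsewhere in these examples):

import geopandas as gpd
from shapely.geometry import Point
from cate.ops.data_frame import data_frame_subset  # module path assumed

gdf = gpd.GeoDataFrame({'population': [100, 200],
                        'area': [1.5, 2.5],
                        'geometry': [Point(15, 25), Point(55, 65)]})
# Keep only the 'population' column ('geometry' is re-added automatically)
# and the rows that intersect the given lon/lat bounding box
subset = data_frame_subset(gdf,
                           region_op='intersects',
                           region='10,20,30,40',
                           var_names='population')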
Example No. 4
 def test_accepts(self):
     self.assertTrue(VarNamesLike.accepts('aa'))
     self.assertTrue(VarNamesLike.accepts('aa,bb,cc'))
     self.assertTrue(VarNamesLike.accepts(['aa', 'bb', 'cc']))
     self.assertFalse(VarNamesLike.accepts(1.0))
     self.assertFalse(VarNamesLike.accepts([1, 2, 4]))
     self.assertFalse(VarNamesLike.accepts(['aa', 2, 'bb']))
Example No. 5
    def make_local(self,
                   local_name: str,
                   local_id: str = None,
                   time_range: TimeRangeLike.TYPE = None,
                   region: PolygonLike.TYPE = None,
                   var_names: VarNamesLike.TYPE = None,
                   monitor: Monitor = Monitor.NONE) -> Optional[DataSource]:

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            add_to_data_store_registry()
            local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            raise ValueError('Cannot initialize `local` DataStore')

        _uuid = LocalDataStore.generate_uuid(ref_id=self.id, time_range=time_range, region=region, var_names=var_names)

        if not local_name or len(local_name) == 0:
            local_name = "local.{}.{}".format(self.id, _uuid)
            existing_ds_list = local_store.query(ds_id=local_name)
            if len(existing_ds_list) == 1:
                return existing_ds_list[0]
        else:
            existing_ds_list = local_store.query(ds_id='local.%s' % local_name)
            if len(existing_ds_list) == 1:
                if existing_ds_list[0].meta_info.get('uuid', None) == _uuid:
                    return existing_ds_list[0]
                else:
                    raise ValueError('Datastore {} already contains dataset {}'.format(local_store.id, local_name))

        local_meta_info = self.meta_info.copy()
        local_meta_info['ref_uuid'] = local_meta_info.get('uuid', None)
        local_meta_info['uuid'] = _uuid

        local_ds = local_store.create_data_source(local_name, region, local_name,
                                                  time_range=time_range, var_names=var_names,
                                                  meta_info=local_meta_info)
        if local_ds:
            if not local_ds.is_complete:
                self._make_local(local_ds, time_range, region, var_names, monitor=monitor)

            if local_ds.is_empty:
                local_store.remove_data_source(local_ds)
                return None

            local_store.register_ds(local_ds)
            return local_ds
        return None
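A hedged usage sketch for make_local (assumes data_store is an already opened ESA CCI ODP data store; the local name, time range, region, and variable names are illustrative and use the string forms accepted by the *Like converters shown in these examples):

data_sources = data_store.query(query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1')
local_ds = data_sources[0].make_local('sm_1978_subset',
                                      time_range='1978-11-14,1978-11-16',
                                      region='10,20,30,40',
                                      var_names='sm,sm_uncertainty')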
Example No. 6
    def generate_uuid(cls, ref_id: str,
                      time_range: Optional[TimeRange] = None,
                      region: Optional[shapely.geometry.Polygon] = None,
                      var_names: Optional[VarNames] = None) -> str:

        if time_range:
            ref_id += TimeRangeLike.format(time_range)
        if region:
            ref_id += PolygonLike.format(region)
        if var_names:
            ref_id += VarNamesLike.format(var_names)

        return str(uuid.uuid3(_NAMESPACE, ref_id))
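The point of uuid.uuid3 here is determinism: the same reference id, time range, region, and variable names always hash to the same UUID, which is how make_local (Example No. 5) detects an already existing local copy. A standalone sketch of that property (the namespace value and the formatted strings below are placeholders, not the real cate constants):

import uuid

_NAMESPACE = uuid.UUID('12345678-1234-5678-1234-567812345678')  # placeholder namespace

key = ('esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1'
       + '1978-11-14T00:00:00, 1978-11-16T23:59:59'
       + 'sm, sm_uncertainty')
print(uuid.uuid3(_NAMESPACE, key) == uuid.uuid3(_NAMESPACE, key))  # True: same inputs, same UUID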
Example No. 7
    def generate_title(cls, title: str,
                       time_range: Optional[TimeRange] = None,
                       region: Optional[shapely.geometry.Polygon] = None,
                       var_names: Optional[VarNames] = None) -> str:

        if time_range:
            title += " [TimeRange:{}]".format(TimeRangeLike.format(time_range))
        if region:
            title += " [Region:{}]".format(PolygonLike.format(region))
        if var_names:
            title += " [Variables:{}]".format(VarNamesLike.format(var_names))

        return title
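A standalone illustration of the resulting title format (the title and the formatted values are made up):

title = "ESA CCI Soil Moisture"
title += " [TimeRange:{}]".format("1978-11-14, 1978-11-16")
title += " [Variables:{}]".format("sm, sm_uncertainty")
print(title)
# ESA CCI Soil Moisture [TimeRange:1978-11-14, 1978-11-16] [Variables:sm, sm_uncertainty]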
Example No. 8
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Create a 'mean over years' dataset by averaging the values of the given input
    dataset over all years. The output is a climatological dataset with the same
    resolution as the input dataset. E.g. a daily input dataset will create a daily
    climatology consisting of 365 days, a monthly input dataset will create a monthly
    climatology, etc.

    Seasonal input datasets must have matching seasons over all years denoted by the
    same date each year. E.g., first date of each quarter. The output dataset will
    then be a seasonal climatology where each season is denoted with the same date
    as in the input dataset.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValidationError('Long term average operation expects a dataset with the'
                              ' time coordinate of type datetime64[ns], but received'
                              ' {}. Running the normalize operation on this'
                              ' dataset may help'.format(ds.time.dtype))

    try:
        t_resolution = ds.attrs['time_coverage_resolution']
    except KeyError:
        raise ValidationError('Could not determine temporal resolution. Running'
                              ' the adjust_temporal_attrs operation beforehand may'
                              ' help.')

    var = VarNamesLike.convert(var)
    # Shallow
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    if t_resolution == 'P1D':
        return _lta_daily(retset, monitor)
    elif t_resolution == 'P1M':
        return _lta_monthly(retset, monitor)
    else:
        return _lta_general(retset, monitor)
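A hedged usage sketch (the import path is an assumption; ds stands for a monthly dataset whose time coordinate is datetime64[ns] and whose attrs carry time_coverage_resolution == 'P1M', e.g. after running adjust_temporal_attrs):

from cate.ops.aggregate import long_term_average  # module path assumed

lta = long_term_average(ds, var='sm*')  # keep only 'sm*' variables and average them over all years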
Example No. 9
    def open_dataset(self,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     var_names: VarNamesLike.TYPE = None,
                     protocol: str = None,
                     monitor: Monitor = Monitor.NONE) -> Any:
        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        paths = []
        if time_range:
            time_series = list(self._files.values())
            file_paths = list(self._files.keys())
            for i in range(len(time_series)):
                if time_series[i]:
                    if isinstance(time_series[i], Tuple) and \
                            time_series[i][0] >= time_range[0] and \
                            time_series[i][1] <= time_range[1]:
                        paths.extend(self._resolve_file_path(file_paths[i]))
                    elif isinstance(time_series[i], datetime) and time_range[0] <= time_series[i] < time_range[1]:
                        paths.extend(self._resolve_file_path(file_paths[i]))
        else:
            for file in self._files.items():
                paths.extend(self._resolve_file_path(file[0]))

        if not paths:
            raise self._empty_error(time_range)

        paths = sorted(set(paths))
        try:
            excluded_variables = self._meta_info.get('exclude_variables')
            if excluded_variables:
                drop_variables = [variable.get('name') for variable in excluded_variables]
            else:
                drop_variables = None
            # TODO: combine var_names and drop_variables
            return open_xarray_dataset(paths,
                                       region=region,
                                       var_names=var_names,
                                       drop_variables=drop_variables,
                                       monitor=monitor)
        except HTTPError as e:
            raise self._cannot_access_error(time_range, region, var_names,
                                            verb="open", cause=e) from e
        except (URLError, socket.timeout) as e:
            raise self._cannot_access_error(time_range, region, var_names,
                                            verb="open", cause=e, error_cls=NetworkError) from e
        except OSError as e:
            raise self._cannot_access_error(time_range, region, var_names,
                                            verb="open", cause=e) from e
Example No. 10
    def __init__(self,
                 ds_id: str,
                 files: Union[Sequence[str], OrderedDict],
                 data_store: 'LocalDataStore',
                 temporal_coverage: TimeRangeLike.TYPE = None,
                 spatial_coverage: PolygonLike.TYPE = None,
                 variables: VarNamesLike.TYPE = None,
                 meta_info: dict = None,
                 status: DataSourceStatus = None):
        self._id = ds_id
        if isinstance(files, Sequence):
            self._files = OrderedDict.fromkeys(files)
        else:
            self._files = files
        self._data_store = data_store

        initial_temporal_coverage = TimeRangeLike.convert(temporal_coverage) if temporal_coverage else None
        if not initial_temporal_coverage:
            files_number = len(self._files.items())
            if files_number > 0:
                files_range = list(self._files.values())
                if files_range:
                    if isinstance(files_range[0], Tuple):
                        initial_temporal_coverage = TimeRangeLike.convert(tuple([files_range[0][0],
                                                                                 files_range[files_number - 1][1]]))
                    elif isinstance(files_range[0], datetime):
                        initial_temporal_coverage = TimeRangeLike.convert((files_range[0],
                                                                           files_range[files_number - 1]))

        self._temporal_coverage = initial_temporal_coverage
        self._spatial_coverage = PolygonLike.convert(spatial_coverage) if spatial_coverage else None
        self._variables = VarNamesLike.convert(variables) if variables else []

        self._meta_info = meta_info if meta_info else OrderedDict()

        if self._variables and not self._meta_info.get('variables', None):
            self._meta_info['variables'] = [
                {'name': var_name,
                 'units': '',
                 'long_name': '',
                 'standard_name': ''
                 } for var_name in self._variables]

        self._status = status if status else DataSourceStatus.READY
Example No. 11
def reduce(ds: DatasetLike.TYPE,
           var: VarNamesLike.TYPE = None,
           dim: DimNamesLike.TYPE = None,
           method: str = 'mean',
           monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Reduce the given variables of the given dataset along the given dimensions.
    If no variables are given, all variables of the dataset will be reduced. If
    no dimensions are given, all dimensions will be reduced. When no variables are
    given explicitly, the reduction can be restricted to variables with numeric
    values only.

    :param ds: Dataset to reduce
    :param var: Variables in the dataset to reduce
    :param dim: Dataset dimensions along which to reduce
    :param method: reduction method
    :param monitor: A progress monitor
    """
    ufuncs = {'min': np.nanmin, 'max': np.nanmax, 'mean': np.nanmean,
              'median': np.nanmedian, 'sum': np.nansum}

    ds = DatasetLike.convert(ds)

    if not var:
        var = list(ds.data_vars.keys())
    var_names = VarNamesLike.convert(var)

    if not dim:
        dim = list(ds.coords.keys())
    else:
        dim = DimNamesLike.convert(dim)

    retset = ds.copy()

    for var_name in var_names:
        intersection = [value for value in dim if value in retset[var_name].dims]
        with monitor.starting("Reduce dataset", total_work=100):
            monitor.progress(5)
            with monitor.child(95).observing("Reduce"):
                retset[var_name] = retset[var_name].reduce(ufuncs[method],
                                                           dim=intersection,
                                                           keep_attrs=True)

    return retset
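A minimal usage sketch, assuming the reduce operation above is in scope (its module is not shown here), on a small synthetic dataset:

import numpy as np
import xarray as xr

ds = xr.Dataset({'sm': (('time', 'lat', 'lon'), np.random.rand(4, 3, 2))},
                coords={'time': np.arange(4), 'lat': [0., 1., 2.], 'lon': [10., 20.]})
# Collapse the 'time' dimension of 'sm' with a nan-aware mean
averaged = reduce(ds, var='sm', dim='time', method='mean')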
Example No. 12
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset on which to perform outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wildcards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less than or equal to this will be removed/masked
    :param threshold_high: Values greater than or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Compute per-variable threshold values without overwriting
                # the user-supplied quantile levels
                with monitor.child(1).observing("quantile low"):
                    low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    high = ret_ds[var_name].quantile(threshold_high)
            else:
                low, high = threshold_low, threshold_high
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > low) & (arr < high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this data
                # variable
                _mask_outliers(ret_ds, var_name, low, high)
            monitor.progress(1)

    return ret_ds
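A hedged usage sketch (the import path is an assumption; ds stands for any xarray dataset containing variables that match 'sm*'):

from cate.ops.outliers import detect_outliers  # module path assumed

# Replace values outside the 5%-95% quantile range of every 'sm*' variable with nan
cleaned = detect_outliers(ds, var='sm*',
                          threshold_low=0.05, threshold_high=0.95,
                          quantiles=True, mask=False)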
Example No. 13
 def test_format(self):
     self.assertEqual(VarNamesLike.format(['aa', 'bb', 'cc']), "aa, bb, cc")
     self.assertEqual(VarNamesLike.format(['aa']), "aa")
     self.assertEqual(VarNamesLike.format([]), "")
     self.assertEqual(VarNamesLike.format(None), "")
Example No. 14
 def test_format(self):
     actual = VarNamesLike.format(HTML('abc'))
     self.assertIsInstance(actual, str)
     self.assertEqual(actual, 'abc')
Example No. 15
 def test_format(self):
     self.assertEqual(VarNamesLike.format(['aa', 'bb', 'cc']),
                      "['aa', 'bb', 'cc']")
Example No. 16
def plot_line(ds: DatasetLike.TYPE,
              var_names: VarNamesLike.TYPE,
              fmt: str = None,
              label: DimName.TYPE = None,
              indexers: DictLike.TYPE = None,
              title: str = None,
              file: str = None) -> Figure:
    """
    Create a 1D/line plot of variable(s) given by dataset *ds* and variable name(s) *var_names*.

    :param ds: Dataset or Dataframe that contains the variable(s) named by *var_names*.
    :param var_names: The name of the variable(s) to plot
    :param fmt: optional semicolon-separated matplotlib formats,
           e.g.
           1 variable - "b.-"
           2 variables - "b.-;r+:"
           If fewer formats than selected variables are given, the format list is cycled, so each
           remaining variable reuses a format from the start of the list, and so on.
           For full reference on matplotlib plot() function, refer to
           https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
    :param file: path to a file in which to save the plot
    :param label: dimension name to be selected as the x-axis of the plot
    :param indexers: Optional indexers into the data arrays of *var_names*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    fmt_count = 0
    fmt_list = []

    if fmt:
        fmt_list = fmt.split(";")
        fmt_count = len(fmt_list)

    if not var_names:
        raise ValidationError("Missing name for 'vars'")

    figure = plt.figure()
    ax = figure.add_subplot(111)
    figure.subplots_adjust(right=0.65)

    var_names = VarNamesLike.convert(var_names)
    if not title:
        if label:
            title = ','.join(var_names) + ' over ' + label
        else:
            title = ','.join(var_names)
    if indexers:
        title = title + '\n' + ' at ' + json.dumps(indexers).strip('"')
    ax.set_title(title)

    indexers = DictLike.convert(indexers)

    ax_var = {}
    var_count = len(var_names)
    predefined_fmt = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    if label:
        ds = get_vars_data(ds, indexers, remaining_dims=[label])
    else:
        ds = get_vars_data(ds, indexers)

    for i in range(var_count):
        var_name = var_names[i]
        var = ds[var_name]
        if len(var.dims) > 1:
            raise ValidationError(f'Unable to plot because variable {var_name} has more than one dimension: {var.dims}.'
                                  f' To specify value(s) of these dimension(s), please use the indexers.')

        var_label = var_name + ' (' + var.attrs['units'] + ')' if 'units' in var.attrs else var_name
        properties_dict = {}

        indexers = DictLike.convert(indexers)

        if fmt is None:
            selected_fmt = predefined_fmt[i % len(predefined_fmt)]
        else:
            selected_fmt = fmt_list[i % fmt_count]

        if label:
            x_axis = var[label]
        elif 'time' in var:
            x_axis = var.time
        else:
            x_axis = []
        # to differentiate the creation of y-axis of the first and the nth variable
        if i == 0:
            if len(x_axis) > 0:
                ax.plot(x_axis, var, selected_fmt, **properties_dict)
            else:
                ax.plot(var, selected_fmt, **properties_dict)
            ax.set_ylabel(var_label, wrap=True)
            ax.yaxis.label.set_color(selected_fmt[0])
            ax.tick_params(axis='y', colors=selected_fmt[0])
        else:
            ax_var[var_name] = ax.twinx()
            if len(ax_var) > 1:
                ax_var[var_name].spines["right"].set_position(("axes", 1 + ((i - 1) * 0.2)))
                ax_var[var_name].set_frame_on(True)
                ax_var[var_name].patch.set_visible(False)
            if len(x_axis) > 0:
                ax_var[var_name].plot(x_axis, var, selected_fmt, **properties_dict)
            else:
                ax_var[var_name].plot(var, selected_fmt, **properties_dict)
            ax_var[var_name].set_ylabel(var_label, wrap=True)
            ax_var[var_name].yaxis.label.set_color(selected_fmt[0])
            ax_var[var_name].tick_params(axis='y', colors=selected_fmt[0])

    ax.tick_params(axis='x', rotation=45)
    if label in ds and 'long_name' in ds[label].attrs:
        ax.set_xlabel(ds[label].attrs['long_name'])
    figure.tight_layout()

    if file:
        figure.savefig(file, dpi=600)

    return figure if not in_notebook() else None
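A hedged usage sketch (the import path is an assumption; ds stands for a dataset with 'sm' variables over time, lat, and lon; the indexers and output file are illustrative):

from cate.ops.plot import plot_line  # module path assumed

fig = plot_line(ds, var_names='sm,sm_uncertainty',
                fmt='b.-;r+:',                  # one matplotlib format per variable
                label='time',                   # dimension to use as the x-axis
                indexers="lat=12.4, lon=52.5",  # pin the remaining dimensions
                file='sm_line_plot.png')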
Example No. 17
    def test_make_local_and_update(self):

        soilmoisture_data_sources = self.data_store.query(
            query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1'
        )
        soilmoisture_data_source = soilmoisture_data_sources[0]

        reference_path = os.path.join(
            os.path.dirname(__file__),
            os.path.normpath('resources/datasources/local/files/'))

        def find_files_mock(_, time_range):
            def build_file_item(item_name: str, date_from: datetime,
                                date_to: datetime, size: int):

                return [
                    item_name, date_from, date_to, size, {
                        'OPENDAP':
                        os.path.join(reference_path, item_name),
                        'HTTPServer':
                        'file:' + urllib.request.pathname2url(
                            os.path.join(reference_path, item_name))
                    }
                ]

            reference_files = {
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc':
                {
                    'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc':
                {
                    'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc':
                {
                    'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                    'size': 21511378
                }
            }

            reference_files_list = []

            for reference_file in reference_files.items():
                file_name = reference_file[0]
                file_date_from = reference_file[1].get('date_from')
                file_date_to = reference_file[1].get('date_to')
                file_size = reference_file[1].get('size')
                if time_range:
                    if file_date_from >= time_range[0] and file_date_to <= time_range[1]:
                        reference_files_list.append(
                            build_file_item(file_name, file_date_from,
                                            file_date_to, file_size))
                else:
                    reference_files_list.append(
                        build_file_item(file_name, file_date_from,
                                        file_date_to, file_size))
            return reference_files_list

        with unittest.mock.patch(
                'cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files',
                find_files_mock):
            with unittest.mock.patch.object(EsaCciOdpDataStore,
                                            'query',
                                            return_value=[]):

                new_ds_title = 'local_ds_test'
                new_ds_time_range = TimeRangeLike.convert(
                    (datetime.datetime(1978, 11, 14, 0, 0),
                     datetime.datetime(1978, 11, 16, 23, 59)))
                try:
                    new_ds = soilmoisture_data_source.make_local(
                        new_ds_title, time_range=new_ds_time_range)
                except Exception:
                    raise ValueError(reference_path,
                                     os.listdir(reference_path))
                self.assertIsNotNone(new_ds)

                self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
                self.assertEqual(new_ds.temporal_coverage(), new_ds_time_range)

                new_ds_w_one_variable_title = 'local_ds_test_var'
                new_ds_w_one_variable_time_range = TimeRangeLike.convert(
                    (datetime.datetime(1978, 11, 14, 0, 0),
                     datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_one_variable_var_names = VarNamesLike.convert(['sm'])

                new_ds_w_one_variable = soilmoisture_data_source.make_local(
                    new_ds_w_one_variable_title,
                    time_range=new_ds_w_one_variable_time_range,
                    var_names=new_ds_w_one_variable_var_names)
                self.assertIsNotNone(new_ds_w_one_variable)

                self.assertEqual(new_ds_w_one_variable.id,
                                 "local.%s" % new_ds_w_one_variable_title)
                ds = new_ds_w_one_variable.open_dataset()

                new_ds_w_one_variable_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(ds.variables),
                                    set(new_ds_w_one_variable_var_names))

                new_ds_w_region_title = 'from_local_to_local_region'
                new_ds_w_region_time_range = TimeRangeLike.convert(
                    (datetime.datetime(1978, 11, 14, 0, 0),
                     datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_spatial_coverage = PolygonLike.convert(
                    "10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    region=new_ds_w_region_spatial_coverage
                )  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id,
                                 "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(),
                                 new_ds_w_region_spatial_coverage)

                new_ds_w_region_title = 'from_local_to_local_region_one_var'
                new_ds_w_region_time_range = TimeRangeLike.convert(
                    (datetime.datetime(1978, 11, 14, 0, 0),
                     datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert(
                    "10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage
                )  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id,
                                 "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(),
                                 new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables),
                                    set(new_ds_w_region_var_names))

                new_ds_w_region_title = 'from_local_to_local_region_two_var_sm_uncertainty'
                new_ds_w_region_time_range = TimeRangeLike.convert(
                    (datetime.datetime(1978, 11, 14, 0, 0),
                     datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(
                    ['sm', 'sm_uncertainty'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert(
                    "10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage
                )  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id,
                                 "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(),
                                 new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables),
                                    set(new_ds_w_region_var_names))

                empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0),
                                      datetime.datetime(2017, 12, 31, 23, 59))
                with self.assertRaises(DataAccessError) as cm:
                    soilmoisture_data_source.make_local(
                        'empty_ds', time_range=empty_ds_timerange)
                self.assertEqual(
                    "Open Data Portal's data source '{}' does not seem to have any data sets in given "
                    "time range {}".format(
                        soilmoisture_data_source.id,
                        TimeRangeLike.format(empty_ds_timerange)),
                    str(cm.exception))

                new_ds_time_range = TimeRangeLike.convert(
                    (datetime.datetime(1978, 11, 14, 0, 0),
                     datetime.datetime(1978, 11, 14, 23, 59)))

                new_ds = soilmoisture_data_source.make_local(
                    "title_test_copy", time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'],
                                 soilmoisture_data_source.meta_info['title'])

                title = "Title Test!"
                new_ds = soilmoisture_data_source.make_local(
                    "title_test_set", title, time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], title)
Example No. 18
def detect_outliers(ds: xr.Dataset,
                    var: VarNamesLike.TYPE,
                    threshold_low: float = 0.05,
                    threshold_high: float = 0.95,
                    quantiles: bool = True,
                    mask: bool = False,
                    monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Detect outliers in the given Dataset.

    When mask=True the input dataset should not contain nan values, otherwise
    all existing nan values will be marked as 'outliers' in the mask data array
    added to the output dataset.

    :param ds: The dataset or dataframe for which to do outlier detection
    :param var: Variable or variables in the dataset on which to perform outlier
    detection. Note that when multiple variables are selected, absolute
    threshold values might not make much sense. Wildcards can be used to
    select multiple variables matching a pattern.
    :param threshold_low: Values less than or equal to this will be removed/masked
    :param threshold_high: Values greater than or equal to this will be removed/masked
    :param quantiles: If True, threshold values are treated as quantiles,
    otherwise as absolute values.
    :param mask: If True, an ancillary variable containing flag values for
    outliers will be added to the dataset. Otherwise, outliers will be replaced
    with nan directly in the data variables.
    :param monitor: A progress monitor.
    :return: The dataset with outliers masked or replaced with nan
    """
    ds = DatasetLike.convert(ds)
    # Create a list of variable names on which to perform outlier detection
    # based on the input comma separated list that can contain wildcards
    var_patterns = VarNamesLike.convert(var)
    all_vars = list(ds.data_vars.keys())
    variables = list()
    for pattern in var_patterns:
        leave = fnmatch.filter(all_vars, pattern)
        variables = variables + leave

    # For each array in the dataset for which we should detect outliers, detect
    # outliers
    ret_ds = ds.copy()
    with monitor.starting("detect_outliers", total_work=len(variables) * 3):
        for var_name in variables:
            if quantiles:
                # Compute per-variable threshold values without overwriting
                # the user-supplied quantile levels
                with monitor.child(1).observing("quantile low"):
                    low = ret_ds[var_name].quantile(threshold_low)
                with monitor.child(1).observing("quantile high"):
                    high = ret_ds[var_name].quantile(threshold_high)
            else:
                low, high = threshold_low, threshold_high
                monitor.progress(2)
            # If not mask, put nans in the data arrays for min/max outliers
            if not mask:
                arr = ret_ds[var_name]
                attrs = arr.attrs
                ret_ds[var_name] = arr.where((arr > low) & (arr < high))
                ret_ds[var_name].attrs = attrs
            else:
                # Create and add a data variable containing the mask for this data
                # variable
                _mask_outliers(ret_ds, var_name, low, high)
            monitor.progress(1)

    return ret_ds
Example No. 19
def plot_line(ds: DatasetLike.TYPE,
              var_names: VarNamesLike.TYPE,
              fmt: str = None,
              label: DimName.TYPE = None,
              indexers: DictLike.TYPE = None,
              title: str = None,
              file: str = None) -> Figure:
    """
    Create a 1D/line plot of variable(s) given by dataset *ds* and variable name(s) *var_names*.

    :param ds: Dataset or Dataframe that contains the variable(s) named by *var_names*.
    :param var_names: The name of the variable(s) to plot
    :param fmt: optional semicolon-separated matplotlib formats,
           e.g.
           1 variable - "b.-"
           2 variables - "b.-;r+:"
           If fewer formats than selected variables are given, the format list is cycled, so each
           remaining variable reuses a format from the start of the list, and so on.
           For full reference on matplotlib plot() function, refer to
           https://matplotlib.org/api/_as_gen/matplotlib.pyplot.plot.html
    :param file: path to a file in which to save the plot
    :param label: dimension name to be selected as the x-axis of the plot
    :param indexers: Optional indexers into the data arrays of *var_names*. *indexers* is a dictionary
           or a comma-separated string of key-value pairs that maps the variable's dimension names
           to constant labels. e.g. "lat=12.4, time='2012-05-02'".
    :param title: an optional plot title
    :return: a matplotlib figure object or None if in IPython mode
    """
    ds = DatasetLike.convert(ds)

    fmt_count = 0
    fmt_list = []

    if fmt:
        fmt_list = fmt.split(";")
        fmt_count = len(fmt_list)

    if not var_names:
        raise ValidationError("Missing name for 'vars'")

    figure = plt.figure()
    ax = figure.add_subplot(111)
    figure.subplots_adjust(right=0.65)

    var_names = VarNamesLike.convert(var_names)
    if not title:
        if label:
            title = ','.join(var_names) + ' over ' + label
        else:
            title = ','.join(var_names)
    if indexers:
        title = title + '\n' + ' at ' + json.dumps(indexers).strip('"')
    ax.set_title(title)

    indexers = DictLike.convert(indexers)

    ax_var = {}
    var_count = len(var_names)
    predefined_fmt = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    if label:
        ds = get_vars_data(ds, indexers, remaining_dims=[label])
    else:
        ds = get_vars_data(ds, indexers)

    for i in range(var_count):
        var_name = var_names[i]
        var = ds[var_name]
        if len(var.dims) > 1:
            raise ValidationError(
                f'Unable to plot because variable {var_name} has more than one dimension: {var.dims}.'
                f' To specify value(s) of these dimension(s), please use the indexers.'
            )

        var_label = (var_name + ' (' + var.attrs['units'] + ')'
                     if 'units' in var.attrs else var_name)
        properties_dict = {}

        indexers = DictLike.convert(indexers)

        if fmt is None:
            selected_fmt = predefined_fmt[i % len(predefined_fmt)]
        else:
            selected_fmt = fmt_list[i % fmt_count]

        if label:
            x_axis = var[label]
        elif 'time' in var:
            x_axis = var.time
        else:
            x_axis = []
        # to differentiate the creation of y-axis of the first and the nth variable
        if i == 0:
            if len(x_axis) > 0:
                ax.plot(x_axis, var, selected_fmt, **properties_dict)
            else:
                ax.plot(var, selected_fmt, **properties_dict)
            ax.set_ylabel(var_label, wrap=True)
            ax.yaxis.label.set_color(selected_fmt[0])
            ax.tick_params(axis='y', colors=selected_fmt[0])
        else:
            ax_var[var_name] = ax.twinx()
            if len(ax_var) > 1:
                ax_var[var_name].spines["right"].set_position(
                    ("axes", 1 + ((i - 1) * 0.2)))
                ax_var[var_name].set_frame_on(True)
                ax_var[var_name].patch.set_visible(False)
            if len(x_axis) > 0:
                ax_var[var_name].plot(x_axis, var, selected_fmt,
                                      **properties_dict)
            else:
                ax_var[var_name].plot(var, selected_fmt, **properties_dict)
            ax_var[var_name].set_ylabel(var_label, wrap=True)
            ax_var[var_name].yaxis.label.set_color(selected_fmt[0])
            ax_var[var_name].tick_params(axis='y', colors=selected_fmt[0])

    ax.tick_params(axis='x', rotation=45)
    if label in ds and 'long_name' in ds[label].attrs:
        ax.set_xlabel(ds[label].attrs['long_name'])
    figure.tight_layout()

    if file:
        figure.savefig(file, dpi=600)

    return figure if not in_notebook() else None
Example No. 20
    def open_dataset(self,
                     time_range: TimeRangeLike.TYPE = None,
                     region: PolygonLike.TYPE = None,
                     var_names: VarNamesLike.TYPE = None,
                     protocol: str = None,
                     monitor: Monitor = Monitor.NONE) -> Any:
        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        paths = []
        if time_range:
            time_series = list(self._files.values())
            file_paths = list(self._files.keys())
            for i in range(len(time_series)):
                if time_series[i]:
                    if isinstance(time_series[i], Tuple) and \
                            time_series[i][0] >= time_range[0] and \
                            time_series[i][1] <= time_range[1]:
                        paths.extend(self._resolve_file_path(file_paths[i]))
                    elif isinstance(time_series[i], datetime) and \
                            time_range[0] <= time_series[i] < time_range[1]:
                        paths.extend(self._resolve_file_path(file_paths[i]))
        else:
            for file in self._files.items():
                paths.extend(self._resolve_file_path(file[0]))

        if not paths:
            raise self._empty_error(time_range)

        paths = sorted(set(paths))
        try:
            excluded_variables = self._meta_info.get('exclude_variables')
            if excluded_variables:
                drop_variables = [
                    variable.get('name') for variable in excluded_variables
                ]
            else:
                drop_variables = None
            # TODO: combine var_names and drop_variables
            return open_xarray_dataset(paths,
                                       region=region,
                                       var_names=var_names,
                                       drop_variables=drop_variables,
                                       monitor=monitor)
        except HTTPError as e:
            raise self._cannot_access_error(time_range,
                                            region,
                                            var_names,
                                            verb="open",
                                            cause=e) from e
        except (URLError, socket.timeout) as e:
            raise self._cannot_access_error(time_range,
                                            region,
                                            var_names,
                                            verb="open",
                                            cause=e,
                                            error_cls=NetworkError) from e
        except OSError as e:
            raise self._cannot_access_error(time_range,
                                            region,
                                            var_names,
                                            verb="open",
                                            cause=e) from e
Example No. 21
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({'zlib': True, 'complevel': compression_level})

        local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path, local_relative_filepath)

            remote_absolute_filepath = os.path.join(self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or time_coverage_start >= time_range[0] and time_coverage_end <= time_range[1]:
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        remote_dataset = None
                        try:
                            remote_dataset = xr.open_dataset(remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop(
                                    [var_name for var_name in remote_dataset.data_vars.keys()
                                     if var_name not in var_names])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = adjust_spatial_attrs_impl(subset_spatial_impl(remote_dataset, region),
                                                                           allow_point=False)

                                if do_update_of_region_meta_info_once:
                                    # subset_spatial_impl
                                    local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                    local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                    local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                    local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once and remote_dataset is not None:
                                variables_info = local_ds.meta_info.get('variables', [])
                                local_ds.meta_info['variables'] = [var_info for var_info in variables_info
                                                                   if var_info.get('name')
                                                                   in remote_dataset.variables.keys()
                                                                   and var_info.get('name')
                                                                   not in remote_dataset.dims.keys()]
                                # noinspection PyUnusedLocal
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(os.path.join(local_id, file_name),
                                                 (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath, local_absolute_filepath)
                        local_ds.add_dataset(local_relative_filepath, (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
Example No. 22
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        var_names = VarNamesLike.convert(var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = True if compression_level > 0 else False

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                if not time_range or time_coverage_start >= time_range[0] and time_coverage_end <= time_range[1]:
                    if region or var_names:

                        do_update_of_variables_meta_info_once = True
                        do_update_of_region_meta_info_once = True

                        remote_dataset = None
                        try:
                            remote_dataset = xr.open_dataset(
                                remote_absolute_filepath)

                            if var_names:
                                remote_dataset = remote_dataset.drop([
                                    var_name for var_name in
                                    remote_dataset.data_vars.keys()
                                    if var_name not in var_names
                                ])

                            if region:
                                remote_dataset = normalize_impl(remote_dataset)
                                remote_dataset = adjust_spatial_attrs_impl(
                                    subset_spatial_impl(
                                        remote_dataset, region),
                                    allow_point=False)

                                if do_update_of_region_meta_info_once:
                                    # subset_spatial_impl
                                    local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                                    local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                                    local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                                    local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                                    do_update_of_region_meta_info_once = False

                            if compression_enabled:
                                for sel_var_name in remote_dataset.variables.keys():
                                    remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                            remote_dataset.to_netcdf(local_absolute_filepath)

                            child_monitor.progress(
                                work=1, msg=str(time_coverage_start))
                        finally:
                            if do_update_of_variables_meta_info_once and remote_dataset is not None:
                                variables_info = local_ds.meta_info.get('variables', [])
                                local_ds.meta_info['variables'] = [
                                    var_info for var_info in variables_info
                                    if var_info.get('name') in remote_dataset.variables.keys()
                                    and var_info.get('name') not in remote_dataset.dims.keys()
                                ]
                                # noinspection PyUnusedLocal
                                do_update_of_variables_meta_info_once = False

                            local_ds.add_dataset(
                                os.path.join(local_id, file_name),
                                (time_coverage_start, time_coverage_end))

                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
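
The OPeNDAP branch above reduces to a simple pattern: open the remote dataset, drop the variables that were not requested, subset spatially, attach compression encodings, and write a local NetCDF copy. A minimal standalone sketch of that pattern follows; the URL, variable selection, bounding box and output file are placeholders, and a plain xarray sel() stands in for cate's subset_spatial_impl:

import xarray as xr

REMOTE_URI = 'http://example.com/thredds/dodsC/some_dataset.nc'  # placeholder OPeNDAP endpoint
KEEP_VARS = {'sm', 'sm_uncertainty'}                             # placeholder variable selection

remote_dataset = xr.open_dataset(REMOTE_URI)

# Drop all data variables that were not requested; coordinates are kept.
# (Newer xarray spells this drop_vars; the code above uses the older drop.)
remote_dataset = remote_dataset.drop(
    [var_name for var_name in remote_dataset.data_vars.keys()
     if var_name not in KEEP_VARS])

# Spatial subset by coordinate labels (placeholder bounds).
remote_dataset = remote_dataset.sel(lon=slice(10, 20), lat=slice(10, 20))

# Attach per-variable compression encodings before writing, as done above.
for variable in remote_dataset.variables.values():
    variable.encoding.update({'zlib': True, 'complevel': 4})

remote_dataset.to_netcdf('local_copy.nc')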
Exemplo n.º 25
0
File: io.py Project: pwambach/cate
def write_csv(obj: DataFrameLike.TYPE,
              file: FileLike.TYPE,
              columns: VarNamesLike.TYPE = None,
              na_rep: str = '',
              delimiter: str = ',',
              quotechar: str = None,
              more_args: DictLike.TYPE = None,
              monitor: Monitor = Monitor.NONE):
    """
    Write comma-separated values (CSV) to plain text file from a DataFrame or Dataset.

    :param obj: The object to write as CSV; must be a ``DataFrame`` or a ``Dataset``.
    :param file: The CSV file path.
    :param columns: The names of variables that should be converted to columns. If given,
           coordinate variables are included automatically.
    :param delimiter: Delimiter to use.
    :param na_rep: A string representation of a missing value (no-data value).
    :param quotechar: The character used to denote the start and end of a quoted item.
           Quoted items can include the delimiter and it will be ignored.
    :param more_args: Other optional keyword arguments.
           Please refer to Pandas documentation of ``pandas.to_csv()`` function.
    :param monitor: optional progress monitor
    """
    if obj is None:
        raise ValidationError('obj must not be None')

    columns = VarNamesLike.convert(columns)

    if isinstance(obj, pd.DataFrame):
        # The following code is needed, because Pandas treats any kw given in kwargs as being set, even if just None.
        kwargs = DictLike.convert(more_args)
        if kwargs is None:
            kwargs = {}
        if columns:
            kwargs.update(columns=columns)
        if delimiter:
            kwargs.update(sep=delimiter)
        if na_rep:
            kwargs.update(na_rep=na_rep)
        if quotechar:
            kwargs.update(quotechar=quotechar)
        with monitor.starting('Writing to CSV', 1):
            obj.to_csv(file, index_label='index', **kwargs)
            monitor.progress(1)
    elif isinstance(obj, xr.Dataset):
        var_names = [
            var_name for var_name in obj.data_vars
            if columns is None or var_name in columns
        ]
        dim_names = None
        data_vars = []
        for var_name in var_names:
            data_var = obj.data_vars[var_name]
            if dim_names is None:
                dim_names = data_var.dims
            elif dim_names != data_var.dims:
                raise ValidationError(
                    'Not all variables have the same dimensions. '
                    'Please select variables so that their dimensions are equal.'
                )
            data_vars.append(data_var)
        if dim_names is None:
            raise ValidationError(
                'None of the selected variables has a dimension.')

        coord_vars = []
        for dim_name in dim_names:
            if dim_name in obj.coords:
                coord_var = obj.coords[dim_name]
            else:
                coord_var = None
                for data_var in obj.coords.values():
                    if len(data_var.dims) == 1 and data_var.dims[0] == dim_name:
                        coord_var = data_var
                        break
                if coord_var is None:
                    raise ValueError(
                        f'No coordinate variable found for dimension "{dim_name}"'
                    )
            coord_vars.append(coord_var)
        coord_indexes = [range(len(coord_var)) for coord_var in coord_vars]
        num_coords = len(coord_vars)

        num_rows = 1
        for coord_var in coord_vars:
            num_rows *= len(coord_var)

        stream = open(file, 'w') if isinstance(file, str) else file
        try:
            # Write header row
            stream.write('index')
            for i in range(num_coords):
                stream.write(delimiter)
                stream.write(coord_vars[i].name)
            for data_var in data_vars:
                stream.write(delimiter)
                stream.write(data_var.name)
            stream.write('\n')

            with monitor.starting('Writing CSV', num_rows):
                row = 0
                for index in itertools.product(*coord_indexes):
                    # Write data row
                    stream.write(str(row))
                    for i in range(num_coords):
                        coord_value = coord_vars[i].values[index[i]]
                        stream.write(delimiter)
                        stream.write(str(coord_value))
                    for data_var in data_vars:
                        var_value = data_var.values[index]
                        stream.write(delimiter)
                        stream.write(str(var_value))
                    stream.write('\n')
                    monitor.progress(1)
                    row += 1
        finally:
            if isinstance(file, str):
                stream.close()

    else:
        raise ValidationError(
            'obj must be a pandas.DataFrame or a xarray.Dataset')
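
A short usage sketch for write_csv; the file names and the tiny DataFrame/Dataset below are illustrative only, and the keyword defaults (Monitor.NONE, comma delimiter) are relied upon:

import numpy as np
import pandas as pd
import xarray as xr

# DataFrame case: selected columns and the delimiter are forwarded to pandas.to_csv().
df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]})
write_csv(df, 'frame.csv', columns=['a', 'b'], delimiter=';')

# Dataset case: all selected variables must share the same dimensions,
# and a coordinate variable must exist for each of those dimensions.
ds = xr.Dataset({'tas': (('time',), np.array([280.0, 281.5, 279.9]))},
                coords={'time': pd.date_range('2000-01-01', periods=3)})
write_csv(ds, 'dataset.csv', columns=['tas'])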
Exemplo n.º 26
0
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id
        time_range = TimeRangeLike.convert(time_range)
        var_names = VarNamesLike.convert(var_names)

        excluded_variables = get_exclude_variables_fix_known_issues(self.id)

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL', NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        do_update_of_verified_time_coverage_start_once = True
        verified_time_coverage_start = None
        verified_time_coverage_end = None

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({'zlib': True, 'complevel': compression_level})

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path, local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)
        if not selected_file_list:
            msg = 'CCI Open Data Portal data source "{}"\ndoes not seem to have any datasets'.format(self.id)
            if time_range is not None:
                msg += ' in given time range {}'.format(TimeRangeLike.format(time_range))
            raise DataAccessError(msg)
        try:
            if protocol == _ODP_PROTOCOL_OPENDAP:

                do_update_of_variables_meta_info_once = True
                do_update_of_region_meta_info_once = True

                files = self._get_urls_list(selected_file_list, protocol)
                monitor.start('Sync ' + self.id, total_work=len(files))
                for idx, dataset_uri in enumerate(files):
                    child_monitor = monitor.child(work=1)

                    file_name = os.path.basename(dataset_uri)
                    local_filepath = os.path.join(local_path, file_name)

                    time_coverage_start = selected_file_list[idx][1]
                    time_coverage_end = selected_file_list[idx][2]

                    child_monitor.start(label=file_name, total_work=1)

                    remote_dataset = xr.open_dataset(dataset_uri, drop_variables=[variable.get('name') for variable in
                                                                                  excluded_variables])
                    if var_names:
                        remote_dataset = remote_dataset.drop([var_name for var_name in remote_dataset.data_vars.keys()
                                                              if var_name not in var_names])

                    if region:
                        remote_dataset = normalize_impl(remote_dataset)
                        remote_dataset = adjust_spatial_attrs_impl(subset_spatial_impl(remote_dataset, region),
                                                                   allow_point=False)

                        if do_update_of_region_meta_info_once:
                            local_ds.meta_info['bbox_minx'] = remote_dataset.attrs['geospatial_lon_min']
                            local_ds.meta_info['bbox_maxx'] = remote_dataset.attrs['geospatial_lon_max']
                            local_ds.meta_info['bbox_maxy'] = remote_dataset.attrs['geospatial_lat_max']
                            local_ds.meta_info['bbox_miny'] = remote_dataset.attrs['geospatial_lat_min']
                            do_update_of_region_meta_info_once = False

                    if compression_enabled:
                        for sel_var_name in remote_dataset.variables.keys():
                            remote_dataset.variables.get(sel_var_name).encoding.update(encoding_update)

                    remote_dataset.to_netcdf(local_filepath)

                    child_monitor.progress(work=1, msg=str(time_coverage_start))

                    if do_update_of_variables_meta_info_once:
                        variables_info = local_ds.meta_info.get('variables', [])
                        local_ds.meta_info['variables'] = [var_info for var_info in variables_info
                                                           if var_info.get('name')
                                                           in remote_dataset.variables.keys() and
                                                           var_info.get('name')
                                                           not in remote_dataset.dims.keys()]
                        do_update_of_variables_meta_info_once = False

                    local_ds.add_dataset(os.path.join(local_id, file_name),
                                         (time_coverage_start, time_coverage_end))

                    if do_update_of_verified_time_coverage_start_once:
                        verified_time_coverage_start = time_coverage_start
                        do_update_of_verified_time_coverage_start_once = False
                    verified_time_coverage_end = time_coverage_end
                    child_monitor.done()
            else:
                outdated_file_list = []
                for file_rec in selected_file_list:
                    filename, _, _, file_size, url = file_rec
                    dataset_file = os.path.join(local_path, filename)
                    # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                    # ... outdated or incomplete or corrupted.
                    # JSON also includes "checksum" and "checksum_type" fields.
                    if not os.path.isfile(dataset_file) or (file_size and os.path.getsize(dataset_file) != file_size):
                        outdated_file_list.append(file_rec)

                if outdated_file_list:
                    with monitor.starting('Sync ' + self.id, len(outdated_file_list)):
                        bytes_to_download = sum([file_rec[3] for file_rec in outdated_file_list])
                        dl_stat = _DownloadStatistics(bytes_to_download)

                        file_number = 1

                        for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                            dataset_file = os.path.join(local_path, filename)
                            sub_monitor = monitor.child(work=1.0)

                            # noinspection PyUnusedLocal
                            def reporthook(block_number, read_size, total_file_size):
                                dl_stat.handle_chunk(read_size)
                                sub_monitor.progress(work=read_size, msg=str(dl_stat))

                            sub_monitor_msg = "file %d of %d" % (file_number, len(outdated_file_list))
                            with sub_monitor.starting(sub_monitor_msg, file_size):
                                urllib.request.urlretrieve(url[protocol], filename=dataset_file, reporthook=reporthook)
                            file_number += 1
                            local_ds.add_dataset(os.path.join(local_id, filename), (coverage_from, coverage_to))

                            if do_update_of_verified_time_coverage_start_once:
                                verified_time_coverage_start = coverage_from
                                do_update_of_verified_time_coverage_start_once = False
                            verified_time_coverage_end = coverage_to
        except (OSError, ValueError) as e:
            raise DataAccessError("Copying remote data source failed: {}".format(e), source=self) from e
        local_ds.meta_info['temporal_coverage_start'] = TimeLike.format(verified_time_coverage_start)
        local_ds.meta_info['temporal_coverage_end'] = TimeLike.format(verified_time_coverage_end)
        local_ds.meta_info['exclude_variables'] = excluded_variables
        local_ds.save(True)
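
The NETCDF_COMPRESSION_LEVEL handling above boils down to attaching a zlib/complevel encoding to every variable before writing. A self-contained sketch of just that step (the level and file name are placeholders; the real code reads the level from cate's configuration via get_config_value):

import numpy as np
import xarray as xr

compression_level = 5   # placeholder; the method above uses get_config_value(...)
compression_enabled = compression_level > 0
encoding_update = {'zlib': True, 'complevel': compression_level} if compression_enabled else {}

ds = xr.Dataset({'sm': (('lat', 'lon'), np.random.rand(180, 360))})

if compression_enabled:
    for variable in ds.variables.values():
        variable.encoding.update(encoding_update)

# Encodings attached to the variables are honoured when the NetCDF file is written.
ds.to_netcdf('compressed.nc')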
Exemplo n.º 27
0
    def test_make_local(self):
        data_source = self._local_data_store.query('local_w_temporal')[0]

        with unittest.mock.patch.object(EsaCciOdpDataStore,
                                        'query',
                                        return_value=[]):
            new_ds_title = 'from_local_to_local'
            new_ds_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 15, 23, 59)))
            new_ds = data_source.make_local(new_ds_title,
                                            time_range=new_ds_time_range)
            self.assertIsNotNone(new_ds)

            self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
            self.assertEqual(
                new_ds.temporal_coverage(),
                TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                       datetime.datetime(1978, 11, 15, 23, 59))))

            new_ds_2_title = 'from_local_to_local_var'
            new_ds_2_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 15, 23, 59)))
            new_ds_2_vars = VarNamesLike.convert(['sm'])

            new_ds_w_one_variable = data_source.make_local(
                new_ds_2_title,
                time_range=new_ds_2_time_range,
                var_names=new_ds_2_vars)
            self.assertIsNotNone(new_ds_w_one_variable)
            self.assertEqual(new_ds_w_one_variable.id,
                             "local.%s" % new_ds_2_title)
            data_set = new_ds_w_one_variable.open_dataset()
            self.assertSetEqual(set(data_set.variables),
                                {'sm', 'lat', 'lon', 'time'})

            new_ds_3_title = 'from_local_to_local_range'
            new_ds_3_time_range = TimeRangeLike.convert(
                (datetime.datetime(1978, 11, 14, 0, 0),
                 datetime.datetime(1978, 11, 15, 23, 59)))
            new_ds_3_vars = VarNamesLike.convert(['sm'])
            new_ds_3_region = PolygonLike.convert("10,10,20,20")

            new_ds_w_region = data_source.make_local(
                new_ds_3_title,
                time_range=new_ds_3_time_range,
                var_names=new_ds_3_vars,
                region=new_ds_3_region)  # type: LocalDataSource
            self.assertIsNotNone(new_ds_w_region)
            self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_3_title)
            self.assertEqual(new_ds_w_region.spatial_coverage(),
                             PolygonLike.convert("10,10,20,20"))
            data_set = new_ds_w_region.open_dataset()
            self.assertSetEqual(set(data_set.variables),
                                {'sm', 'lat', 'lon', 'time'})

            no_data = data_source.make_local(
                'no_data',
                time_range=(datetime.datetime(2020, 11, 14, 0, 0),
                            datetime.datetime(2020, 11, 15, 23, 59)))
            self.assertIsNone(no_data)
Exemplo n.º 28
0
    def make_local(self,
                   local_name: str,
                   local_id: str = None,
                   time_range: TimeRangeLike.TYPE = None,
                   region: PolygonLike.TYPE = None,
                   var_names: VarNamesLike.TYPE = None,
                   monitor: Monitor = Monitor.NONE) -> Optional[DataSource]:

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(var_names) if var_names else None

        ds_id = local_name
        title = local_id

        local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            add_to_data_store_registry()
            local_store = DATA_STORE_REGISTRY.get_data_store('local')
        if not local_store:
            raise ValueError('Cannot initialize `local` DataStore')

        uuid = LocalDataStore.generate_uuid(ref_id=self.id, time_range=time_range, region=region, var_names=var_names)

        if not ds_id:
            ds_id = "local.{}.{}".format(self.id, uuid)
            existing_ds_list = local_store.query(ds_id=ds_id)
            if len(existing_ds_list) == 1:
                return existing_ds_list[0]
        else:
            existing_ds_list = local_store.query(ds_id='local.%s' % ds_id)
            if len(existing_ds_list) == 1:
                if existing_ds_list[0].meta_info.get('uuid', None) == uuid:
                    return existing_ds_list[0]
                else:
                    raise ValueError('Datastore {} already contains dataset {}'.format(local_store.id, ds_id))

        local_meta_info = self.meta_info.copy()
        local_meta_info['ref_uuid'] = local_meta_info.get('uuid', None)
        local_meta_info['uuid'] = uuid

        local_ds = local_store.create_data_source(ds_id, title=title,
                                                  time_range=time_range, region=region, var_names=var_names,
                                                  meta_info=local_meta_info, lock_file=True)
        if local_ds:
            if not local_ds.is_complete:
                try:
                    self._make_local(local_ds, time_range, region, var_names, monitor=monitor)
                except Cancellation as c:
                    local_store.remove_data_source(local_ds)
                    raise c
                except Exception as e:
                    if local_ds.is_empty:
                        local_store.remove_data_source(local_ds)
                    raise e

            if local_ds.is_empty:
                local_store.remove_data_source(local_ds)
                return None

            local_store.register_ds(local_ds)
            return local_ds
        else:
            return None
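
Seen from the caller's side, make_local is used roughly as in the tests elsewhere in this document. A hedged sketch; the data store, data source id, local name, bounding box and variable list are all placeholders:

import datetime

# Assumes a data store with at least one matching data source is available.
data_source = some_data_store.query('some.dataset.id')[0]   # placeholder store and id

local_ds = data_source.make_local(
    'my_subset',                                            # local_name
    time_range=(datetime.datetime(2007, 1, 1, 0, 0),
                datetime.datetime(2007, 1, 31, 23, 59)),
    region='10,10,20,20',                                   # PolygonLike bounding box string
    var_names=['sm'])                                       # VarNamesLike list

if local_ds is None:
    print('no data found for the requested subset')
else:
    print(local_ds.id, local_ds.temporal_coverage())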
Exemplo n.º 29
0
    def _make_local(self,
                    local_ds: 'LocalDataSource',
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        local_id = local_ds.id

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        data_store_path = local_ds.data_store.data_store_path
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        monitor.start("Sync " + self.id, total_work=len(self._files.items()))
        for remote_relative_filepath, coverage in self._files.items():
            child_monitor = monitor.child(work=1)

            file_name = os.path.basename(remote_relative_filepath)
            local_relative_filepath = os.path.join(local_id, file_name)
            local_absolute_filepath = os.path.join(data_store_path,
                                                   local_relative_filepath)

            remote_absolute_filepath = os.path.join(
                self._data_store.data_store_path, remote_relative_filepath)

            if isinstance(coverage, Tuple):

                time_coverage_start = coverage[0]
                time_coverage_end = coverage[1]

                remote_netcdf = None
                local_netcdf = None
                if not time_range or (time_coverage_start >= time_range[0]
                                      and time_coverage_end <= time_range[1]):
                    if region or var_names:
                        try:
                            remote_netcdf = NetCDF4DataStore(
                                remote_absolute_filepath)

                            local_netcdf = NetCDF4DataStore(
                                local_absolute_filepath,
                                mode='w',
                                persist=True)
                            local_netcdf.set_attributes(
                                remote_netcdf.get_attrs())

                            remote_dataset = xr.Dataset.load_store(
                                remote_netcdf)

                            geo_lat_min = None
                            geo_lat_max = None
                            geo_lon_min = None
                            geo_lon_max = None

                            process_region = False
                            if region:
                                geo_lat_min = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lat_min')
                                geo_lat_max = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lat_max')
                                geo_lon_min = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lon_min')
                                geo_lon_max = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs, 'geospatial_lon_max')

                                geo_lat_res = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs,
                                    'geospatial_lon_resolution')
                                geo_lon_res = self._get_harmonized_coordinate_value(
                                    remote_dataset.attrs,
                                    'geospatial_lat_resolution')
                                if not (isnan(geo_lat_min)
                                        or isnan(geo_lat_max)
                                        or isnan(geo_lon_min)
                                        or isnan(geo_lon_max)
                                        or isnan(geo_lat_res)
                                        or isnan(geo_lon_res)):
                                    process_region = True

                                    [lon_min, lat_min, lon_max,
                                     lat_max] = region.bounds

                                    descending_data_order = set()
                                    for var in remote_dataset.coords.keys():
                                        coord = remote_dataset.coords[var]
                                        if coord[0] > coord[-1]:
                                            descending_data_order.add(var)

                                    if 'lat' not in descending_data_order:
                                        lat_min = lat_min - geo_lat_min
                                        lat_max = lat_max - geo_lat_min
                                    else:
                                        lat_min_copy = lat_min
                                        lat_min = geo_lat_max - lat_max
                                        lat_max = geo_lat_max - lat_min_copy

                                    if 'lon' not in descending_data_order:
                                        lon_min = lon_min - geo_lon_min
                                        lon_max = lon_max - geo_lon_min
                                    else:
                                        lon_min_copy = lon_min
                                        lon_min = geo_lon_max - lon_max
                                        lon_max = geo_lon_max - lon_min_copy

                                    lat_min = int(floor(lat_min / geo_lat_res))
                                    lat_max = int(ceil(lat_max / geo_lat_res))
                                    lon_min = int(floor(lon_min / geo_lon_res))
                                    lon_max = int(ceil(lon_max / geo_lon_res))

                                    remote_dataset = remote_dataset.isel(
                                        drop=False,
                                        lat=slice(lat_min, lat_max),
                                        lon=slice(lon_min, lon_max))
                                    if 'lat' not in descending_data_order:
                                        geo_lat_min_copy = geo_lat_min
                                        geo_lat_min = lat_min * geo_lat_res + geo_lat_min_copy
                                        geo_lat_max = lat_max * geo_lat_res + geo_lat_min_copy
                                    else:
                                        geo_lat_max_copy = geo_lat_max
                                        geo_lat_min = geo_lat_max_copy - lat_max * geo_lat_res
                                        geo_lat_max = geo_lat_max_copy - lat_min * geo_lat_res

                                    if 'lon' not in descending_data_order:
                                        geo_lon_min_copy = geo_lon_min
                                        geo_lon_min = lon_min * geo_lon_res + geo_lon_min_copy
                                        geo_lon_max = lon_max * geo_lon_res + geo_lon_min_copy
                                    else:
                                        geo_lon_max_copy = geo_lon_max
                                        geo_lon_min = geo_lon_max_copy - lon_max * geo_lon_res
                                        geo_lon_max = geo_lon_max_copy - lon_min * geo_lon_res

                            if not var_names:
                                var_names = [
                                    var_name for var_name in
                                    remote_netcdf.variables.keys()
                                ]
                            var_names.extend([
                                coord_name
                                for coord_name in remote_dataset.coords.keys()
                                if coord_name not in var_names
                            ])
                            child_monitor.start(label=file_name,
                                                total_work=len(var_names))
                            for sel_var_name in var_names:
                                var_dataset = remote_dataset.drop([
                                    var_name for var_name in
                                    remote_dataset.variables.keys()
                                    if var_name != sel_var_name
                                ])
                                if compression_enabled:
                                    var_dataset.variables.get(
                                        sel_var_name).encoding.update(
                                            encoding_update)
                                local_netcdf.store_dataset(var_dataset)
                                child_monitor.progress(work=1,
                                                       msg=sel_var_name)
                            if process_region:
                                local_netcdf.set_attribute(
                                    'geospatial_lat_min', geo_lat_min)
                                local_netcdf.set_attribute(
                                    'geospatial_lat_max', geo_lat_max)
                                local_netcdf.set_attribute(
                                    'geospatial_lon_min', geo_lon_min)
                                local_netcdf.set_attribute(
                                    'geospatial_lon_max', geo_lon_max)
                        finally:
                            if remote_netcdf:
                                remote_netcdf.close()
                            if local_netcdf:
                                local_netcdf.close()
                                local_ds.add_dataset(
                                    local_relative_filepath,
                                    (time_coverage_start, time_coverage_end))
                        child_monitor.done()
                    else:
                        shutil.copy(remote_absolute_filepath,
                                    local_absolute_filepath)
                        local_ds.add_dataset(
                            local_relative_filepath,
                            (time_coverage_start, time_coverage_end))
                        child_monitor.done()
        monitor.done()
        return local_id
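
The spatial subsetting above maps a geographic bounding box onto integer index slices using the dataset's geospatial_* attributes, with extra handling for descending coordinate order. A simplified sketch of the ascending-order arithmetic (grid origin, resolution and bounds are made up):

from math import ceil, floor

# Hypothetical grid metadata, as read from the geospatial_* attributes above.
geo_lat_min, geo_lat_res = -90.0, 0.25
geo_lon_min, geo_lon_res = -180.0, 0.25

# Requested bounds in (lon_min, lat_min, lon_max, lat_max) order, as returned by Polygon.bounds.
lon_min, lat_min, lon_max, lat_max = 10.0, 10.0, 20.0, 20.0

# Offset into the grid, divide by the cell size, and round outwards so the box is fully covered.
lat_start = int(floor((lat_min - geo_lat_min) / geo_lat_res))
lat_stop = int(ceil((lat_max - geo_lat_min) / geo_lat_res))
lon_start = int(floor((lon_min - geo_lon_min) / geo_lon_res))
lon_stop = int(ceil((lon_max - geo_lon_min) / geo_lon_res))

# dataset.isel(lat=slice(lat_start, lat_stop), lon=slice(lon_start, lon_stop)) then selects
# the covering cells; for descending grids the indices are mirrored first, as above.
print(lat_start, lat_stop, lon_start, lon_stop)   # 400 440 760 800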
Exemplo n.º 30
0
def long_term_average(ds: DatasetLike.TYPE,
                      var: VarNamesLike.TYPE = None,
                      monitor: Monitor = Monitor.NONE) -> xr.Dataset:
    """
    Perform a long term average of the given dataset by computing the mean of
    monthly values over the time range covered by the dataset. E.g. it averages
    all January values, all February values, etc., to create a dataset with
    twelve time slices, each containing the mean of the respective monthly values.

    For further information on climatological datasets, see
    http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#climatological-statistics

    :param ds: A monthly dataset to average
    :param var: If given, only these variables will be preserved in the resulting dataset
    :param monitor: A progress monitor
    :return: A climatological long term average dataset
    """
    ds = DatasetLike.convert(ds)
    # Check if time dtype is what we want
    if 'datetime64[ns]' != ds.time.dtype:
        raise ValueError(
            'Long term average operation expects a dataset with the'
            ' time coordinate of type datetime64[ns], but received'
            ' {}. Running the normalize operation on this'
            ' dataset may help'.format(ds.time.dtype))

    # Check if we have a monthly dataset
    try:
        if ds.attrs['time_coverage_resolution'] != 'P1M':
            raise ValueError(
                'Long term average operation expects a monthly dataset.'
                ' Running temporal aggregation on this dataset'
                ' beforehand may help.')
    except KeyError:
        raise ValueError('Could not determine temporal resolution. Running'
                         ' the adjust_temporal_attrs operation beforehand may'
                         ' help.')

    var = VarNamesLike.convert(var)
    # Shallow
    retset = ds.copy()
    if var:
        retset = select_var(retset, var)

    time_min = pd.Timestamp(ds.time.values[0])
    time_max = pd.Timestamp(ds.time.values[-1])

    total_work = 100

    with monitor.starting('LTA', total_work=total_work):
        monitor.progress(work=0)
        step = total_work / 12
        kwargs = {'monitor': monitor, 'step': step}
        retset = retset.groupby('time.month',
                                squeeze=False).apply(_mean, **kwargs)

    # Make the return dataset CF compliant
    retset = retset.rename({'month': 'time'})
    retset['time'] = pd.date_range('{}-01-01'.format(time_min.year),
                                   freq='MS',
                                   periods=12)

    climatology_bounds = xr.DataArray(data=np.tile([time_min, time_max],
                                                   (12, 1)),
                                      dims=['time', 'nv'],
                                      name='climatology_bounds')
    retset['climatology_bounds'] = climatology_bounds
    retset.time.attrs = ds.time.attrs
    retset.time.attrs['climatology'] = 'climatology_bounds'

    for var in retset.data_vars:
        try:
            retset[var].attrs['cell_methods'] = \
                    retset[var].attrs['cell_methods'] + ' time: mean over years'
        except KeyError:
            retset[var].attrs['cell_methods'] = 'time: mean over years'

    return retset
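
The heart of the operation is the 'time.month' group-by followed by a mean over the years; progress reporting, CF attributes and climatology_bounds are bookkeeping around it. A standalone sketch on synthetic monthly data (variable name and values are made up):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2000-01-01', '2004-12-01', freq='MS')   # five years of monthly steps
ds = xr.Dataset({'tas': (('time',), 280.0 + np.random.rand(len(time)))},
                coords={'time': time})

# Twelve slices, each the mean of the respective month over all years.
lta = ds.groupby('time.month').mean('time')
assert lta.sizes['month'] == 12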
Exemplo n.º 31
0
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values. Specify whether the geometries of
    the GeoDataFrame are to be aggregated. All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) + ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) + ' not aggregatable!')

    # The geometry column is only required when the input is a GeoDataFrame
    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError('Variable geometry not in GEO data frame!') from e

    # Aggregate columns
    if vns is None:
        df_buff = df.select_dtypes(include=types_accepted_for_agg).agg(aggregations)
    else:
        df_buff = df[vns].select_dtypes(include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
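
A usage sketch on a small in-memory GeoDataFrame; the column names, values and buffered point geometries are illustrative only:

import geopandas as gpd
import shapely.geometry

gdf = gpd.GeoDataFrame(
    {'population': [120, 45, 300], 'area_km2': [1.5, 0.7, 3.2]},
    geometry=[shapely.geometry.Point(0, 0).buffer(1),
              shapely.geometry.Point(2, 2).buffer(1),
              shapely.geometry.Point(5, 5).buffer(1)])

agg = data_frame_aggregate(gdf, var_names=['population', 'area_km2'],
                           aggregate_geometry=True)

# One row with population_count, population_mean, ..., area_km2_max plus the merged geometry.
print(list(agg.columns))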
Exemplo n.º 32
0
    def _make_local(self,
                    local_ds: LocalDataSource,
                    time_range: TimeRangeLike.TYPE = None,
                    region: PolygonLike.TYPE = None,
                    var_names: VarNamesLike.TYPE = None,
                    monitor: Monitor = Monitor.NONE):

        # local_name = local_ds.name
        local_id = local_ds.name

        time_range = TimeRangeLike.convert(time_range) if time_range else None
        region = PolygonLike.convert(region) if region else None
        var_names = VarNamesLike.convert(
            var_names) if var_names else None  # type: Sequence

        compression_level = get_config_value('NETCDF_COMPRESSION_LEVEL',
                                             NETCDF_COMPRESSION_LEVEL)
        compression_enabled = compression_level > 0

        encoding_update = dict()
        if compression_enabled:
            encoding_update.update({
                'zlib': True,
                'complevel': compression_level
            })

        if region or var_names:
            protocol = _ODP_PROTOCOL_OPENDAP
        else:
            protocol = _ODP_PROTOCOL_HTTP

        local_path = os.path.join(local_ds.data_store.data_store_path,
                                  local_id)
        if not os.path.exists(local_path):
            os.makedirs(local_path)

        selected_file_list = self._find_files(time_range)

        if protocol == _ODP_PROTOCOL_OPENDAP:

            files = self._get_urls_list(selected_file_list, protocol)
            monitor.start('Sync ' + self.name, total_work=len(files))
            for idx, dataset_uri in enumerate(files):
                child_monitor = monitor.child(work=1)

                file_name = os.path.basename(dataset_uri)
                local_filepath = os.path.join(local_path, file_name)

                time_coverage_start = selected_file_list[idx][1]
                time_coverage_end = selected_file_list[idx][2]

                remote_netcdf = None
                local_netcdf = None
                try:
                    remote_netcdf = NetCDF4DataStore(dataset_uri)

                    local_netcdf = NetCDF4DataStore(local_filepath,
                                                    mode='w',
                                                    persist=True)
                    local_netcdf.set_attributes(remote_netcdf.get_attrs())

                    remote_dataset = xr.Dataset.load_store(remote_netcdf)

                    process_region = False
                    if region:
                        geo_lat_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_min')
                        geo_lat_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_max')
                        geo_lon_min = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_min')
                        geo_lon_max = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_max')

                        geo_lat_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lon_resolution')
                        geo_lon_res = self._get_harmonized_coordinate_value(
                            remote_dataset.attrs, 'geospatial_lat_resolution')
                        if not (isnan(geo_lat_min) or isnan(geo_lat_max)
                                or isnan(geo_lon_min) or isnan(geo_lon_max)
                                or isnan(geo_lat_res) or isnan(geo_lon_res)):
                            process_region = True

                            # Polygon.bounds is (lon_min, lat_min, lon_max, lat_max)
                            [lon_min, lat_min, lon_max,
                             lat_max] = region.bounds

                            lat_min = floor(
                                (lat_min - geo_lat_min) / geo_lat_res)
                            lat_max = ceil(
                                (lat_max - geo_lat_min) / geo_lat_res)
                            lon_min = floor(
                                (lon_min - geo_lon_min) / geo_lon_res)
                            lon_max = ceil(
                                (lon_max - geo_lon_min) / geo_lon_res)

                            # TODO (kbernat): check why dataset.sel fails!
                            remote_dataset = remote_dataset.isel(
                                drop=False,
                                lat=slice(lat_min, lat_max),
                                lon=slice(lon_min, lon_max))

                            geo_lat_max = lat_max * geo_lat_res + geo_lat_min
                            geo_lat_min += lat_min * geo_lat_res
                            geo_lon_max = lon_max * geo_lon_res + geo_lon_min
                            geo_lon_min += lon_min * geo_lon_res

                    if not var_names:
                        var_names = [
                            var_name
                            for var_name in remote_netcdf.variables.keys()
                        ]
                    var_names.extend([
                        coord_name
                        for coord_name in remote_dataset.coords.keys()
                        if coord_name not in var_names
                    ])
                    child_monitor.start(label=file_name,
                                        total_work=len(var_names))
                    for sel_var_name in var_names:
                        var_dataset = remote_dataset.drop([
                            var_name
                            for var_name in remote_dataset.variables.keys()
                            if var_name != sel_var_name
                        ])
                        if compression_enabled:
                            var_dataset.variables.get(
                                sel_var_name).encoding.update(encoding_update)
                        local_netcdf.store_dataset(var_dataset)
                        child_monitor.progress(work=1, msg=sel_var_name)
                    if process_region:
                        local_netcdf.set_attribute('geospatial_lat_min',
                                                   geo_lat_min)
                        local_netcdf.set_attribute('geospatial_lat_max',
                                                   geo_lat_max)
                        local_netcdf.set_attribute('geospatial_lon_min',
                                                   geo_lon_min)
                        local_netcdf.set_attribute('geospatial_lon_max',
                                                   geo_lon_max)

                finally:
                    if remote_netcdf:
                        remote_netcdf.close()
                    if local_netcdf:
                        local_netcdf.close()
                        local_ds.add_dataset(
                            os.path.join(local_id, file_name),
                            (time_coverage_start, time_coverage_end))

                child_monitor.done()
        else:
            outdated_file_list = []
            for file_rec in selected_file_list:
                filename, _, _, file_size, url = file_rec
                dataset_file = os.path.join(local_path, filename)
                # todo (forman, 20160915): must perform better checks on dataset_file if it is...
                # ... outdated or incomplete or corrupted.
                # JSON also includes "checksum" and "checksum_type" fields.
                if not os.path.isfile(dataset_file) or (
                        file_size
                        and os.path.getsize(dataset_file) != file_size):
                    outdated_file_list.append(file_rec)

            if outdated_file_list:
                with monitor.starting('Sync ' + self.name,
                                      len(outdated_file_list)):
                    bytes_to_download = sum(
                        [file_rec[3] for file_rec in outdated_file_list])
                    dl_stat = _DownloadStatistics(bytes_to_download)

                    file_number = 1

                    for filename, coverage_from, coverage_to, file_size, url in outdated_file_list:
                        if monitor.is_cancelled():
                            raise InterruptedError
                        dataset_file = os.path.join(local_path, filename)
                        sub_monitor = monitor.child(work=1.0)

                        # noinspection PyUnusedLocal
                        def reporthook(block_number, read_size,
                                       total_file_size):
                            dl_stat.handle_chunk(read_size)
                            if monitor.is_cancelled():
                                raise InterruptedError
                            sub_monitor.progress(work=read_size,
                                                 msg=str(dl_stat))

                        sub_monitor_msg = "file %d of %d" % (
                            file_number, len(outdated_file_list))
                        with sub_monitor.starting(sub_monitor_msg, file_size):
                            urllib.request.urlretrieve(url[protocol],
                                                       filename=dataset_file,
                                                       reporthook=reporthook)
                        file_number += 1
                        local_ds.add_dataset(os.path.join(local_id, filename),
                                             (coverage_from, coverage_to))
        local_ds.save()
        monitor.done()
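
The HTTP download branch above reports progress through urllib's reporthook callback. A minimal standalone sketch of that mechanism (URL and target file are placeholders; the real code wires the callback into cate's monitor and download statistics):

import urllib.request

URL = 'http://example.com/data/file.nc'   # placeholder download URL
TARGET = 'file.nc'

def reporthook(block_number, block_size, total_file_size):
    # Called by urlretrieve after every chunk; total_file_size may be -1 when unknown.
    if total_file_size > 0:
        done = min(block_number * block_size, total_file_size)
        print('downloaded %d of %d bytes' % (done, total_file_size))

urllib.request.urlretrieve(URL, filename=TARGET, reporthook=reporthook)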
Exemplo n.º 33
0
    def test_make_local_and_update(self):

        soilmoisture_data_sources = self.data_store.query(
            query_expr='esacci.SOILMOISTURE.day.L3S.SSMV.multi-sensor.multi-platform.COMBINED.02-1.r1')
        soilmoisture_data_source = soilmoisture_data_sources[0]

        reference_path = os.path.join(os.path.dirname(__file__),
                                      os.path.normpath('resources/datasources/local/files/'))

        def find_files_mock(_, time_range):

            def build_file_item(item_name: str, date_from: datetime, date_to: datetime, size: int):

                return [item_name, date_from, date_to, size,
                        {'OPENDAP': os.path.join(reference_path, item_name),
                         'HTTPServer': 'file:' + urllib.request.pathname2url(os.path.join(reference_path, item_name))}]

            reference_files = {
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781114000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 14, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 14, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781115000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 15, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 15, 23, 59),
                    'size': 21511378
                },
                'ESACCI-SOILMOISTURE-L3S-SSMV-COMBINED-19781116000000-fv02.2.nc': {
                    'date_from': datetime.datetime(1978, 11, 16, 0, 0),
                    'date_to': datetime.datetime(1978, 11, 16, 23, 59),
                    'size': 21511378
                }
            }

            reference_files_list = []

            for reference_file in reference_files.items():
                file_name = reference_file[0]
                file_date_from = reference_file[1].get('date_from')
                file_date_to = reference_file[1].get('date_to')
                file_size = reference_file[1].get('size')
                if time_range:
                    if file_date_from >= time_range[0] and file_date_to <= time_range[1]:
                        reference_files_list.append(build_file_item(file_name,
                                                                    file_date_from,
                                                                    file_date_to,
                                                                    file_size))
                else:
                    reference_files_list.append(build_file_item(file_name,
                                                                file_date_from,
                                                                file_date_to,
                                                                file_size))
            return reference_files_list

        with unittest.mock.patch('cate.ds.esa_cci_odp.EsaCciOdpDataSource._find_files', find_files_mock):
            with unittest.mock.patch.object(EsaCciOdpDataStore, 'query', return_value=[]):

                new_ds_title = 'local_ds_test'
                new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                           datetime.datetime(1978, 11, 16, 23, 59)))
                try:
                    new_ds = soilmoisture_data_source.make_local(new_ds_title, time_range=new_ds_time_range)
                except Exception:
                    raise ValueError(reference_path, os.listdir(reference_path))
                self.assertIsNotNone(new_ds)

                self.assertEqual(new_ds.id, "local.%s" % new_ds_title)
                self.assertEqual(new_ds.temporal_coverage(), new_ds_time_range)

                new_ds_w_one_variable_title = 'local_ds_test_var'
                new_ds_w_one_variable_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                          datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_one_variable_var_names = VarNamesLike.convert(['sm'])

                new_ds_w_one_variable = soilmoisture_data_source.make_local(
                    new_ds_w_one_variable_title,
                    time_range=new_ds_w_one_variable_time_range,
                    var_names=new_ds_w_one_variable_var_names
                )
                self.assertIsNotNone(new_ds_w_one_variable)

                self.assertEqual(new_ds_w_one_variable.id, "local.%s" % new_ds_w_one_variable_title)
                ds = new_ds_w_one_variable.open_dataset()

                new_ds_w_one_variable_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(ds.variables),
                                    set(new_ds_w_one_variable_var_names))

                new_ds_w_region_title = 'from_local_to_local_region'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)

                new_ds_w_region_title = 'from_local_to_local_region_one_var'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

                new_ds_w_region_title = 'from_local_to_local_region_two_var_sm_uncertainty'
                new_ds_w_region_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                                    datetime.datetime(1978, 11, 16, 23, 59)))
                new_ds_w_region_var_names = VarNamesLike.convert(['sm', 'sm_uncertainty'])
                new_ds_w_region_spatial_coverage = PolygonLike.convert("10,20,30,40")

                new_ds_w_region = soilmoisture_data_source.make_local(
                    new_ds_w_region_title,
                    time_range=new_ds_w_region_time_range,
                    var_names=new_ds_w_region_var_names,
                    region=new_ds_w_region_spatial_coverage)  # type: LocalDataSource

                self.assertIsNotNone(new_ds_w_region)

                self.assertEqual(new_ds_w_region.id, "local.%s" % new_ds_w_region_title)

                self.assertEqual(new_ds_w_region.spatial_coverage(), new_ds_w_region_spatial_coverage)
                data_set = new_ds_w_region.open_dataset()
                new_ds_w_region_var_names.extend(['lat', 'lon', 'time'])

                self.assertSetEqual(set(data_set.variables), set(new_ds_w_region_var_names))

                empty_ds_timerange = (datetime.datetime(2017, 12, 1, 0, 0), datetime.datetime(2017, 12, 31, 23, 59))
                with self.assertRaises(DataAccessError) as cm:
                    soilmoisture_data_source.make_local('empty_ds', time_range=empty_ds_timerange)
                self.assertEqual(f'Data source "{soilmoisture_data_source.id}" does not'
                                 f' seem to have any datasets in given'
                                 f' time range {TimeRangeLike.format(empty_ds_timerange)}',
                                 str(cm.exception))

                new_ds_time_range = TimeRangeLike.convert((datetime.datetime(1978, 11, 14, 0, 0),
                                                           datetime.datetime(1978, 11, 14, 23, 59)))

                new_ds = soilmoisture_data_source.make_local("title_test_copy", time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], soilmoisture_data_source.meta_info['title'])

                title = "Title Test!"
                new_ds = soilmoisture_data_source.make_local("title_test_set", title, time_range=new_ds_time_range)
                self.assertIsNotNone(new_ds)
                self.assertEqual(new_ds.meta_info['title'], title)
Exemplo n.º 34
0
def data_frame_aggregate(df: DataFrameLike.TYPE,
                         var_names: VarNamesLike.TYPE = None,
                         aggregate_geometry: bool = False,
                         monitor: Monitor = Monitor.NONE) -> pd.DataFrame:
    """
    Aggregate columns into count, mean, median, sum, std, min, and max. Return a
    new (Geo)DataFrame with a single row containing all aggregated values. Specify whether the geometries of
    the GeoDataFrame are to be aggregated. All geometries are merged union-like.

    The return data type will always be the same as the input data type.

    :param df: The (Geo)DataFrame to be analysed
    :param var_names: Variables to be aggregated ('None' uses all aggregatable columns)
    :param aggregate_geometry: Aggregate (union like) the geometry and add it to the resulting GeoDataFrame
    :param monitor: Monitor for progress bar
    :return: returns either DataFrame or GeoDataFrame. Keeps input data type
    """
    vns = VarNamesLike.convert(var_names)

    df_is_geo = isinstance(df, gpd.GeoDataFrame)
    aggregations = ["count", "mean", "median", "sum", "std", "min", "max"]

    # Check var names integrity (aggregatable, exists in data frame)
    types_accepted_for_agg = ['float64', 'int64', 'bool']
    agg_columns = list(
        df.select_dtypes(include=types_accepted_for_agg).columns)

    if df_is_geo:
        agg_columns.append('geometry')

    columns = list(df.columns)

    if vns is None:
        vns = agg_columns

    diff = list(set(vns) - set(columns))
    if len(diff) > 0:
        raise ValidationError('Variable ' + ','.join(diff) +
                              ' not in data frame!')

    diff = list(set(vns) - set(agg_columns))
    if len(diff) > 0:
        raise ValidationError('Variable(s) ' + ','.join(diff) +
                              ' not aggregatable!')

    # The geometry column is only required when the input is a GeoDataFrame
    if df_is_geo:
        try:
            df['geometry']
        except KeyError as e:
            raise ValidationError(
                'Variable geometry not in GEO data frame!') from e

    # Aggregate columns
    if vns is None:
        df_buff = df.select_dtypes(
            include=types_accepted_for_agg).agg(aggregations)
    else:
        df_buff = df[vns].select_dtypes(
            include=types_accepted_for_agg).agg(aggregations)

    res = {}
    for n in df_buff.columns:
        for a in aggregations:
            val = df_buff[n][a]
            h = n + '_' + a
            res[h] = [val]

    df_agg = pd.DataFrame(res)

    # Aggregate (union) geometry if GeoDataFrame
    if df_is_geo and aggregate_geometry:
        total_work = 100
        num_work_rows = 1 + len(df) // total_work
        with monitor.starting('Aggregating geometry: ', total_work):
            multi_polygon = shapely.geometry.MultiPolygon()
            i = 0
            for rec in df.geometry:
                if monitor.is_cancelled():
                    break
                # noinspection PyBroadException
                try:
                    multi_polygon = multi_polygon.union(other=rec)
                except Exception:
                    pass

                if i % num_work_rows == 0:
                    monitor.progress(work=1)
                i += 1

        df_agg = gpd.GeoDataFrame(df_agg, geometry=[multi_polygon], crs=df.crs)

    return df_agg
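
As the docstring states, the operation also accepts a plain pandas DataFrame, in which case only the numeric columns are aggregated and an ordinary DataFrame comes back. A short sketch with illustrative values:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3], 'b': [0.5, 1.5, 2.5]})
agg = data_frame_aggregate(df, var_names=['a', 'b'])

# A single row with columns a_count, a_mean, ..., b_max.
print(agg.iloc[0])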