Пример #1
0
    def _read_gridded(self, reader, var_name, is_model=True, **kwargs):
        try:
            start = kwargs.pop('start')
        except KeyError:
            start = self.start

        try:
            stop = kwargs.pop('stop')
        except KeyError:
            stop = self.stop
        if is_model:
            vert_which = self.obs_vert_type
            ts_type_read = self.model_ts_type_read
            if self.model_use_climatology:
                start = 9999
                stop = None
        else:
            vert_which = None
            ts_type_read = self.obs_ts_type_read

        try:
            # set defaults if input was not specified explicitely
            if ts_type_read is None and not self.flex_ts_type_gridded:
                ts_type_read = self.ts_type
            if not 'vert_which' in kwargs:
                kwargs['vert_which'] = vert_which
            if not 'ts_type' in kwargs:
                kwargs['ts_type'] = ts_type_read

            if isinstance(kwargs['ts_type'], dict):
                kwargs['ts_type'] = kwargs['ts_type'][var_name]

            return reader.read_var(var_name,
                                   start=start,
                                   stop=stop,
                                   flex_ts_type=self.flex_ts_type_gridded,
                                   **kwargs)
        except DataCoverageError:
            vt = None
            if is_model:
                if self.obs_vert_type in self.OBS_VERT_TYPES_ALT:
                    vt = self.OBS_VERT_TYPES_ALT[self.obs_vert_type]
                elif self.model_vert_type_alt is not None:
                    mva = self.model_vert_type_alt
                    if isinstance(mva, str):
                        vt = mva
                    elif isinstance(mva, dict) and var_name in mva:
                        vt = mva[var_name]

            if vt is None:
                raise DataCoverageError(
                    ('No data files available for dataset '
                     '{} ({})'.format(reader.data_id, var_name)))

            return reader.read_var(var_name,
                                   start=start,
                                   stop=stop,
                                   ts_type=ts_type_read,
                                   flex_ts_type=self.flex_ts_type_gridded,
                                   vert_which=vt)
Пример #2
0
    def read_ungridded(self, vars_to_read=None):
        """Helper to read UngriddedData

        Note
        ----
        Currently not used in main processing method
        :func:`_run_gridded_ungridded`. But should be.

        Parameters
        ----------
        vars_to_read : str or list, optional
            variables that should be read from obs-network (:attr:`obs_id`).
            Defaults to :attr:`obs_vars`. Variables that are not provided by
            the observation reader are skipped with a warning.

        Returns
        -------
        UngriddedData
            loaded data object (filtered if ``obs_filters`` is set and with
            outliers removed if :attr:`remove_outliers` is active)

        Raises
        ------
        DataCoverageError
            if none of the requested variables is provided by the
            observation reader

        """
        if isinstance(vars_to_read, str):
            vars_to_read = [vars_to_read]

        obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)
        if vars_to_read is None:
            vars_to_read = self.obs_vars

        # the list of supported variables does not depend on the loop
        # variable, so it is looked up only once
        supported = obs_reader.get_reader(self.obs_id).PROVIDES_VARIABLES
        obs_vars = []
        for var in vars_to_read:
            if var in supported:
                obs_vars.append(var)
            else:
                const.print_log.warning('Variable {} is not supported by {} '
                                        'and will be skipped'.format(
                                            var, self.obs_id))
        if not obs_vars:
            raise DataCoverageError(
                'No observation variable matches found for '
                '{}'.format(self.obs_id))

        if self.read_opts_ungridded is not None:
            ropts = self.read_opts_ungridded
        else:
            ropts = {}
        obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                                   vars_to_retrieve=obs_vars,
                                   **ropts)
        if 'obs_filters' in self:
            remaining_filters = self._eval_obs_filters()
            obs_data = obs_data.apply_filters(**remaining_filters)

        if self.remove_outliers:
            for var in obs_vars:
                low, high = None, None
                try:
                    low, high = self.var_ref_outlier_ranges[var]
                except (KeyError, TypeError, ValueError):
                    # No usable range for this variable (ranges unset, no
                    # entry, or entry not a (low, high) pair): pass None
                    # for both limits. Unrelated errors are no longer
                    # swallowed.
                    pass

                obs_data.remove_outliers(var, inplace=True, low=low, high=high)
        return obs_data
Пример #3
0
    def read_model_data(self, var_name, **kwargs):
        """Load the model data of one variable using the colocation settings

        Parameters
        ----------
        var_name : str
            name of the variable of interest
        **kwargs
            forwarded to :func:`_read_gridded`. The optional entry
            ``use_input_var`` is removed beforehand: if it evaluates to
            True, `var_name` is read as is, otherwise the model variable is
            determined via :func:`_find_var_matches` and the first match is
            read.

        Returns
        -------
        GriddedData
            data of the model variable

        Raises
        ------
        DataCoverageError
            if no matching model variable could be found
        """
        read_as_is = kwargs.pop('use_input_var', False)
        model_reader = self.instantiate_gridded_reader(what='model')

        if read_as_is:
            return self._read_gridded(model_reader, var_name,
                                      is_model=True, **kwargs)

        try:
            matches = self._find_var_matches(var_name, model_reader)
        except DataCoverageError:
            err_msg = 'No match could be found in {} for variable {}'
            raise DataCoverageError(err_msg.format(self.model_id, var_name))

        # first model variable that was matched
        model_var = list(matches.keys())[0]
        return self._read_gridded(model_reader, model_var,
                                  is_model=True, **kwargs)
Пример #4
0
    def _find_var_matches(self, obs_vars, model_reader, var_name=None):
        """Find variable matches in model data for input obs variables"""
        var_matches = {}

        muv, mav = {}, {}
        if isinstance(self.model_use_vars, dict):
            muv = self.model_use_vars

        if isinstance(self.model_add_vars, dict):
            mav = self.model_add_vars

        for obs_var in obs_vars:
            if obs_var in muv:
                model_var = muv[obs_var]
            else:
                model_var = obs_var

            self._check_add_model_read_aux(model_var, model_reader)

            if model_reader.has_var(model_var):
                var_matches[model_var] = obs_var

            if obs_var in mav:  #observation variable
                model_add_var = mav[obs_var]
                self._check_add_model_read_aux(model_add_var, model_reader)
                if model_reader.has_var(model_add_var):
                    var_matches[model_add_var] = obs_var

        for obs_var, obs_var_altname in self.obs_vars_rename.items():
            if obs_var_altname in var_matches:
                raise AttributeError('{} match was already found for obs '
                                     'var to be renamed {}...'.format(
                                         obs_var_altname, obs_var))
            if model_reader.has_var(obs_var_altname):
                var_matches[obs_var_altname] = obs_var

        if var_name is not None:
            if isinstance(var_name, str):
                var_name = [var_name]
            if not isinstance(var_name, list):
                raise ValueError('Invalid input for var_name. Need str or '
                                 'list, got {}'.format(var_name))
            _var_matches = {}
            for mvar, ovar in var_matches.items():
                if mvar in var_name or ovar in var_name:
                    _var_matches[mvar] = ovar
            var_matches = _var_matches

        if len(var_matches) == 0:

            raise DataCoverageError('No variable matches between '
                                    '{} and {} for input vars: {}'.format(
                                        self.model_id, self.obs_id,
                                        self.obs_vars))
        return var_matches
Пример #5
0
    def _read_gridded(self, reader, var_name, start, stop, is_model=True):
        if is_model:
            vert_which = self.obs_vert_type
            if all(x == '' for x in reader.file_info.vert_code.values):
                print_log.info('Deactivating model file search by vertical '
                               'code for {}, since filenames do not include '
                               'information about vertical code (probably '
                               'AeroCom 2 convention)'.format(reader.data_id))
                vert_which = None
            ts_type_read = self.model_ts_type_read
            if self.model_use_climatology:
                start = 9999
                stop = None
        else:
            vert_which = None
            ts_type_read = self.obs_ts_type_read
        msg = ('No data files available for dataset {} ({})'.format(
            reader.data_id, var_name))
        try:
            return reader.read_var(var_name,
                                   start=start,
                                   stop=stop,
                                   ts_type=ts_type_read,
                                   flex_ts_type=self.flex_ts_type_gridded,
                                   vert_which=vert_which)
        except DataCoverageError:
            vt = None
            if is_model:
                if self.obs_vert_type in self.OBS_VERT_TYPES_ALT:
                    vt = self.OBS_VERT_TYPES_ALT[self.obs_vert_type]
                elif self.model_vert_type_alt is not None:
                    mva = self.model_vert_type_alt
                    if isinstance(mva, str):
                        vt = mva
                    elif isinstance(mva, dict) and var_name in mva:
                        vt = mva[var_name]

            if vt is None:
                raise DataCoverageError(msg)

            return reader.read_var(var_name,
                                   start=start,
                                   stop=stop,
                                   ts_type=ts_type_read,
                                   flex_ts_type=self.flex_ts_type_gridded,
                                   vert_which=vt)
Пример #6
0
def compute_trends_station(station, var_name, start_year=None,
                           stop_year=None, season=None, slope_confidence=0.68,
                           **alt_range):
    """Compute the trend of one variable of a station data object

    The trends engine of the variable is stored under
    ``station['trends'][var_name]`` (and created if it does not exist yet).
    Missing daily and monthly timeseries are retrieved via
    ``station.to_timeseries`` and stored in the engine before the trend is
    computed.

    Parameters
    ----------
    station
        station data object (dict-like, providing ``get_var_ts_type``,
        ``to_timeseries``, ``get_meta``, ``var_info`` and ``station_name``)
    var_name : str
        name of the variable
    start_year
        passed to ``compute_trend`` of the trends engine
    stop_year
        passed to ``compute_trend`` of the trends engine
    season
        passed to ``compute_trend`` of the trends engine
    slope_confidence : float
        passed to ``compute_trend`` of the trends engine
    **alt_range
        passed to ``station.to_timeseries``

    Returns
    -------
    object
        result of ``compute_trend`` of the trends engine

    Raises
    ------
    TemporalResolutionError
        if no monthly data exists yet and the resolution check on the
        station data fails
    DataCoverageError
        if the monthly timeseries is empty or contains only NaNs
    """
    # make sure a trends engine is registered for this variable
    if 'trends' not in station:
        station['trends'] = od()
    registered = station['trends']
    if var_name in registered:
        trv = station['trends'][var_name]
    else:
        trv = TrendsEngine(var_name)
        station['trends'][var_name] = trv

    freq = station.get_var_ts_type(var_name)
    ts_types = const.GRID_IO.TS_TYPES

    # daily data is optional
    if not trv.has_daily and (
            freq not in ts_types
            or ts_types.index(freq) <= ts_types.index('daily')):
        trv['daily'] = station.to_timeseries(var_name, freq='daily',
                                             **alt_range)

    # monthly data is mandatory
    if trv.has_monthly:
        ms = trv['monthly']
    else:
        # NOTE(review): ">=" also rejects data whose frequency equals
        # 'monthly', although the message reads "monthly or higher" --
        # confirm against the ordering of const.GRID_IO.TS_TYPES
        if (freq in ts_types
                and ts_types.index(freq) >= ts_types.index('monthly')):
            raise TemporalResolutionError('Need monthly or higher')
        ms = station.to_timeseries(var_name, freq='monthly', **alt_range)
        trv['monthly'] = ms

    if not len(ms) or all(np.isnan(ms)):
        err_msg = 'Failed to retrieve monthly timeseries for {} ({})'
        raise DataCoverageError(err_msg.format(station.station_name,
                                               var_name))

    if trv._mobs is None:
        trv._mobs = _make_mobs_dataframe(ms)

    result = trv.compute_trend(start_year, stop_year, season,
                               slope_confidence)

    # attach station and variable metadata to the trends engine
    trv.meta.update(station.get_meta(add_none_vals=True))
    if var_name in station.var_info:
        trv.meta.update(station.var_info[var_name])
    return result
Пример #7
0
    def _find_var_matches(self, obs_vars, model_reader, var_name=None):
        """Find variable matches in model data for input obs variables"""
        if isinstance(obs_vars, str):
            obs_vars = [obs_vars]

        # dictionary that will map model variables (keys) with observation variables (values)
        var_matches = {}

        muv = self.model_use_vars if isinstance(self.model_use_vars,
                                                dict) else {}

        for obs_var in obs_vars:
            if obs_var in muv:
                model_var = muv[obs_var]
            else:
                model_var = obs_var

            try:
                self._check_add_model_read_aux(model_var, model_reader)

                if model_reader.has_var(model_var):
                    var_matches[model_var] = obs_var

                var_matches = self._check_model_add_var(
                    obs_var, model_reader, var_matches)
            except VariableDefinitionError:
                continue

        if var_name is not None:
            _var_matches = {}
            for mvar, ovar in var_matches.items():
                if mvar in var_name or ovar in var_name:
                    _var_matches[mvar] = ovar
            var_matches = _var_matches

        if len(var_matches) == 0:

            raise DataCoverageError('No variable matches between '
                                    '{} and {} for input vars: {}'.format(
                                        self.model_id, self.obs_id, obs_vars))
        return var_matches
Пример #8
0
    def get_file_names(self, request):
        """Get all files that match the request specifications

        Parameters
        ----------
        request : :obj:`EbasSQLRequest` or :obj:`str`
            request specifications

        Returns
        -------
        list
            list of file paths that match the request (first entry of each
            row returned by :func:`execute_request`)

        Raises
        ------
        DataCoverageError
            if the request does not return any row
        """
        # errors raised by execute_request propagate unchanged
        names = [row[0] for row in self.execute_request(request)]
        if not names:
            raise DataCoverageError(
                'No files could be found for request {}'.format(request))
        return names
Пример #9
0
    def _run_gridded_gridded(self, var_name=None):
        """Analysis method for gridded vs. gridded data

        Parameters
        ----------
        var_name : str or list, optional
            passed to :func:`_find_var_matches` to restrict the variables
            that are processed

        Returns
        -------
        dict
            colocated data objects, keyed by model variable name. No entry
            is added for variables that are skipped (failed reading or
            colocation while :attr:`raise_exceptions` is off, or existing
            output file that is not reanalysed).

        Raises
        ------
        NotImplementedError
            if observation filters remain that would have to be applied to
            the gridded observation data
        DataCoverageError
            if an observation variable is not provided by the observation
            reader
        TemporalResolutionError
            if the temporal resolution of the model or observation data is
            not one of ``const.GRID_IO.TS_TYPES``
        Exception
            if reading or colocation fails and :attr:`raise_exceptions` is
            active
        """
        start, stop = start_stop(self.start, self.stop)
        model_reader = ReadGridded(self.model_id)
        obs_reader = ReadGridded(self.obs_id)

        if 'obs_filters' in self:
            remaining_filters = self._eval_obs_filters()
            if bool(remaining_filters):
                raise NotImplementedError(
                    'Cannot apply filters {} to gridded '
                    'observation data.'.format(remaining_filters))

        obs_vars = self.obs_vars

        obs_vars_avail = obs_reader.vars_provided

        for obs_var in obs_vars:
            if obs_var not in obs_vars_avail:
                raise DataCoverageError(
                    'Variable {} is not supported by {}'.format(
                        obs_var, self.obs_id))

        var_matches = self._find_var_matches(obs_vars, model_reader, var_name)
        if self.remove_outliers:
            self._update_var_outlier_ranges(var_matches)

        all_ts_types = const.GRID_IO.TS_TYPES

        # NOTE(review): ts_type is initialised once, so an update made for
        # one variable pair carries over to the following pairs -- confirm
        # that this is intended
        ts_type = self.ts_type

        data_objs = {}

        for model_var, obs_var in var_matches.items():

            print_log.info('Running {} / {} ({}, {})'.format(
                self.model_id, self.obs_id, model_var, obs_var))
            try:
                model_data = self._read_gridded(reader=model_reader,
                                                var_name=model_var,
                                                start=start,
                                                stop=stop,
                                                is_model=True)
            except Exception as e:

                msg = (
                    'Failed to load gridded data: {} / {}. Reason {}'.format(
                        self.model_id, model_var, repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')

                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
                else:
                    continue

            if model_data.ts_type not in all_ts_types:
                raise TemporalResolutionError('Invalid temporal resolution {} '
                                              'in model {}'.format(
                                                  model_data.ts_type,
                                                  self.model_id))
            try:
                obs_data = self._read_gridded(reader=obs_reader,
                                              var_name=obs_var,
                                              start=start,
                                              stop=stop,
                                              is_model=False)
            except Exception as e:
                # report the observation dataset / variable that failed
                # (and not the model, which was read successfully)
                msg = (
                    'Failed to load gridded data: {} / {}. Reason {}'.format(
                        self.obs_id, obs_var, repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')

                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
                else:
                    continue

            if obs_data.ts_type not in all_ts_types:
                raise TemporalResolutionError('Invalid temporal resolution {} '
                                              'in obs {}'.format(
                                                  obs_data.ts_type,
                                                  self.obs_id))

            # update colocation ts_type, based on the available resolution in
            # model and obs.
            lowest = self.get_lowest_resolution(ts_type, model_data.ts_type,
                                                obs_data.ts_type)
            if lowest != ts_type:
                print_log.info('Updating ts_type from {} to {} (highest '
                               'available in {} / {} combination)'.format(
                                   ts_type, lowest, self.model_id,
                                   self.obs_id))
                ts_type = lowest

            if self.save_coldata:
                out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)

                savename = self._coldata_savename(model_data,
                                                  start,
                                                  stop,
                                                  ts_type,
                                                  var_name=model_var)

                file_exists = self._check_coldata_exists(
                    self.model_id, savename)
                if file_exists:
                    if not self.reanalyse_existing:
                        if self._log:
                            self._write_log('SKIP: {}\n'.format(savename))
                            print_log.info('Skip {} (file already '
                                           'exists)'.format(savename))
                        continue
                    else:
                        # existing file is recomputed below
                        os.remove(os.path.join(out_dir, savename))
            try:
                by = None
                if self.model_use_climatology:
                    by = to_pandas_timestamp(start).year
                coldata = colocate_gridded_gridded(
                        gridded_data=model_data,
                        gridded_data_ref=obs_data,
                        ts_type=ts_type,
                        start=start, stop=stop,
                        filter_name=self.filter_name,
                        regrid_res_deg=self.regrid_res_deg,
                        remove_outliers=self.remove_outliers,
                        vert_scheme=self.vert_scheme,
                        harmonise_units=self.harmonise_units,
                        var_outlier_ranges=self.var_outlier_ranges,
                        var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                        update_baseyear_gridded=by,
                        apply_time_resampling_constraints=\
                            self.apply_time_resampling_constraints,
                        min_num_obs=self.min_num_obs,
                        colocate_time=self.colocate_time,
                        var_keep_outliers=self.model_keep_outliers,
                        var_ref_keep_outliers=self.obs_keep_outliers)
                if self.save_coldata:
                    self._save_coldata(coldata, savename, out_dir, model_var,
                                       model_data, obs_var)
                    # savename only exists if saving is active; logging it
                    # unconditionally turned a successful colocation into
                    # a reported failure when save_coldata was off
                    if self._log:
                        self._write_log('WRITE: {}\n'.format(savename))
                        print_log.info('Writing file {}'.format(savename))
                data_objs[model_var] = coldata
            except Exception as e:
                msg = ('Colocation between model {} / {} and obs {} / {} '
                       'failed: Reason {}'.format(self.model_id, model_var,
                                                  self.obs_id, obs_var,
                                                  repr(e)))
                const.print_log.warning(msg)
                # newline terminated like all other log entries
                self._write_log(msg + '\n')
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
        return data_objs
Пример #10
0
    def _run_gridded_ungridded(self, var_name=None):
        """Analysis method for gridded vs. ungridded data

        Parameters
        ----------
        var_name : optional
            passed to :func:`_find_var_matches` to restrict the variables
            that are processed

        Returns
        -------
        dict
            colocated data objects, keyed by model variable name. No entry
            is added for variables that are skipped (failed reading or
            colocation while :attr:`raise_exceptions` is off, or existing
            output file that is not reanalysed).

        Raises
        ------
        DataCoverageError
            if none of the variables in :attr:`obs_vars` is provided by the
            observation reader
        Exception
            if reading of the model data or colocation fails and
            :attr:`raise_exceptions` is active
        """
        model_reader = ReadGridded(self.model_id)

        obs_reader = ReadUngridded(self.obs_id)

        obs_vars_supported = obs_reader.get_reader(
            self.obs_id).PROVIDES_VARIABLES

        # only consider variables that the observation reader provides
        obs_vars = list(np.intersect1d(self.obs_vars, obs_vars_supported))

        if len(obs_vars) == 0:
            raise DataCoverageError(
                'No observation variable matches found for '
                '{}'.format(self.obs_id))

        var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

        if self.read_opts_ungridded is not None:
            ropts = self.read_opts_ungridded
        else:
            ropts = {}
        # observations are read once, for all supported variables
        obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                                   vars_to_retrieve=obs_vars,
                                   **ropts)
        if 'obs_filters' in self:
            remaining_filters = self._eval_obs_filters()
            obs_data = obs_data.apply_filters(**remaining_filters)

        if self.remove_outliers:
            self._update_var_outlier_ranges(var_matches)

        #all_ts_types = const.GRID_IO.TS_TYPES

        data_objs = {}
        for model_var, obs_var in var_matches.items():

            # colocation frequency and period are reset for each variable
            ts_type = self.ts_type
            start, stop = start_stop(self.start, self.stop)
            print_log.info('Running {} / {} ({}, {})'.format(
                self.model_id, self.obs_id, model_var, obs_var))
            try:
                model_data = self._read_gridded(reader=model_reader,
                                                var_name=model_var,
                                                start=start,
                                                stop=stop,
                                                is_model=True)
            except Exception as e:

                msg = (
                    'Failed to load gridded data: {} / {}. Reason {}'.format(
                        self.model_id, model_var, repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')

                # either abort the whole run or skip this variable
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
                else:
                    continue
            ts_type_src = model_data.ts_type
            # =============================================================================
            #             if not model_data.ts_type in all_ts_types:
            #                 raise TemporalResolutionError('Invalid temporal resolution {} '
            #                                               'in model {}'.format(model_data.ts_type,
            #                                                                    self.model_id))
            # =============================================================================
            # stations to be ignored: either one specification for all
            # variables or a dictionary with one entry per obs variable
            ignore_stats = None
            if self.ignore_station_names is not None:
                ignore_stats = self.ignore_station_names
                if isinstance(ignore_stats, dict):
                    if obs_var in ignore_stats:
                        ignore_stats = ignore_stats[obs_var]
                    else:
                        ignore_stats = None

            #ts_type_src = model_data.ts_type
            # use the frequency of the model data if it compares lower than
            # the requested colocation frequency
            if TsType(ts_type_src) < TsType(
                    ts_type):  # < all_ts_types.index(ts_type_src):
                print_log.info('Updating ts_type from {} to {} (highest '
                               'available in model {})'.format(
                                   ts_type, ts_type_src, self.model_id))
                ts_type = ts_type_src

            if self.save_coldata:
                savename = self._coldata_savename(model_data,
                                                  start,
                                                  stop,
                                                  ts_type,
                                                  var_name=model_var)

                # NOTE(review): existence is checked with model_data.data_id
                # while the output directory is built from self.model_id --
                # presumably identical, confirm
                file_exists = self._check_coldata_exists(
                    model_data.data_id, savename)

                out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
                if file_exists:
                    if not self.reanalyse_existing:
                        # NOTE(review): file_status is only updated if
                        # logging is active -- confirm that this is intended
                        if self._log:
                            self._write_log('SKIP: {}\n'.format(savename))
                            print_log.info('Skip {} (file already '
                                           'exists)'.format(savename))
                            self.file_status[savename] = 'skipped'
                        continue
                    else:
                        print_log.info(
                            'Deleting and recomputing existing '
                            'colocated data file {}'.format(savename))
                        # NOTE(review): unlike "SKIP", this entry goes to
                        # print_log and not to the log file -- confirm
                        print_log.info('REMOVE: {}\n'.format(savename))
                        os.remove(os.path.join(out_dir, savename))

            try:
                # base year passed on as update_baseyear_gridded, only set
                # when the model data is a climatology
                by = None
                if self.model_use_climatology:
                    by = start.year
                coldata = colocate_gridded_ungridded(
                    gridded_data=model_data,
                    ungridded_data=obs_data,
                    ts_type=ts_type,
                    start=start,
                    stop=stop,
                    var_ref=obs_var,
                    filter_name=self.filter_name,
                    regrid_res_deg=self.regrid_res_deg,
                    remove_outliers=self.remove_outliers,
                    vert_scheme=self.vert_scheme,
                    harmonise_units=self.harmonise_units,
                    var_outlier_ranges=self.var_outlier_ranges,
                    var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                    update_baseyear_gridded=by,
                    ignore_station_names=ignore_stats,
                    apply_time_resampling_constraints=self.
                    apply_time_resampling_constraints,
                    min_num_obs=self.min_num_obs,
                    colocate_time=self.colocate_time,
                    var_keep_outliers=self.model_keep_outliers,
                    var_ref_keep_outliers=self.obs_keep_outliers)

                if self.save_coldata:
                    self._save_coldata(coldata, savename, out_dir, model_var,
                                       model_data, obs_var)
                data_objs[model_var] = coldata
            except Exception as e:
                msg = ('Colocation between model {} / {} and obs {} / {} '
                       'failed: Reason {}'.format(self.model_id, model_var,
                                                  self.obs_id, obs_var,
                                                  repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')
                # without raise_exceptions the failure is only logged and
                # the loop continues with the next variable
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)

        return data_objs
Пример #11
0
 def _run_gridded_gridded(self):
 
     start, stop = self.start, self.stop
     model_reader = ReadGridded(self.model_id, start, stop)
     obs_reader = ReadGridded(self.obs_id, start, stop)
 
     vars_to_analyse = self.vars_to_analyse
     if vars_to_analyse is None:
         vars_to_analyse = model_reader.vars_provided
         
     var_matches = {}
     for var in vars_to_analyse:
         if var in model_reader.vars_provided: #candidate
             # first check if the variable pair was defined explicitely
             if var in self.alt_vars:
                 if self.alt_vars[var] in obs_reader.vars_provided:
                     var_matches[var] = self.alt_vars[var]
             else:
                 if var in obs_reader.vars_provided:
                     var_matches[var] = var
     
     if len(var_matches) == 0:
         raise DataCoverageError('No variable matches between {} and {} for '
                                 'input vars: {}'.format(self.model_id, 
                                                         self.obs_id, 
                                                         self.vars_to_analyse))
         
     all_ts_types = const.GRID_IO.TS_TYPES
     ts_types_ana = self.ts_types_ana
     if ts_types_ana is None:
         ts_types_ana = self._setup.TS_TYPES_ANA_DEFAULT['gridded']
     
     ts_types_read = self.ts_types_read
     if ts_types_read is None:
         ts_types_read = model_reader.ts_types
     
     vars_model = list(var_matches.keys())
     vars_obs = list(var_matches.values())
     flex_obs = self._setup.options.TS_TYPE_OBS_FLEX
     for ts_type_read in ts_types_read:
         # reads only year if starttime is provided but not stop time
         model_data_vars = model_reader.read(vars_model, 
                                             start=start,
                                             stop=stop,
                                             ts_type=ts_type_read,
                                             flex_ts_type=False)
         
         if len(model_data_vars) == 0:
             if self._log:    
                 self._log.write('No model data available ({}-{}, {})\n'
                                 .format(start, stop, ts_type_read))
             continue
         
         obs_data_vars = obs_reader.read(vars_obs, 
                                         start=start,
                                         stop=stop,
                                         ts_type=ts_type_read,
                                         flex_ts_type=flex_obs)
         if len(obs_data_vars) == 0:
             if self._log:    
                 self._log.write('No obs data available for variables {} '
                                 '({}-{}, {})\n'
                                 .format(vars_obs, start, stop, 
                                         ts_type_read))
             continue
         
         for model_data in model_data_vars:
             var = model_data.var_name
             obs_data = None
             for _obs in obs_data_vars:
                 if _obs.var_name == var_matches[var]:
                     obs_data = _obs
                     break
             if obs_data is None:
                 if self._log:    
                     self._log.write('No obs data available for model var {} '
                                     '({}-{}, {})\n'
                                     .format(var, start, stop, 
                                         ts_type_read))
                 continue
             for ts_type_ana in ts_types_ana:
                 # model resolution (ts_type) must be equal or higher 
                 # than the current analysis setting (since )
                 if all_ts_types.index(ts_type_ana) >= all_ts_types.index(ts_type_read):
                     out_dir = chk_make_subdir(self.output_dir('colocate'),
                                               self.model_id)
                                               
                     savename = self._coldata_save_name(model_data,
                                                        ts_type_ana, 
                                                        start,
                                                        stop)
                     
                     file_exists = self._check_coldata_exists(self.model_id,
                                                               savename)
                     if file_exists:
                         if not self.options.REANALYSE_EXISTING:
                             if self._log:
                                 self._log.write('SKIP: {}\n'.format(savename))
                                 print_log.info('Skip {} (file already '
                                                'exists)'.format(savename))
                             continue
                         else:
                             os.remove(os.path.join(out_dir, savename))
                         
                     data_coll = colocate_gridded_gridded(
                                     model_data, obs_data, 
                                     ts_type=ts_type_ana, 
                                     start=start, stop=stop, 
                                     filter_name=self.filter_name)
                     self._last_coldata = data_coll
                     if data_coll.save_name_aerocom + '.nc' != savename:
                         raise Exception
                     data_coll.to_netcdf(out_dir)
                     if self._log:
                         self._log.write('WRITE: {}\n'.format(savename))
                         print_log.info('Writing {}'.format(savename))
Пример #12
0
 def _run_gridded_ungridded(self):
     """Colocate gridded model data with ungridded observations

     Every model variable that has a counterpart in the observation
     dataset (same name, or the name registered in ``self.alt_vars``) is
     read for each temporal resolution in question and colocated with the
     observations for each analysis resolution whose position in
     ``const.GRID_IO.TS_TYPES`` is not before the position of the
     resolution that was read.

     Each result is stored in ``self._last_coldata`` and written with
     ``to_netcdf`` into the directory returned by ``chk_make_subdir``.
     Output files that already exist are skipped, unless
     ``self.options.REANALYSE_EXISTING`` is set, in which case they are
     deleted and recomputed.

     Raises
     ------
     DataCoverageError
         if none of the variables to be analysed is available both in the
         model data and in the observation dataset.
     """
     start, stop = self.start, self.stop
     model_reader = ReadGridded(self.model_id, start, stop)

     obs_reader = ReadUngridded(self.obs_id)
     obs_vars = obs_reader.get_reader(self.obs_id).PROVIDES_VARIABLES

     requested = self.vars_to_analyse
     if requested is None:
         requested = model_reader.vars_provided

     # maps model variable name -> observation variable name
     var_matches = {}
     for model_var in requested:
         if model_var not in model_reader.vars_provided:
             continue
         # observation variable may be registered under an alternative name
         if model_var in self.alt_vars:
             obs_name = self.alt_vars[model_var]
         else:
             obs_name = model_var
         if obs_name in obs_vars:
             var_matches[model_var] = obs_name

     if not var_matches:
         raise DataCoverageError('No variable matches between '
                                 '{} and {} for input vars: {}'
                                 .format(self.model_id,
                                         self.obs_id,
                                         self.vars_to_analyse))

     all_ts_types = const.GRID_IO.TS_TYPES

     ts_types_ana = self.ts_types_ana
     if ts_types_ana is None:
         ts_types_ana = self._setup.TS_TYPES_ANA_DEFAULT['ungridded']

     ts_types_read = self.ts_types_read
     if ts_types_read is None:
         ts_types_read = model_reader.ts_types

     vars_model = list(var_matches.keys())
     vars_obs = list(var_matches.values())

     # observations are read once, for all matched variables
     obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                                vars_to_retrieve=vars_obs)

     for ts_type_read in ts_types_read:
         model_data_vars = model_reader.read(vars_model,
                                             start=start,
                                             stop=stop,
                                             ts_type=ts_type_read,
                                             flex_ts_type=False)

         if len(model_data_vars) == 0:
             if self._log:
                 self._log.write('No model data available ({}-{}, {})\n'
                                 .format(start, stop, ts_type_read))
             continue

         for model_data in model_data_vars:
             model_var = model_data.var_info.var_name
             obs_var = var_matches[model_var]
             if obs_var not in obs_reader.data:
                 if self._log:
                     self._log.write('No obs data available for variable {} '
                                     '({}-{}, {})\n'
                                     .format(obs_var, start, stop,
                                             ts_type_read))
                 continue

             for ts_type_ana in ts_types_ana:
                 # skip analysis resolutions listed before the one read
                 if (all_ts_types.index(ts_type_ana)
                         < all_ts_types.index(ts_type_read)):
                     continue

                 out_dir = chk_make_subdir(self.output_dir('colocate'),
                                           self.model_id)
                 savename = self._coldata_save_name(model_data,
                                                    ts_type_ana,
                                                    start,
                                                    stop)
                 file_exists = self._check_coldata_exists(self.model_id,
                                                          savename)
                 if file_exists:
                     if self.options.REANALYSE_EXISTING:
                         # recompute: drop the outdated file first
                         os.remove(os.path.join(out_dir, savename))
                     else:
                         if self._log:
                             self._log.write('SKIP: {}\n'
                                             .format(savename))
                             print_log.info('Skip {} (file already '
                                            'exists)'.format(savename))
                         continue

                 data_coll = colocate_gridded_ungridded_2D(
                                         model_data, obs_data,
                                         ts_type=ts_type_ana,
                                         start=start, stop=stop,
                                         var_ref=obs_var,
                                         filter_name=self.filter_name)
                 self._last_coldata = data_coll
                 data_coll.to_netcdf(out_dir)
                 if self._log:
                     self._log.write('WRITE: {}\n'.format(savename))
                     print_log.info('Writing {}'.format(savename))

                 plt.close('all')
Пример #13
0
def merge_station_data(stats,
                       var_name,
                       pref_attr=None,
                       sort_by_largest=True,
                       fill_missing_nan=True,
                       **add_meta_keys):
    """Merge multiple StationData objects (from one station) into one instance
    
    Note
    ----
    - all input :class:`StationData` objects need to have same attributes\
       ``station_name``, ``latitude``, ``longitude`` and ``altitude``
    - the input list ``stats`` itself is neither reordered nor shortened. 
      The station object that is ranked first is used as merge target 
      (``merge_other`` is called on it) and is the object that is returned.
    
    Parameters
    ----------
    stats : list
        list containing :class:`StationData` objects (note: all of these 
        objects must contain variable data for the specified input variable)
    var_name : str
        data variable name that is to be merged
    pref_attr 
        optional argument that may be used to specify a metadata attribute
        that is available in all input :class:`StationData` objects and that
        is used to order the input stations by relevance. The associated values
        of this attribute need to be sortable (e.g. revision_date). This is 
        only relevant in case overlaps occur. If unspecified the relevance of 
        the stations is sorted based on the length of the associated data 
        arrays.
    sort_by_largest : bool
        if True, the result from the sorting is inverted. E.g. if 
        ``pref_attr`` is unspecified, then the stations will be sorted based on
        the length of the data vectors, starting with the shortest, ending with
        the longest. This sorting result will then be inverted, if 
        ``sort_by_largest=True``, so that the longest time series gets highest
        importance. If, e.g. ``pref_attr='revision_date'``, then the stations 
        are sorted by the associated revision date value, starting with the 
        earliest, ending with the latest (which will also be inverted if 
        this argument is set to True)
    fill_missing_nan : bool
        if True, the resulting time series is filled with NaNs. NOTE: this 
        requires that information about the temporal resolution (ts_type) of
        the data is available in each of the StationData objects.
    **add_meta_keys
        additional keyword args that are passed on to ``merge_other``
        
    Returns
    -------
    StationData
        the highest ranked input station, after all other stations have been
        merged into it
        
    Raises
    ------
    DataCoverageError
        if one of the input stations does not contain ``var_name``
    ValueError
        if variable data is not a pandas Series and cannot be converted
    MetaDataError
        if ``fill_missing_nan`` is True and ts_type cannot be accessed for one
        of the stations
    IndexError
        if ``stats`` is empty
    """
    # make sure the data is provided as pandas.Series object
    for stat in stats:
        if not var_name in stat:
            raise DataCoverageError(
                'All input station must contain {} data'.format(var_name))
        elif not isinstance(stat[var_name], pd.Series):
            try:
                stat._to_ts_helper(var_name)
            except Exception as e:
                raise ValueError(
                    'Data needs to be provided as pandas Series in '
                    'individual station data objects. Attempted to '
                    'convert but failed with the following '
                    'exception: {}'.format(repr(e)))
        # NOTE(review): being part of the elif chain, the ts_type check is
        # not performed for stations that had to be converted above
        elif fill_missing_nan:
            try:
                stat.get_var_ts_type(var_name)
            except MetaDataError:
                raise MetaDataError(
                    'Cannot merge StationData objects: one or '
                    'more of the provided objects does not '
                    'provide information about the ts_type of '
                    'the {} data, which is required when input '
                    'arg. fill_missing_nan is True.'.format(var_name))

    # rank the stations on a copy, so that the list provided by the caller
    # remains untouched
    if pref_attr is not None:
        ordered = sorted(stats, key=lambda s: s[pref_attr])
    else:
        ordered = sorted(stats, key=lambda s: len(s[var_name].dropna()))

    if sort_by_largest:
        # reverse after sorting (rather than sorted(..., reverse=True)) to
        # retain the original ordering among stations with equal rank
        ordered.reverse()

    # highest ranked station is the merge target
    first = ordered[0]

    for stat in ordered[1:]:
        first.merge_other(stat, var_name, **add_meta_keys)

    if fill_missing_nan:
        first.insert_nans(var_name)
    return first
Пример #14
0
    def _run_gridded_ungridded(self, var_name=None):
        """Analysis method for gridded vs. ungridded data

        Determines the model / observation variable pairs that can be
        colocated and then, for each pair, reads the gridded model data,
        reads the ungridded observations (one variable at a time) and
        computes the colocated data object. If ``self.save_coldata`` is set,
        results are stored via ``self._save_coldata`` and already existing
        files are skipped, unless ``self.reanalyse_existing`` is set, in
        which case the existing file is deleted and recomputed.

        Parameters
        ----------
        var_name
            passed on to ``self._find_var_matches`` (presumably used to
            restrict the analysis to a certain variable; defaults to None)

        Returns
        -------
        dict
            colocated data objects that could be computed, keys are the
            model variable names

        Raises
        ------
        DataCoverageError
            if none of the observation variables is supported
        Exception
            if ``self.raise_exceptions`` is set and reading of the model data
            or the colocation fails for one of the variables. Otherwise, such
            failures are logged and the affected variable is skipped.
        """
        print_log.info('PREPARING colocation of {} vs. {}'.format(
            self.model_id, self.obs_id))

        model_reader = self.instantiate_gridded_reader(what='model')
        obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)

        obs_vars = obs_reader.get_vars_supported(self.obs_id, self.obs_vars)

        if len(obs_vars) == 0:
            raise DataCoverageError(
                'No observation variable matches found for '
                '{}'.format(self.obs_id))

        var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

        print_log.info(
            'The following variable combinations will be colocated\n'
            'MODEL-VAR\tOBS-VAR')
        for key, val in var_matches.items():
            print_log.info('{}\t{}'.format(key, val))

        # get list of unique observation variables
        obs_vars = np.unique(list(var_matches.values())).tolist()

        if self.remove_outliers:
            self._update_var_outlier_ranges(var_matches)

        if self.read_opts_ungridded is not None:
            ropts = self.read_opts_ungridded
        else:
            ropts = {}

        data_objs = {}
        if self.start is None:
            self._infer_start_stop(model_reader)

        start, stop = start_stop(self.start, self.stop)

        for model_var, obs_var in var_matches.items():

            # ToDo: consider removing outliers already here.
            ts_type = self.ts_type
            print_log.info('Running {} / {} ({}, {})'.format(
                self.model_id, self.obs_id, model_var, obs_var))

            try:
                model_data = self._read_gridded(reader=model_reader,
                                                var_name=model_var,
                                                start=start,
                                                stop=stop,
                                                is_model=True)
            except Exception as e:

                msg = (
                    'Failed to load gridded data: {} / {}. Reason {}'.format(
                        self.model_id, model_var, repr(e)))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')

                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)
                else:
                    continue
            ts_type_src = model_data.ts_type
            rshow = self._eval_resample_how(model_var, obs_var)
            if ts_type is None:
                # if colocation frequency is not specified
                ts_type = ts_type_src

            # station names to be ignored may be specified globally or, via
            # dict, per observation variable
            ignore_stats = None
            if self.ignore_station_names is not None:
                ignore_stats = self.ignore_station_names
                if isinstance(ignore_stats, dict):
                    if obs_var in ignore_stats:
                        ignore_stats = ignore_stats[obs_var]
                    else:
                        ignore_stats = None

            if TsType(ts_type_src) < TsType(ts_type):
                print_log.info('Updating ts_type from {} to {} (highest '
                               'available in model {})'.format(
                                   ts_type, ts_type_src, self.model_id))
                ts_type = ts_type_src

            really_do_reanalysis = True
            if self.save_coldata:
                really_do_reanalysis = False
                savename = self._coldata_savename(model_data,
                                                  start,
                                                  stop,
                                                  ts_type,
                                                  var_name=model_var)

                file_exists = self._check_coldata_exists(
                    model_data.data_id, savename)

                out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
                if file_exists:
                    if not self.reanalyse_existing:
                        if self._log:
                            self._write_log('SKIP: {}\n'.format(savename))
                            print_log.info('Skip {} (file already '
                                           'exists)'.format(savename))
                            self.file_status[savename] = 'skipped'
                        continue
                    else:
                        really_do_reanalysis = True
                        print_log.info(
                            'Deleting and recomputing existing '
                            'colocated data file {}'.format(savename))
                        print_log.info('REMOVE: {}\n'.format(savename))
                        os.remove(os.path.join(out_dir, savename))
                else:
                    really_do_reanalysis = True

            if really_do_reanalysis:
                #Reading obs data only if the co-located data file does
                #not already exist.
                #This part of the method has been changed by @hansbrenna to work better with
                #large observational data sets. Only one variable is loaded into
                # the UngriddedData object at a time. Currently the variable is
                #re-read a lot of times, which is a weakness.
                obs_data = obs_reader.read(vars_to_retrieve=obs_var,
                                           only_cached=self._obs_cache_only,
                                           **ropts)

                # ToDo: consider removing outliers already here.
                if 'obs_filters' in self:
                    remaining_filters = self._eval_obs_filters()
                    obs_data = obs_data.apply_filters(**remaining_filters)

            try:
                # stop time that is passed to the colocation routine. This is
                # kept separate from ``stop``, since the latter is also used
                # for reading and for the file names of the variables that
                # are processed in the following iterations of this loop
                coloc_stop = stop
                try:
                    by = self.update_baseyear_gridded
                    coloc_stop = None
                except AttributeError:
                    by = None
                if self.model_use_climatology:
                    by = start.year
                coldata = colocate_gridded_ungridded(
                    gridded_data=model_data,
                    ungridded_data=obs_data,
                    ts_type=ts_type,
                    start=start,
                    stop=coloc_stop,
                    var_ref=obs_var,
                    filter_name=self.filter_name,
                    regrid_res_deg=self.regrid_res_deg,
                    remove_outliers=self.remove_outliers,
                    vert_scheme=self.vert_scheme,
                    harmonise_units=self.harmonise_units,
                    var_outlier_ranges=self.var_outlier_ranges,
                    var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                    update_baseyear_gridded=by,
                    ignore_station_names=ignore_stats,
                    apply_time_resampling_constraints=self.
                    apply_time_resampling_constraints,
                    min_num_obs=self.min_num_obs,
                    colocate_time=self.colocate_time,
                    var_keep_outliers=self.model_keep_outliers,
                    var_ref_keep_outliers=self.obs_keep_outliers,
                    use_climatology_ref=self.obs_use_climatology,
                    resample_how=rshow)

                if self.model_to_stp:
                    coldata = correct_model_stp_coldata(coldata)
                if self.save_coldata:
                    self._save_coldata(coldata, savename, out_dir, model_var,
                                       model_data, obs_var)
                data_objs[model_var] = coldata
            except Exception:
                msg = ('Colocation between model {} / {} and obs {} / {} '
                       'failed.\nTraceback:\n{}'.format(
                           self.model_id, model_var, self.obs_id, obs_var,
                           traceback.format_exc()))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)

        return data_objs
Пример #15
0
def merge_station_data(stats,
                       var_name,
                       pref_attr=None,
                       sort_by_largest=True,
                       fill_missing_nan=True,
                       **add_meta_keys):
    """Merge multiple StationData objects (from one station) into one instance
    
    Note
    ----
    - all input :class:`StationData` objects need to have same attributes\
       ``station_name``, ``latitude``, ``longitude`` and ``altitude``
    - the input list ``stats`` itself is neither reordered nor shortened. 
      One of the input station objects is used as merge target and is the 
      object that is returned.
    
    Parameters
    ----------
    stats : list
        list containing :class:`StationData` objects (note: all of these 
        objects must contain variable data for the specified input variable)
    var_name : str
        data variable name that is to be merged (a list containing a single
        variable name is accepted as well)
    pref_attr 
        optional argument that may be used to specify a metadata attribute
        that is available in all input :class:`StationData` objects and that
        is used to order the input stations by relevance. The associated values
        of this attribute need to be sortable (e.g. revision_date). This is 
        only relevant in case overlaps occur. If unspecified the relevance of 
        the stations is sorted based on the length of the associated data 
        arrays.
    sort_by_largest : bool
        if True, the result from the sorting is inverted. E.g. if 
        ``pref_attr`` is unspecified, then the stations will be sorted based on
        the length of the data vectors, starting with the shortest, ending with
        the longest. This sorting result will then be inverted, if 
        ``sort_by_largest=True``, so that the longest time series gets highest
        importance. If, e.g. ``pref_attr='revision_date'``, then the stations 
        are sorted by the associated revision date value, starting with the 
        earliest, ending with the latest (which will also be inverted if 
        this argument is set to True)
    fill_missing_nan : bool
        if True, the resulting time series is filled with NaNs. NOTE: this 
        requires that information about the temporal resolution (ts_type) of
        the data is available in each of the StationData objects.
    **add_meta_keys
        additional keyword args that are passed on to ``merge_other`` (or to
        ``merge_meta_same_station`` in case of profile data)
        
    Returns
    -------
    StationData
        merged station data; the value of ``pref_attr`` is stored under key 
        ``stat_merge_pref_attr``
        
    Raises
    ------
    NotImplementedError
        if more than one variable name is provided, or, for profile data, if 
        a station contains more than one unique timestamp
    DataCoverageError
        if one of the input stations does not contain ``var_name``
    MetaDataError
        if ``pref_attr`` is missing in one of the stations or if 
        ``fill_missing_nan`` is True and ts_type cannot be accessed
    ValueError
        if variable data is not a pandas Series and cannot be converted, or 
        if only some of the stations contain profile (3D) data
    IndexError
        if ``stats`` is empty
    """
    if isinstance(var_name, list):
        if len(var_name) > 1:
            raise NotImplementedError(
                'Merging of multivar data not yet possible')
        var_name = var_name[0]

    # make sure the data is provided as pandas.Series object
    is_3d, has_errs = False, False
    for idx, stat in enumerate(stats):
        if not var_name in stat:
            raise DataCoverageError(
                'All input station must contain {} data'.format(var_name))
        elif pref_attr is not None and not pref_attr in stat:
            raise MetaDataError(
                'Cannot sort station relevance by attribute {}. '
                'At least one of the input stations does not '
                'contain this attribute'.format(pref_attr))
        elif not isinstance(stat[var_name], pd.Series):
            try:
                stat._to_ts_helper(var_name)
            except Exception as e:
                raise ValueError(
                    'Data needs to be provided as pandas Series in '
                    'individual station data objects. Attempted to '
                    'convert but failed with the following '
                    'exception: {}'.format(repr(e)))
        elif fill_missing_nan:
            try:
                stat.get_var_ts_type(var_name)
            except MetaDataError:
                raise MetaDataError(
                    'Cannot merge StationData objects: one or '
                    'more of the provided objects does not '
                    'provide information about the ts_type of '
                    'the {} data, which is required when input '
                    'arg. fill_missing_nan is True.'.format(var_name))
        # all stations need to agree on whether the data is 3D, regardless
        # of the order in which they are provided
        stat_is_3d = bool(stat.check_if_3d(var_name))
        if idx > 0 and stat_is_3d != is_3d:
            raise ValueError('Merge error: some of the input stations contain '
                             'altitude info (suggesting profile data), others '
                             'not.')
        is_3d = stat_is_3d
        if var_name in stat.data_err:
            has_errs = True

    if not is_3d:
        # rank the stations on a copy, so that the list provided by the
        # caller remains untouched
        if pref_attr is not None:
            ordered = sorted(stats, key=lambda s: s[pref_attr])
        else:
            ordered = sorted(stats, key=lambda s: len(s[var_name].dropna()))

        if sort_by_largest:
            # reverse after sorting (rather than sorted(..., reverse=True))
            # to retain the original ordering among stations of equal rank
            ordered.reverse()

        # highest ranked station is the merge target
        merged = ordered[0]

        for stat in ordered[1:]:
            merged.merge_other(stat, var_name, **add_meta_keys)
    else:
        from xarray import DataArray
        dtime = []
        for stat in stats:
            _t = stat[var_name].index.unique()
            if not len(_t) == 1:
                raise NotImplementedError('So far, merging of profile data '
                                          'requires that profile values are '
                                          'sampled at the same time')
            dtime.append(_t[0])
        tidx = pd.DatetimeIndex(dtime)

        # AeroCom default vertical grid
        vert_grid = const.make_default_vert_grid()
        _data = np.ones((len(vert_grid), len(tidx))) * np.nan
        if has_errs:
            _data_err = np.ones((len(vert_grid), len(tidx))) * np.nan

        for i, stat in enumerate(stats):
            if i == 0:
                merged = stat
            else:
                merged.merge_meta_same_station(stat, **add_meta_keys)

            # interpolate profile onto the common vertical grid (one column
            # per input station)
            _data[:, i] = np.interp(vert_grid, stat['altitude'],
                                    stat[var_name].values)

            # stations without error data keep a NaN column
            if has_errs and var_name in stat.data_err:
                try:
                    _data_err[:, i] = np.interp(vert_grid, stat['altitude'],
                                                stat.data_err[var_name])
                except Exception as e:
                    # best effort: column remains NaN
                    const.print_log.warning('Could not interpolate error '
                                            'data of variable {}. '
                                            'Reason: {}'.format(var_name,
                                                                repr(e)))
        # NOTE(review): _data_err is computed above but not attached to the
        # merged station object
        _coords = {'time': tidx, 'altitude': vert_grid}

        d = DataArray(data=_data,
                      coords=_coords,
                      dims=['altitude', 'time'],
                      name=var_name)
        d = d.sortby('time')
        merged[var_name] = d
        merged.dtime = d.time
        merged.altitude = d.altitude

    if fill_missing_nan:
        try:
            merged.insert_nans_timeseries(var_name)
        except Exception as e:
            const.print_log.warning('Could not insert NaNs into timeseries of '
                                    'variable {} after merging stations. '
                                    'Reason: {}'.format(var_name, repr(e)))

    merged['stat_merge_pref_attr'] = pref_attr
    return merged