def _read_gridded(self, reader, var_name, is_model=True, **kwargs):
    try:
        start = kwargs.pop('start')
    except KeyError:
        start = self.start
    try:
        stop = kwargs.pop('stop')
    except KeyError:
        stop = self.stop
    if is_model:
        vert_which = self.obs_vert_type
        ts_type_read = self.model_ts_type_read
        if self.model_use_climatology:
            start = 9999
            stop = None
    else:
        vert_which = None
        ts_type_read = self.obs_ts_type_read
    try:
        # set defaults if input was not specified explicitly
        if ts_type_read is None and not self.flex_ts_type_gridded:
            ts_type_read = self.ts_type
        if 'vert_which' not in kwargs:
            kwargs['vert_which'] = vert_which
        if 'ts_type' not in kwargs:
            kwargs['ts_type'] = ts_type_read
        if isinstance(kwargs['ts_type'], dict):
            kwargs['ts_type'] = kwargs['ts_type'][var_name]
        return reader.read_var(var_name,
                               start=start,
                               stop=stop,
                               flex_ts_type=self.flex_ts_type_gridded,
                               **kwargs)
    except DataCoverageError:
        vt = None
        if is_model:
            if self.obs_vert_type in self.OBS_VERT_TYPES_ALT:
                vt = self.OBS_VERT_TYPES_ALT[self.obs_vert_type]
            elif self.model_vert_type_alt is not None:
                mva = self.model_vert_type_alt
                if isinstance(mva, str):
                    vt = mva
                elif isinstance(mva, dict) and var_name in mva:
                    vt = mva[var_name]
        if vt is None:
            raise DataCoverageError('No data files available for dataset '
                                    '{} ({})'.format(reader.data_id,
                                                     var_name))
        return reader.read_var(var_name,
                               start=start,
                               stop=stop,
                               ts_type=ts_type_read,
                               flex_ts_type=self.flex_ts_type_gridded,
                               vert_which=vt)
def read_ungridded(self, vars_to_read=None):
    """Helper to read UngriddedData

    Note
    ----
    Currently not used in main processing method
    :func:`_run_gridded_ungridded`. But should be.

    Parameters
    ----------
    vars_to_read : str or list, optional
        variables that should be read from obs-network (:attr:`obs_id`)

    Returns
    -------
    UngriddedData
        loaded data object
    """
    if isinstance(vars_to_read, str):
        vars_to_read = [vars_to_read]

    obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)
    if vars_to_read is None:
        vars_to_read = self.obs_vars
    obs_vars = []
    for var in vars_to_read:
        if var in obs_reader.get_reader(self.obs_id).PROVIDES_VARIABLES:
            obs_vars.append(var)
        else:
            const.print_log.warning('Variable {} is not supported by {} '
                                    'and will be skipped'
                                    .format(var, self.obs_id))
    if len(obs_vars) == 0:
        raise DataCoverageError('No observation variable matches found '
                                'for {}'.format(self.obs_id))
    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}
    obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                               vars_to_retrieve=obs_vars,
                               **ropts)
    if 'obs_filters' in self:
        remaining_filters = self._eval_obs_filters()
        obs_data = obs_data.apply_filters(**remaining_filters)
    if self.remove_outliers:
        #self._update_var_outlier_ranges(obs_vars=obs_vars)
        for var in obs_vars:
            low, high = None, None
            try:
                low, high = self.var_ref_outlier_ranges[var]
            except Exception:
                pass
            obs_data.remove_outliers(var, inplace=True, low=low, high=high)
    return obs_data
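# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): how
# read_ungridded() might be called on a configured colocation object. The
# instance name `coloc` and the obs network ID are assumptions for
# demonstration only.
#
#   coloc.obs_id = 'AeronetSunV3Lev2.daily'   # hypothetical obs network ID
#   coloc.obs_vars = ['od550aer']
#   obs_data = coloc.read_ungridded('od550aer')   # returns UngriddedData
# ---------------------------------------------------------------------------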
def read_model_data(self, var_name, **kwargs):
    """Read model variable data based on colocation setup

    Parameters
    ----------
    var_name : str
        variable to be read

    Returns
    -------
    GriddedData
        variable data
    """
    use_input_var = False
    if 'use_input_var' in kwargs:
        use_input_var = kwargs.pop('use_input_var')

    reader = self.instantiate_gridded_reader(what='model')
    if use_input_var:
        var = var_name
    else:
        try:
            var_matches = self._find_var_matches(var_name, reader)
        except DataCoverageError:
            raise DataCoverageError('No match could be found in {} for '
                                    'variable {}'.format(self.model_id,
                                                         var_name))
        var = list(var_matches.keys())[0]
    return self._read_gridded(reader, var, is_model=True, **kwargs)
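# ---------------------------------------------------------------------------
# Usage sketch (illustrative, names are assumptions): reading a model
# variable with the helper above, either via the obs/model variable mapping
# or directly with the input name.
#
#   coloc.model_id = 'ECMWF_CAMS_REAN'   # hypothetical AeroCom model ID
#   mdata = coloc.read_model_data('od550aer')  # mapped via _find_var_matches
#   mdata = coloc.read_model_data('od550aer',
#                                 use_input_var=True)  # skip the mapping
# ---------------------------------------------------------------------------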
def _find_var_matches(self, obs_vars, model_reader, var_name=None):
    """Find variable matches in model data for input obs variables"""
    var_matches = {}

    muv, mav = {}, {}
    if isinstance(self.model_use_vars, dict):
        muv = self.model_use_vars
    if isinstance(self.model_add_vars, dict):
        mav = self.model_add_vars

    for obs_var in obs_vars:
        if obs_var in muv:
            model_var = muv[obs_var]
        else:
            model_var = obs_var

        self._check_add_model_read_aux(model_var, model_reader)
        if model_reader.has_var(model_var):
            var_matches[model_var] = obs_var
        if obs_var in mav:  # observation variable
            model_add_var = mav[obs_var]
            self._check_add_model_read_aux(model_add_var, model_reader)
            if model_reader.has_var(model_add_var):
                var_matches[model_add_var] = obs_var

    for obs_var, obs_var_altname in self.obs_vars_rename.items():
        if obs_var_altname in var_matches:
            raise AttributeError('{} match was already found for obs '
                                 'var to be renamed {}...'
                                 .format(obs_var_altname, obs_var))
        if model_reader.has_var(obs_var_altname):
            var_matches[obs_var_altname] = obs_var

    if var_name is not None:
        if isinstance(var_name, str):
            var_name = [var_name]
        if not isinstance(var_name, list):
            raise ValueError('Invalid input for var_name. Need str or '
                             'list, got {}'.format(var_name))
        _var_matches = {}
        for mvar, ovar in var_matches.items():
            if mvar in var_name or ovar in var_name:
                _var_matches[mvar] = ovar
        var_matches = _var_matches

    if len(var_matches) == 0:
        raise DataCoverageError('No variable matches between '
                                '{} and {} for input vars: {}'
                                .format(self.model_id, self.obs_id,
                                        self.obs_vars))
    return var_matches
def _read_gridded(self, reader, var_name, start, stop, is_model=True):
    if is_model:
        vert_which = self.obs_vert_type
        if all(x == '' for x in reader.file_info.vert_code.values):
            print_log.info('Deactivating model file search by vertical '
                           'code for {}, since filenames do not include '
                           'information about vertical code (probably '
                           'AeroCom 2 convention)'.format(reader.data_id))
            vert_which = None
        ts_type_read = self.model_ts_type_read
        if self.model_use_climatology:
            start = 9999
            stop = None
    else:
        vert_which = None
        ts_type_read = self.obs_ts_type_read

    msg = ('No data files available for dataset {} ({})'
           .format(reader.data_id, var_name))
    try:
        return reader.read_var(var_name,
                               start=start,
                               stop=stop,
                               ts_type=ts_type_read,
                               flex_ts_type=self.flex_ts_type_gridded,
                               vert_which=vert_which)
    except DataCoverageError:
        vt = None
        if is_model:
            if self.obs_vert_type in self.OBS_VERT_TYPES_ALT:
                vt = self.OBS_VERT_TYPES_ALT[self.obs_vert_type]
            elif self.model_vert_type_alt is not None:
                mva = self.model_vert_type_alt
                if isinstance(mva, str):
                    vt = mva
                elif isinstance(mva, dict) and var_name in mva:
                    vt = mva[var_name]
        if vt is None:
            raise DataCoverageError(msg)
        return reader.read_var(var_name,
                               start=start,
                               stop=stop,
                               ts_type=ts_type_read,
                               flex_ts_type=self.flex_ts_type_gridded,
                               vert_which=vt)
def compute_trends_station(station, var_name, start_year=None,
                           stop_year=None, season=None,
                           slope_confidence=0.68, **alt_range):
    # load additional information about data source (if applicable)
    if 'trends' not in station:
        station['trends'] = od()
    tr = station['trends']
    if var_name not in tr:
        station['trends'][var_name] = trv = TrendsEngine(var_name)
    else:
        trv = station['trends'][var_name]

    freq = station.get_var_ts_type(var_name)
    ts_types = const.GRID_IO.TS_TYPES

    if not trv.has_daily:
        if freq not in ts_types or (ts_types.index(freq) <=
                                    ts_types.index('daily')):
            trv['daily'] = station.to_timeseries(var_name, freq='daily',
                                                 **alt_range)
    # monthly is mandatory; raise only if input is coarser than monthly
    if not trv.has_monthly:
        if freq in ts_types and (ts_types.index(freq) >
                                 ts_types.index('monthly')):
            raise TemporalResolutionError('Need monthly or higher')
        ms = station.to_timeseries(var_name, freq='monthly', **alt_range)
        trv['monthly'] = ms
    else:
        ms = trv['monthly']

    if len(ms) == 0 or all(np.isnan(ms)):
        raise DataCoverageError('Failed to retrieve monthly timeseries '
                                'for {} ({})'.format(station.station_name,
                                                     var_name))
    if trv._mobs is None:
        trv._mobs = _make_mobs_dataframe(ms)

    result = trv.compute_trend(start_year, stop_year, season,
                               slope_confidence)
    trv.meta.update(station.get_meta(add_none_vals=True))
    if var_name in station.var_info:
        trv.meta.update(station.var_info[var_name])
    return result
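# ---------------------------------------------------------------------------
# Usage sketch (illustrative, names are assumptions): computing a trend for
# one station. `stat` is assumed to be a StationData object containing
# monthly (or finer) 'od550aer' data.
#
#   result = compute_trends_station(stat, 'od550aer',
#                                   start_year=2000, stop_year=2019,
#                                   season='all', slope_confidence=0.68)
#   # the fitted TrendsEngine instance is cached on the station:
#   trv = stat['trends']['od550aer']
# ---------------------------------------------------------------------------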
def _find_var_matches(self, obs_vars, model_reader, var_name=None):
    """Find variable matches in model data for input obs variables"""
    if isinstance(obs_vars, str):
        obs_vars = [obs_vars]

    # dictionary that maps model variables (keys) to observation
    # variables (values)
    var_matches = {}

    muv = self.model_use_vars if isinstance(self.model_use_vars, dict) else {}

    for obs_var in obs_vars:
        if obs_var in muv:
            model_var = muv[obs_var]
        else:
            model_var = obs_var
        try:
            self._check_add_model_read_aux(model_var, model_reader)
            if model_reader.has_var(model_var):
                var_matches[model_var] = obs_var
            var_matches = self._check_model_add_var(obs_var, model_reader,
                                                    var_matches)
        except VariableDefinitionError:
            continue

    if var_name is not None:
        _var_matches = {}
        for mvar, ovar in var_matches.items():
            if mvar in var_name or ovar in var_name:
                _var_matches[mvar] = ovar
        var_matches = _var_matches

    if len(var_matches) == 0:
        raise DataCoverageError('No variable matches between '
                                '{} and {} for input vars: {}'
                                .format(self.model_id, self.obs_id,
                                        obs_vars))
    return var_matches
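# ---------------------------------------------------------------------------
# The mapping returned by _find_var_matches has model variables as keys and
# observation variables as values, e.g. (illustrative values only):
#
#   {'od550csaer': 'od550aer',   # obs var redirected via model_use_vars
#    'od550aer':   'od550aer'}   # additional match via model_add_vars
# ---------------------------------------------------------------------------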
def get_file_names(self, request):
    """Get all files that match the request specifications

    Parameters
    ----------
    request : :obj:`EbasSQLRequest` or :obj:`str`
        request specifications

    Returns
    -------
    list
        list of file paths that match the request
    """
    names = [f[0] for f in self.execute_request(request)]
    if len(names) == 0:
        raise DataCoverageError('No files could be found for request {}'
                                .format(request))
    return names
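# ---------------------------------------------------------------------------
# Usage sketch (illustrative): querying the EBAS file index with an
# EbasSQLRequest. The request parameters and the instance name `db` are
# assumptions for demonstration, not confirmed by this module.
#
#   req = EbasSQLRequest(variables=['aerosol_light_scattering_coefficient'],
#                        start_date='2010-01-01', stop_date='2011-01-01')
#   files = db.get_file_names(req)   # `db`: instance of the class above
# ---------------------------------------------------------------------------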
def _run_gridded_gridded(self, var_name=None):
    start, stop = start_stop(self.start, self.stop)
    model_reader = ReadGridded(self.model_id)
    obs_reader = ReadGridded(self.obs_id)

    if 'obs_filters' in self:
        remaining_filters = self._eval_obs_filters()
        if bool(remaining_filters):
            raise NotImplementedError('Cannot apply filters {} to gridded '
                                      'observation data.'
                                      .format(remaining_filters))

    obs_vars = self.obs_vars
    obs_vars_avail = obs_reader.vars_provided
    for obs_var in obs_vars:
        if obs_var not in obs_vars_avail:
            raise DataCoverageError('Variable {} is not supported by {}'
                                    .format(obs_var, self.obs_id))

    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)
    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)

    all_ts_types = const.GRID_IO.TS_TYPES
    ts_type = self.ts_type

    data_objs = {}
    for model_var, obs_var in var_matches.items():
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start,
                                            stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue

        if model_data.ts_type not in all_ts_types:
            raise TemporalResolutionError('Invalid temporal resolution {} '
                                          'in model {}'
                                          .format(model_data.ts_type,
                                                  self.model_id))
        try:
            obs_data = self._read_gridded(reader=obs_reader,
                                          var_name=obs_var,
                                          start=start,
                                          stop=stop,
                                          is_model=False)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.obs_id, obs_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue

        if obs_data.ts_type not in all_ts_types:
            raise TemporalResolutionError('Invalid temporal resolution {} '
                                          'in obs {}'
                                          .format(obs_data.ts_type,
                                                  self.obs_id))

        # update colocation ts_type, based on the available resolution in
        # model and obs
        lowest = self.get_lowest_resolution(ts_type, model_data.ts_type,
                                            obs_data.ts_type)
        if lowest != ts_type:
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in {} / {} combination)'
                           .format(ts_type, lowest, self.model_id,
                                   self.obs_id))
            ts_type = lowest

        if self.save_coldata:
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            savename = self._coldata_savename(model_data, start, stop,
                                              ts_type, var_name=model_var)
            file_exists = self._check_coldata_exists(self.model_id,
                                                     savename)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                    print_log.info('Skip {} (file already exists)'
                                   .format(savename))
                    continue
                else:
                    os.remove(os.path.join(out_dir, savename))
        try:
            by = None
            if self.model_use_climatology:
                by = to_pandas_timestamp(start).year
            coldata = colocate_gridded_gridded(
                gridded_data=model_data,
                gridded_data_ref=obs_data,
                ts_type=ts_type,
                start=start, stop=stop,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                apply_time_resampling_constraints=
                    self.apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers)

            if self.save_coldata:
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
                #coldata.to_netcdf(out_dir, savename=savename)
                if self._log:
                    self._write_log('WRITE: {}\n'.format(savename))
                print_log.info('Writing file {}'.format(savename))
            data_objs[model_var] = coldata
        except Exception as e:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed. Reason: {}'.format(self.model_id, model_var,
                                               self.obs_id, obs_var,
                                               repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg)
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
def _run_gridded_ungridded(self, var_name=None):
    """Analysis method for gridded vs. ungridded data"""
    model_reader = ReadGridded(self.model_id)
    obs_reader = ReadUngridded(self.obs_id)

    obs_vars_supported = obs_reader.get_reader(
        self.obs_id).PROVIDES_VARIABLES
    obs_vars = list(np.intersect1d(self.obs_vars, obs_vars_supported))
    if len(obs_vars) == 0:
        raise DataCoverageError('No observation variable matches found '
                                'for {}'.format(self.obs_id))

    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}

    obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                               vars_to_retrieve=obs_vars,
                               **ropts)
    if 'obs_filters' in self:
        remaining_filters = self._eval_obs_filters()
        obs_data = obs_data.apply_filters(**remaining_filters)

    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)

    data_objs = {}
    for model_var, obs_var in var_matches.items():
        ts_type = self.ts_type
        start, stop = start_stop(self.start, self.stop)
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start,
                                            stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue

        ts_type_src = model_data.ts_type

        ignore_stats = None
        if self.ignore_station_names is not None:
            ignore_stats = self.ignore_station_names
            if isinstance(ignore_stats, dict):
                if obs_var in ignore_stats:
                    ignore_stats = ignore_stats[obs_var]
                else:
                    ignore_stats = None

        if TsType(ts_type_src) < TsType(ts_type):
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in model {})'
                           .format(ts_type, ts_type_src, self.model_id))
            ts_type = ts_type_src

        if self.save_coldata:
            savename = self._coldata_savename(model_data, start, stop,
                                              ts_type, var_name=model_var)
            file_exists = self._check_coldata_exists(model_data.data_id,
                                                     savename)
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                    print_log.info('Skip {} (file already exists)'
                                   .format(savename))
                    self.file_status[savename] = 'skipped'
                    continue
                else:
                    print_log.info('Deleting and recomputing existing '
                                   'colocated data file {}'
                                   .format(savename))
                    print_log.info('REMOVE: {}\n'.format(savename))
                    os.remove(os.path.join(out_dir, savename))
        try:
            by = None
            if self.model_use_climatology:
                by = start.year
            coldata = colocate_gridded_ungridded(
                gridded_data=model_data,
                ungridded_data=obs_data,
                ts_type=ts_type,
                start=start, stop=stop,
                var_ref=obs_var,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                ignore_station_names=ignore_stats,
                apply_time_resampling_constraints=
                    self.apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers)

            if self.save_coldata:
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
            data_objs[model_var] = coldata
        except Exception as e:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed. Reason: {}'.format(self.model_id, model_var,
                                               self.obs_id, obs_var,
                                               repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
def _run_gridded_gridded(self):
    start, stop = self.start, self.stop
    model_reader = ReadGridded(self.model_id, start, stop)
    obs_reader = ReadGridded(self.obs_id, start, stop)

    vars_to_analyse = self.vars_to_analyse
    if vars_to_analyse is None:
        vars_to_analyse = model_reader.vars_provided

    var_matches = {}
    for var in vars_to_analyse:
        if var in model_reader.vars_provided:  # candidate
            # first check if the variable pair was defined explicitly
            if var in self.alt_vars:
                if self.alt_vars[var] in obs_reader.vars_provided:
                    var_matches[var] = self.alt_vars[var]
            else:
                if var in obs_reader.vars_provided:
                    var_matches[var] = var
    if len(var_matches) == 0:
        raise DataCoverageError('No variable matches between {} and {} '
                                'for input vars: {}'
                                .format(self.model_id, self.obs_id,
                                        self.vars_to_analyse))

    all_ts_types = const.GRID_IO.TS_TYPES

    ts_types_ana = self.ts_types_ana
    if ts_types_ana is None:
        ts_types_ana = self._setup.TS_TYPES_ANA_DEFAULT['gridded']

    ts_types_read = self.ts_types_read
    if ts_types_read is None:
        ts_types_read = model_reader.ts_types

    vars_model = list(var_matches.keys())
    vars_obs = list(var_matches.values())

    flex_obs = self._setup.options.TS_TYPE_OBS_FLEX
    for ts_type_read in ts_types_read:
        # reads only one year if a start time is provided but no stop time
        model_data_vars = model_reader.read(vars_model,
                                            start=start,
                                            stop=stop,
                                            ts_type=ts_type_read,
                                            flex_ts_type=False)
        if len(model_data_vars) == 0:
            if self._log:
                self._log.write('No model data available ({}-{}, {})\n'
                                .format(start, stop, ts_type_read))
            continue

        obs_data_vars = obs_reader.read(vars_obs,
                                        start=start,
                                        stop=stop,
                                        ts_type=ts_type_read,
                                        flex_ts_type=flex_obs)
        if len(obs_data_vars) == 0:
            if self._log:
                self._log.write('No obs data available for variables {} '
                                '({}-{}, {})\n'.format(vars_obs, start,
                                                       stop, ts_type_read))
            continue

        for model_data in model_data_vars:
            var = model_data.var_name
            obs_data = None
            for _obs in obs_data_vars:
                if _obs.var_name == var_matches[var]:
                    obs_data = _obs
                    break
            if obs_data is None:
                if self._log:
                    self._log.write('No obs data available for model var '
                                    '{} ({}-{}, {})\n'
                                    .format(var, start, stop,
                                            ts_type_read))
                continue
            for ts_type_ana in ts_types_ana:
                # model resolution (ts_type) must be equal to or higher
                # than the current analysis setting
                if (all_ts_types.index(ts_type_ana) >=
                        all_ts_types.index(ts_type_read)):
                    out_dir = chk_make_subdir(self.output_dir('colocate'),
                                              self.model_id)
                    savename = self._coldata_save_name(model_data,
                                                       ts_type_ana,
                                                       start, stop)
                    file_exists = self._check_coldata_exists(self.model_id,
                                                             savename)
                    if file_exists:
                        if not self.options.REANALYSE_EXISTING:
                            if self._log:
                                self._log.write('SKIP: {}\n'
                                                .format(savename))
                            print_log.info('Skip {} (file already exists)'
                                           .format(savename))
                            continue
                        else:
                            os.remove(os.path.join(out_dir, savename))
                    data_coll = colocate_gridded_gridded(
                        model_data, obs_data,
                        ts_type=ts_type_ana,
                        start=start, stop=stop,
                        filter_name=self.filter_name)
                    self._last_coldata = data_coll
                    if data_coll.save_name_aerocom + '.nc' != savename:
                        raise Exception('Computed save name {}.nc does not '
                                        'match expected file name {}'
                                        .format(data_coll.save_name_aerocom,
                                                savename))
                    data_coll.to_netcdf(out_dir)
                    if self._log:
                        self._log.write('WRITE: {}\n'.format(savename))
                    print_log.info('Writing {}'.format(savename))
def _run_gridded_ungridded(self):
    """Analysis method for gridded vs. ungridded data"""
    start, stop = self.start, self.stop
    model_reader = ReadGridded(self.model_id, start, stop)

    obs_reader = ReadUngridded(self.obs_id)
    obs_vars = obs_reader.get_reader(self.obs_id).PROVIDES_VARIABLES

    vars_to_analyse = self.vars_to_analyse
    if vars_to_analyse is None:
        vars_to_analyse = model_reader.vars_provided

    var_matches = {}
    for var in vars_to_analyse:
        if var in model_reader.vars_provided:  # candidate
            if var in self.alt_vars:
                if self.alt_vars[var] in obs_vars:
                    var_matches[var] = self.alt_vars[var]
            else:
                if var in obs_vars:
                    var_matches[var] = var

    if len(var_matches) == 0:
        raise DataCoverageError('No variable matches between '
                                '{} and {} for input vars: {}'
                                .format(self.model_id, self.obs_id,
                                        self.vars_to_analyse))

    all_ts_types = const.GRID_IO.TS_TYPES

    ts_types_ana = self.ts_types_ana
    if ts_types_ana is None:
        ts_types_ana = self._setup.TS_TYPES_ANA_DEFAULT['ungridded']

    ts_types_read = self.ts_types_read
    if ts_types_read is None:
        ts_types_read = model_reader.ts_types

    vars_model = list(var_matches.keys())
    vars_obs = list(var_matches.values())

    obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                               vars_to_retrieve=vars_obs)

    for ts_type_read in ts_types_read:
        model_data_vars = model_reader.read(vars_model,
                                            start=start,
                                            stop=stop,
                                            ts_type=ts_type_read,
                                            flex_ts_type=False)
        if len(model_data_vars) == 0:
            if self._log:
                self._log.write('No model data available ({}-{}, {})\n'
                                .format(start, stop, ts_type_read))
            continue

        for model_data in model_data_vars:
            var = model_data.var_info.var_name
            obs_var = var_matches[var]
            if obs_var not in obs_reader.data:
                if self._log:
                    self._log.write('No obs data available for variable '
                                    '{} ({}-{}, {})\n'
                                    .format(obs_var, start, stop,
                                            ts_type_read))
                continue
            for ts_type_ana in ts_types_ana:
                if (all_ts_types.index(ts_type_ana) >=
                        all_ts_types.index(ts_type_read)):
                    out_dir = chk_make_subdir(self.output_dir('colocate'),
                                              self.model_id)
                    savename = self._coldata_save_name(model_data,
                                                       ts_type_ana,
                                                       start, stop)
                    file_exists = self._check_coldata_exists(self.model_id,
                                                             savename)
                    if file_exists:
                        if not self.options.REANALYSE_EXISTING:
                            if self._log:
                                self._log.write('SKIP: {}\n'
                                                .format(savename))
                            print_log.info('Skip {} (file already exists)'
                                           .format(savename))
                            continue
                        else:
                            os.remove(os.path.join(out_dir, savename))
                    data_coll = colocate_gridded_ungridded_2D(
                        model_data, obs_data,
                        ts_type=ts_type_ana,
                        start=start, stop=stop,
                        var_ref=obs_var,
                        filter_name=self.filter_name)
                    self._last_coldata = data_coll
                    data_coll.to_netcdf(out_dir)
                    if self._log:
                        self._log.write('WRITE: {}\n'.format(savename))
                    print_log.info('Writing {}'.format(savename))
    plt.close('all')
def merge_station_data(stats, var_name, pref_attr=None,
                       sort_by_largest=True, fill_missing_nan=True,
                       **add_meta_keys):
    """Merge multiple StationData objects (from one station) into one instance

    Note
    ----
    - all input :class:`StationData` objects need to have same attributes\
    ``station_name``, ``latitude``, ``longitude`` and ``altitude``

    Parameters
    ----------
    stats : list
        list containing :class:`StationData` objects (note: all of these
        objects must contain variable data for the specified input variable)
    var_name : str
        data variable name that is to be merged
    pref_attr
        optional argument that may be used to specify a metadata attribute
        that is available in all input :class:`StationData` objects and
        that is used to order the input stations by relevance. The
        associated values of this attribute need to be sortable (e.g.
        revision_date). This is only relevant in case overlaps occur. If
        unspecified, the relevance of the stations is sorted based on the
        length of the associated data arrays.
    sort_by_largest : bool
        if True, the result from the sorting is inverted. E.g. if
        ``pref_attr`` is unspecified, then the stations will be sorted
        based on the length of the data vectors, starting with the
        shortest, ending with the longest. This sorting result will then
        be inverted, if ``sort_by_largest=True``, so that the longest time
        series gets the highest importance. If, e.g.
        ``pref_attr='revision_date'``, then the stations are sorted by the
        associated revision date value, starting with the earliest, ending
        with the latest (which will also be inverted if this argument is
        set to True).
    fill_missing_nan : bool
        if True, the resulting time series is filled with NaNs. NOTE: this
        requires that information about the temporal resolution (ts_type)
        of the data is available in each of the StationData objects.
    """
    # make sure the data is provided as pandas.Series object
    for stat in stats:
        if var_name not in stat:
            raise DataCoverageError('All input stations must contain {} '
                                    'data'.format(var_name))
        elif not isinstance(stat[var_name], pd.Series):
            try:
                stat._to_ts_helper(var_name)
            except Exception as e:
                raise ValueError('Data needs to be provided as pandas '
                                 'Series in individual station data '
                                 'objects. Attempted to convert but '
                                 'failed with the following exception: '
                                 '{}'.format(repr(e)))
        elif fill_missing_nan:
            try:
                stat.get_var_ts_type(var_name)
            except MetaDataError:
                raise MetaDataError('Cannot merge StationData objects: '
                                    'one or more of the provided objects '
                                    'does not provide information about '
                                    'the ts_type of the {} data, which is '
                                    'required when input arg. '
                                    'fill_missing_nan is True.'
                                    .format(var_name))

    if pref_attr is not None:
        stats.sort(key=lambda s: s[pref_attr])
    else:
        stats.sort(key=lambda s: len(s[var_name].dropna()))

    if sort_by_largest:
        stats = stats[::-1]

    # remove first station from the list
    first = stats.pop(0)

    for i, stat in enumerate(stats):
        first.merge_other(stat, var_name, **add_meta_keys)
        #first.merge_vardata(stat, var_name)
    if fill_missing_nan:
        first.insert_nans(var_name)
    return first
def _run_gridded_ungridded(self, var_name=None):
    """Analysis method for gridded vs. ungridded data"""
    print_log.info('PREPARING colocation of {} vs. {}'
                   .format(self.model_id, self.obs_id))

    model_reader = self.instantiate_gridded_reader(what='model')
    obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)

    obs_vars = obs_reader.get_vars_supported(self.obs_id, self.obs_vars)
    if len(obs_vars) == 0:
        raise DataCoverageError('No observation variable matches found '
                                'for {}'.format(self.obs_id))

    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

    print_log.info('The following variable combinations will be '
                   'colocated\nMODEL-VAR\tOBS-VAR')
    for key, val in var_matches.items():
        print_log.info('{}\t{}'.format(key, val))

    # get list of unique observation variables
    obs_vars = np.unique(list(var_matches.values())).tolist()

    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)

    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}

    data_objs = {}

    if self.start is None:
        self._infer_start_stop(model_reader)

    start, stop = start_stop(self.start, self.stop)

    for model_var, obs_var in var_matches.items():
        # ToDo: consider removing outliers already here.
        ts_type = self.ts_type
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start,
                                            stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue

        ts_type_src = model_data.ts_type
        rshow = self._eval_resample_how(model_var, obs_var)

        if ts_type is None:
            # if colocation frequency is not specified
            ts_type = ts_type_src

        ignore_stats = None
        if self.ignore_station_names is not None:
            ignore_stats = self.ignore_station_names
            if isinstance(ignore_stats, dict):
                if obs_var in ignore_stats:
                    ignore_stats = ignore_stats[obs_var]
                else:
                    ignore_stats = None

        if TsType(ts_type_src) < TsType(ts_type):
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in model {})'
                           .format(ts_type, ts_type_src, self.model_id))
            ts_type = ts_type_src

        really_do_reanalysis = True
        if self.save_coldata:
            really_do_reanalysis = False
            savename = self._coldata_savename(model_data, start, stop,
                                              ts_type, var_name=model_var)
            file_exists = self._check_coldata_exists(model_data.data_id,
                                                     savename)
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                    print_log.info('Skip {} (file already exists)'
                                   .format(savename))
                    self.file_status[savename] = 'skipped'
                    continue
                else:
                    really_do_reanalysis = True
                    print_log.info('Deleting and recomputing existing '
                                   'colocated data file {}'
                                   .format(savename))
                    print_log.info('REMOVE: {}\n'.format(savename))
                    os.remove(os.path.join(out_dir, savename))
            else:
                really_do_reanalysis = True

        if really_do_reanalysis:
            # Read obs data only if the colocated data file does not
            # already exist. This part of the method was changed by
            # @hansbrenna to work better with large observational data
            # sets: only one variable is loaded into the UngriddedData
            # object at a time. Currently the variable is re-read many
            # times, which is a weakness.
            obs_data = obs_reader.read(vars_to_retrieve=obs_var,
                                       only_cached=self._obs_cache_only,
                                       **ropts)

            # ToDo: consider removing outliers already here.
            if 'obs_filters' in self:
                remaining_filters = self._eval_obs_filters()
                obs_data = obs_data.apply_filters(**remaining_filters)

        try:
            try:
                by = self.update_baseyear_gridded
                stop = None
            except AttributeError:
                by = None
            if self.model_use_climatology:
                by = start.year
            coldata = colocate_gridded_ungridded(
                gridded_data=model_data,
                ungridded_data=obs_data,
                ts_type=ts_type,
                start=start, stop=stop,
                var_ref=obs_var,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                ignore_station_names=ignore_stats,
                apply_time_resampling_constraints=
                    self.apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers,
                use_climatology_ref=self.obs_use_climatology,
                resample_how=rshow)

            if self.model_to_stp:
                coldata = correct_model_stp_coldata(coldata)
            if self.save_coldata:
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
            data_objs[model_var] = coldata
        except Exception:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed.\nTraceback:\n{}'
                   .format(self.model_id, model_var, self.obs_id,
                           obs_var, traceback.format_exc()))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
def merge_station_data(stats, var_name, pref_attr=None,
                       sort_by_largest=True, fill_missing_nan=True,
                       **add_meta_keys):
    """Merge multiple StationData objects (from one station) into one instance

    Note
    ----
    - all input :class:`StationData` objects need to have same attributes\
    ``station_name``, ``latitude``, ``longitude`` and ``altitude``

    Parameters
    ----------
    stats : list
        list containing :class:`StationData` objects (note: all of these
        objects must contain variable data for the specified input variable)
    var_name : str
        data variable name that is to be merged
    pref_attr
        optional argument that may be used to specify a metadata attribute
        that is available in all input :class:`StationData` objects and
        that is used to order the input stations by relevance. The
        associated values of this attribute need to be sortable (e.g.
        revision_date). This is only relevant in case overlaps occur. If
        unspecified, the relevance of the stations is sorted based on the
        length of the associated data arrays.
    sort_by_largest : bool
        if True, the result from the sorting is inverted. E.g. if
        ``pref_attr`` is unspecified, then the stations will be sorted
        based on the length of the data vectors, starting with the
        shortest, ending with the longest. This sorting result will then
        be inverted, if ``sort_by_largest=True``, so that the longest time
        series gets the highest importance. If, e.g.
        ``pref_attr='revision_date'``, then the stations are sorted by the
        associated revision date value, starting with the earliest, ending
        with the latest (which will also be inverted if this argument is
        set to True).
    fill_missing_nan : bool
        if True, the resulting time series is filled with NaNs. NOTE: this
        requires that information about the temporal resolution (ts_type)
        of the data is available in each of the StationData objects.
    """
    if isinstance(var_name, list):
        if len(var_name) > 1:
            raise NotImplementedError('Merging of multivar data not yet '
                                      'possible')
        var_name = var_name[0]

    # make sure the data is provided as pandas.Series object
    is_3d, has_errs = False, False
    for stat in stats:
        if var_name not in stat:
            raise DataCoverageError('All input stations must contain {} '
                                    'data'.format(var_name))
        elif pref_attr is not None and pref_attr not in stat:
            raise MetaDataError('Cannot sort station relevance by '
                                'attribute {}. At least one of the input '
                                'stations does not contain this attribute'
                                .format(pref_attr))
        elif not isinstance(stat[var_name], pd.Series):
            try:
                stat._to_ts_helper(var_name)
            except Exception as e:
                raise ValueError('Data needs to be provided as pandas '
                                 'Series in individual station data '
                                 'objects. Attempted to convert but '
                                 'failed with the following exception: '
                                 '{}'.format(repr(e)))
        elif fill_missing_nan:
            try:
                stat.get_var_ts_type(var_name)
            except MetaDataError:
                raise MetaDataError('Cannot merge StationData objects: '
                                    'one or more of the provided objects '
                                    'does not provide information about '
                                    'the ts_type of the {} data, which is '
                                    'required when input arg. '
                                    'fill_missing_nan is True.'
                                    .format(var_name))

        if stat.check_if_3d(var_name):
            is_3d = True
        elif is_3d:
            raise ValueError('Merge error: some of the input stations '
                             'contain altitude info (suggesting profile '
                             'data), others not.')
        if var_name in stat.data_err:
            has_errs = True

    if not is_3d:
        if pref_attr is not None:
            stats.sort(key=lambda s: s[pref_attr])
        else:
            stats.sort(key=lambda s: len(s[var_name].dropna()))

        if sort_by_largest:
            stats = stats[::-1]

        # remove first station from the list
        merged = stats.pop(0)

        for i, stat in enumerate(stats):
            merged.merge_other(stat, var_name, **add_meta_keys)
    else:
        from xarray import DataArray

        dtime = []
        for stat in stats:
            _t = stat[var_name].index.unique()
            if not len(_t) == 1:
                raise NotImplementedError('So far, merging of profile '
                                          'data requires that profile '
                                          'values are sampled at the same '
                                          'time')
            dtime.append(_t[0])
        tidx = pd.DatetimeIndex(dtime)

        # AeroCom default vertical grid
        vert_grid = const.make_default_vert_grid()
        _data = np.ones((len(vert_grid), len(tidx))) * np.nan
        if has_errs:
            _data_err = np.ones((len(vert_grid), len(tidx))) * np.nan

        for i, stat in enumerate(stats):
            if i == 0:
                merged = stat
            else:
                merged.merge_meta_same_station(stat, **add_meta_keys)
            _data[:, i] = np.interp(vert_grid, stat['altitude'],
                                    stat[var_name].values)
            if has_errs:
                try:
                    _data_err[:, i] = np.interp(vert_grid,
                                                stat['altitude'],
                                                stat.data_err[var_name])
                except Exception:
                    pass

        _coords = {'time': tidx, 'altitude': vert_grid}
        d = DataArray(data=_data, coords=_coords,
                      dims=['altitude', 'time'], name=var_name)
        d = d.sortby('time')
        merged[var_name] = d
        merged.dtime = d.time
        merged.altitude = d.altitude

    if fill_missing_nan:
        try:
            merged.insert_nans_timeseries(var_name)
        except Exception as e:
            const.print_log.warning('Could not insert NaNs into '
                                    'timeseries of variable {} after '
                                    'merging stations. Reason: {}'
                                    .format(var_name, repr(e)))

    merged['stat_merge_pref_attr'] = pref_attr
    return merged
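# ---------------------------------------------------------------------------
# Usage sketch (illustrative): merging several StationData objects from the
# same site into one series, preferring the most recent revision in case of
# overlap. `stats` is a hypothetical list of StationData objects that all
# contain 'od550aer' data.
#
#   merged = merge_station_data(stats, 'od550aer',
#                               pref_attr='revision_date',
#                               sort_by_largest=True,
#                               fill_missing_nan=True)
#   merged['od550aer'].plot()
# ---------------------------------------------------------------------------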