def get_lowest_resolution(ts_type, *ts_types):
    """Return the ts_type code with the lowest resolution among the inputs

    All input codes are converted to :class:`TsType` and compared with
    ``<``; the smallest instance is considered the lowest resolution.
    If several inputs compare equal, the first one is kept.

    Parameters
    ----------
    ts_type : str
        first ts_type code
    *ts_types
        any number of additional ts_type codes

    Returns
    -------
    str
        ``val`` attribute of the :class:`TsType` with the lowest resolution

    Raises
    ------
    Exception
        whatever :class:`TsType` raises for a code it does not accept
    """
    from pyaerocom.tstype import TsType

    all_codes = (ts_type,) + ts_types
    # min() keeps the first item unless a later one compares strictly smaller
    return min(TsType(code) for code in all_codes).val
def _init_data_default_frequencies(coldata, colocation_settings):
    """Prepare colocated data in daily, monthly and yearly resolution

    Parameters
    ----------
    coldata
        colocated data object; must provide ``ts_type`` and ``copy()``
    colocation_settings
        settings forwarded to :func:`_resample_time_coldata`

    Returns
    -------
    tuple
        2-element tuple ``(data_arrs, jsdate)``. Both are dictionaries with
        keys ``daily``, ``monthly`` and ``yearly``. Entries stay None for
        frequencies that are higher than the resolution of the input data.

    Raises
    ------
    TemporalResolutionError
        if the resolution of the input data is lower than monthly
    """
    target_freqs = ('daily', 'monthly', 'yearly')
    arrays = {freq: None for freq in target_freqs}
    js_dates = {freq: None for freq in target_freqs}

    current = TsType(coldata.ts_type)
    if current < TsType('monthly'):
        raise TemporalResolutionError(
            'Temporal resolution ({}) is too low for '
            'web processing, need monthly or higher'.format(current))
    if current > TsType('daily'):
        # input is in higher resolution than daily -> start from daily
        coldata = _resample_time_coldata(coldata, 'daily',
                                         colocation_settings)
        current = TsType('daily')

    for freq in target_freqs:
        target = TsType(freq)
        if current < target:
            # data is in lower resolution than target, leave entry as None
            continue
        if current == target:
            arrays[freq] = coldata.copy()
            js_dates[freq] = _get_jsdate(coldata)
            continue
        resampled = _resample_time_coldata(coldata, freq,
                                           colocation_settings)
        arrays[freq] = resampled
        js_dates[freq] = _get_jsdate(resampled)
    return (arrays, js_dates)
def test_next_lower():
    """Check :attr:`TsType.next_lower`

    Accessing ``next_lower`` on ``yearly`` is expected to raise IndexError.
    The test fails if no exception is raised at all (the earlier
    try/except form passed silently in that case).
    """
    try:
        TsType('yearly').next_lower
    except IndexError:
        pass
    else:
        raise AssertionError(
            'Expected IndexError when accessing next_lower of yearly')
    assert str(TsType('3minutely').next_lower) == 'hourly'
def test_next_higher():
    """Check :attr:`TsType.next_higher`

    Accessing ``next_higher`` on ``minutely`` is expected to raise
    IndexError. The test fails if no exception is raised at all (the
    earlier try/except form passed silently in that case).
    """
    try:
        TsType('minutely').next_higher
    except IndexError:
        pass
    else:
        raise AssertionError(
            'Expected IndexError when accessing next_higher of minutely')
    assert str(TsType('3minutely').next_higher) == 'minutely'
    assert str(TsType('hourly').next_higher) == 'minutely'
    assert str(TsType('monthly').next_higher) == 'weekly'
def test_basic_operators_pandas():
    """Check comparison operators of TsType created from pandas codes

    Uses the codes ``MS``, ``AS`` and ``D``. The assertions require that
    an instance with lower resolution compares as smaller.
    """
    monthly, yearly, daily = (TsType(code) for code in ('MS', 'AS', 'D'))

    # monthly vs. daily
    assert monthly < daily
    assert monthly <= daily
    assert monthly != daily

    # yearly vs. daily
    assert yearly < daily
    assert not (yearly == daily)

    # monthly vs. yearly
    assert monthly > yearly
    assert monthly >= yearly
def test_basic_operators():
    """Check comparison operators of TsType created from ts_type names

    Uses ``monthly``, ``yearly`` and ``daily``. The assertions require that
    an instance with lower resolution compares as smaller.
    """
    monthly, yearly, daily = (
        TsType(name) for name in ('monthly', 'yearly', 'daily'))

    # monthly vs. daily
    assert monthly < daily
    assert monthly <= daily
    assert monthly != daily

    # yearly vs. daily
    assert yearly < daily
    assert not (yearly == daily)

    # monthly vs. yearly
    assert monthly > yearly
    assert monthly >= yearly
def get_tot_number_of_seconds(ts_type, dtime=None):
    """Get the number of seconds covered by one period of a frequency

    Parameters
    ----------
    ts_type : str
        ts_type code (used to create a :class:`TsType` and as key into
        ``TS_TYPE_SECS``)
    dtime : optional
        time information, required if ``TsType(ts_type)`` compares
        ``>= TsType('monthly')``

    Returns
    -------
    value stored in ``TS_TYPE_SECS`` for `ts_type`, or None if `ts_type`
    compares ``>= monthly`` (computing the seconds from `dtime` is not
    implemented in this function)

    Raises
    ------
    AttributeError
        if `ts_type` compares ``>= monthly`` and `dtime` is None
    """
    from pyaerocom.tstype import TsType

    ts_tpe = TsType(ts_type)
    # 'monthly' was misspelled as 'montly' before, which is not a ts_type
    if ts_tpe >= TsType('monthly'):
        if dtime is None:
            raise AttributeError(
                'For frequncies larger than or eq. monthly you'
                ' need to provide dtime in order to compute the number of '
                'second.')
        # NOTE(review): finding the seconds from dtime is not implemented
        # here, callers receive None
        return None
    return TS_TYPE_SECS[ts_type]
def correct_time_coord(cube, ts_type, year):
    """Replace the time coordinate of an iris Cube with a regular one

    A new time dimension coordinate is created that starts on 1 January
    of `year` and advances in steps given by `ts_type`. It is attached
    to the time axis of the cube, replacing any existing coordinate named
    "time".

    Parameters
    ----------
    cube : Cube
        cube containing data
    ts_type : TsType or str
        temporal resolution of data (e.g. "hourly", "daily"). This
        information is e.g. encoded in the filename of a NetCDF file and
        may be accessed using :class:`pyaerocom.io.FileConventionRead`
    year : int
        integer specifying start year, e.g. 2017

    Returns
    -------
    Cube
        the same instance of the input cube with corrected time dimension
        axis

    Raises
    ------
    NetcdfError
        if the index of the time dimension cannot be identified
    """
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)

    # look for an existing dimension coordinate named "time"
    time_axis = None
    coord_lengths = []
    for axis, coord in enumerate(cube.dim_coords):
        coord_lengths.append(len(coord.points))
        if coord.name() == 'time':
            time_axis = axis

    if time_axis is None and cube.ndim != len(cube.dim_coords):
        # one dimension has no dim coord: an axis whose length matches none
        # of the dim coords is a candidate (the last candidate is used)
        for axis, length in enumerate(cube.shape):
            if length not in coord_lengths:
                time_axis = axis

    if time_axis is None:
        raise NetcdfError('Failed to identify data index of time dimension in '
                          'cube {}'.format(repr(cube)))

    base_unit = ts_type.cf_base_unit
    dt64_str = ts_type.datetime64_str
    unit_str = '%s since %s-01-01 00:00:00' % (base_unit, year)
    num_steps = cube.shape[time_axis]
    time_unit = cf_units.Unit(unit_str, calendar=cf_units.CALENDAR_STANDARD)
    td64_str = ts_type.timedelta64_str

    first = datetime64("%s-01-01 00:00:00" % year).astype(dt64_str)
    stamps = first + arange(0, num_steps, 1).astype(td64_str)
    # see this thread https://github.com/matplotlib/matplotlib/issues/2259/
    stamps_dt = stamps.astype("datetime64[s]").astype(datetime)
    numbers = [time_unit.date2num(stamp) for stamp in stamps_dt]

    new_coord = iris.coords.DimCoord(numbers, standard_name='time',
                                     units=time_unit)
    try:
        cube.remove_coord('time')
    except Exception:
        # nothing to remove (or removal failed), add the new one regardless
        pass
    cube.add_dim_coord(new_coord, time_axis)
    return cube
def check_validity(self, file):
    """Check if filename is valid

    Parameters
    ----------
    file : str
        file path, passed to :func:`get_info_from_file`

    Raises
    ------
    FileConventionError
        if the ts_type extracted from the filename is not accepted by
        :func:`TsType.valid`, or if the extracted year is outside the
        range ``const.MIN_YEAR`` to ``const.MAX_YEAR``
    """
    info = self.get_info_from_file(file)
    year = info["year"]
    ts_type = info['ts_type']

    if not TsType.valid(ts_type):
        raise FileConventionError("Invalid ts_type %s in filename %s" %
                                  (ts_type, basename(file)))

    year_ok = const.MIN_YEAR <= year <= const.MAX_YEAR
    if not year_ok:
        raise FileConventionError("Invalid year %d in filename %s" %
                                  (info['year'], basename(file)))
def get_tot_number_of_seconds(ts_type, dtime=None):
    """Get the number of seconds covered by one period of a frequency

    Parameters
    ----------
    ts_type : str
        ts_type code (used to create a :class:`TsType` and as key into
        ``TS_TYPE_SECS``)
    dtime : optional
        object providing ``dt.daysinmonth``; required if
        ``TsType(ts_type)`` compares ``>= TsType('monthly')``

    Returns
    -------
    value stored in ``TS_TYPE_SECS`` for `ts_type` if the frequency does
    not compare ``>= monthly``; else, for ``ts_type == 'monthly'``, the
    days in month from `dtime` multiplied by the seconds per day

    Raises
    ------
    AttributeError
        if `ts_type` compares ``>= monthly`` and `dtime` is None
    """
    from pyaerocom.tstype import TsType

    resolution = TsType(ts_type)
    if not resolution >= TsType('monthly'):
        return TS_TYPE_SECS[ts_type]

    if dtime is None:
        raise AttributeError(
            'For frequncies larger than or eq. monthly you' +
            ' need to provide dtime in order to compute the number of second.')

    # find seconds from dtime
    # TODO generalize this
    days_in_month = dtime.dt.daysinmonth
    if ts_type == 'monthly':
        return days_in_month * 24 * 60 * 60
    # NOTE(review): only 'monthly' is handled on this path; other
    # frequencies get no result -- confirm intended behaviour
    return None
def _check_correct_time_dim(cube, file, file_convention=None):
    """Check the time dimension of a cube and try to correct it if invalid

    The expected temporal resolution and year are extracted from the
    filename using the file convention. If :func:`check_time_coord` fails,
    a warning is logged and, if ``const.GRID_IO.CORRECT_TIME_FILENAME`` is
    set, a correction via :func:`correct_time_coord` is attempted.

    Parameters
    ----------
    cube : Cube
        data cube that is checked
    file : str
        path of the file the cube was loaded from
    file_convention : FileConventionRead, optional
        file convention used to extract ts_type and year from the
        filename. If None, it is inferred from `file`.

    Returns
    -------
    Cube
        input cube, or the result of :func:`correct_time_coord` if a
        correction was applied

    Raises
    ------
    FileConventionError
        if the file convention cannot be determined, or if ts_type or year
        extracted from the filename are invalid
    UnresolvableTimeDefinitionError
        re-raised if :func:`check_time_coord` raises this error
    """
    if file_convention is None:
        try:
            file_convention = FileConventionRead(from_file=file)
        except Exception:
            # handled by the isinstance check below
            pass

    if not isinstance(file_convention, FileConventionRead):
        raise FileConventionError(
            'Unknown file convention: {}'.format(file_convention))

    finfo = file_convention.get_info_from_file(file)

    # keep the raw value so that the error message below can show it; the
    # message previously formatted ts_type, which is unbound if TsType fails
    raw_ts_type = None
    try:
        raw_ts_type = finfo['ts_type']
        ts_type = TsType(raw_ts_type)
    except Exception as exc:
        raise FileConventionError(
            'Invalid ts_type in file: {}'.format(raw_ts_type)) from exc

    year = finfo['year']
    if not const.MIN_YEAR <= year <= const.MAX_YEAR:
        raise FileConventionError('Invalid year in file: {}'.format(year))

    try:
        check_time_coord(cube, ts_type, year)
    except UnresolvableTimeDefinitionError as e:
        raise UnresolvableTimeDefinitionError(repr(e))
    except Exception:
        msg = ("Invalid time dimension coordinate in file {}. ".format(
            os.path.basename(file)))
        logger.warning(msg)
        if const.GRID_IO.CORRECT_TIME_FILENAME:
            logger.warning("Attempting to correct time coordinate "
                           "using information in file name")
            try:
                cube = correct_time_coord(cube,
                                          ts_type=finfo["ts_type"],
                                          year=finfo["year"])
            except Exception as e:
                # best effort: keep the uncorrected cube, but report it
                logger.warning('Failed to correct time coordinate: '
                               '{}'.format(repr(e)))
        if const.WRITE_FILEIO_ERR_LOG:
            add_file_to_log(file, 'Invalid time dimension')
    return cube
def test_to_timedelta64(ts_type, ref_time_str, np_dt_str, output_str):
    """Check :func:`TsType.to_timedelta64` by adding it to a reference time

    Parameters
    ----------
    ts_type : str
        ts_type code used to create the :class:`TsType`
    ref_time_str : str
        reference time, passed to :class:`numpy.datetime64`
    np_dt_str : str
        numpy datetime unit of the reference time
    output_str : str
        expected string representation of reference time plus timedelta
    """
    reference = np.datetime64(ref_time_str, np_dt_str)
    shifted = reference + TsType(ts_type).to_timedelta64()
    assert str(shifted) == output_str
def _run_gridded_ungridded(self, var_name=None):
    """Analysis method for gridded vs. ungridded data

    Reads the observations once and then, for each pair of matching model
    and observation variables, reads the model data, colocates it with the
    observations and (if ``self.save_coldata`` is set) stores the result.

    Parameters
    ----------
    var_name : str, optional
        passed to :func:`_find_var_matches` when matching model and
        observation variables

    Returns
    -------
    dict
        colocated data objects, keys are the model variable names. Variables
        for which reading or colocation failed, or that were skipped, are
        not included.

    Raises
    ------
    DataCoverageError
        if none of ``self.obs_vars`` is provided by the observation reader
    Exception
        if reading of model data or colocation fails and
        ``self.raise_exceptions`` is True
    """
    model_reader = ReadGridded(self.model_id)
    obs_reader = ReadUngridded(self.obs_id)

    # restrict requested obs variables to those the reader provides
    obs_vars_supported = obs_reader.get_reader(
        self.obs_id).PROVIDES_VARIABLES
    obs_vars = list(np.intersect1d(self.obs_vars, obs_vars_supported))

    if len(obs_vars) == 0:
        raise DataCoverageError(
            'No observation variable matches found for '
            '{}'.format(self.obs_id))

    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}
    obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                               vars_to_retrieve=obs_vars,
                               **ropts)
    if 'obs_filters' in self:
        remaining_filters = self._eval_obs_filters()
        obs_data = obs_data.apply_filters(**remaining_filters)

    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)

    #all_ts_types = const.GRID_IO.TS_TYPES
    data_objs = {}
    for model_var, obs_var in var_matches.items():
        # reset for each variable, since ts_type may be updated below
        ts_type = self.ts_type
        start, stop = start_stop(self.start, self.stop)
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start,
                                            stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = (
                'Failed to load gridded data: {} / {}. Reason {}'.format(
                    self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                # go on with the next variable
                continue
        ts_type_src = model_data.ts_type
# =============================================================================
#         if not model_data.ts_type in all_ts_types:
#             raise TemporalResolutionError('Invalid temporal resolution {} '
#                                           'in model {}'.format(model_data.ts_type,
#                                                                self.model_id))
# =============================================================================
        # stations to ignore: either one specification for all variables,
        # or a dictionary with one entry per obs variable
        ignore_stats = None
        if self.ignore_station_names is not None:
            ignore_stats = self.ignore_station_names
            if isinstance(ignore_stats, dict):
                if obs_var in ignore_stats:
                    ignore_stats = ignore_stats[obs_var]
                else:
                    ignore_stats = None

        #ts_type_src = model_data.ts_type
        # requested frequency is not available in the model data: use the
        # frequency of the model data instead
        if TsType(ts_type_src) < TsType(
                ts_type):  # < all_ts_types.index(ts_type_src):
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in model {})'.format(
                               ts_type, ts_type_src, self.model_id))
            ts_type = ts_type_src

        if self.save_coldata:
            savename = self._coldata_savename(model_data,
                                              start,
                                              stop,
                                              ts_type,
                                              var_name=model_var)
            file_exists = self._check_coldata_exists(
                model_data.data_id, savename)
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            if file_exists:
                if not self.reanalyse_existing:
                    # NOTE(review): nesting of the three statements below
                    # under `if self._log` could not be verified -- confirm
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                        print_log.info('Skip {} (file already '
                                       'exists)'.format(savename))
                        self.file_status[savename] = 'skipped'
                    continue
                else:
                    print_log.info(
                        'Deleting and recomputing existing '
                        'colocated data file {}'.format(savename))
                    print_log.info('REMOVE: {}\n'.format(savename))
                    os.remove(os.path.join(out_dir, savename))

        try:
            # base year passed as update_baseyear_gridded if the model
            # data is flagged as climatology
            by = None
            if self.model_use_climatology:
                by = start.year
            coldata = colocate_gridded_ungridded(
                gridded_data=model_data,
                ungridded_data=obs_data,
                ts_type=ts_type,
                start=start,
                stop=stop,
                var_ref=obs_var,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                ignore_station_names=ignore_stats,
                apply_time_resampling_constraints=self.
                apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers)
            if self.save_coldata:
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
            data_objs[model_var] = coldata
        except Exception as e:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed: Reason {}'.format(self.model_id, model_var,
                                              self.obs_id, obs_var,
                                              repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
def resample(self,
             to_ts_type,
             input_data=None,
             from_ts_type=None,
             how=None,
             apply_constraints=None,
             min_num_obs=None,
             **kwargs):
    """Resample input data

    Parameters
    ----------
    to_ts_type : str or pyaerocom.tstype.TsType
        output resolution
    input_data : pandas.Series or xarray.DataArray
        data to be resampled
    from_ts_type : str or pyaerocom.tstype.TsType, optional
        current resolution of the data. Required for resampling with
        constraints; if None, the data is resampled without constraints.
    how : str
        string specifying how the data is to be aggregated, default is mean
    apply_constraints : bool, optional
        if True, hierarchical resampling is applied using input
        `min_num_obs` (if provided) or else, using the constraints
        specified in :attr:`SAMPLING_CONSTRAINTS`. If None,
        :attr:`APPLY_CONSTRAINTS` is used.
    min_num_obs : dict or int, optional
        integer or nested dictionary specifying minimum number of
        observations required to resample from higher to lower frequency.
        For instance, if `input_data` is hourly and `to_ts_type` is
        monthly, you may specify something like::

            min_num_obs =
                {'monthly'  :   {'daily'  : 7},
                 'daily'    :   {'hourly' : 6}}

        to require at least 6 hours per day and 7 days per month.
    **kwargs
        additional input arguments passed to resampling method

    Returns
    -------
    pandas.Series or xarray.DataArray
        resampled data object

    Raises
    ------
    ValueError
        if no data is available, if `how` is not a string when resampling
        without constraints, or if `from_ts_type` is neither str nor TsType
    TemporalResolutionError
        if `to_ts_type` is a higher resolution than `from_ts_type`
    """
    if how is None:
        how = 'mean'

    if not isinstance(to_ts_type, TsType):
        to_ts_type = TsType(to_ts_type)

    if input_data is not None:
        self.input_data = input_data
    if self.input_data is None:
        raise ValueError('Please provide data (Series or DataArray)')

    if apply_constraints is None:
        apply_constraints = self.APPLY_CONSTRAINTS

    # updated at the end if hierarchical resampling is applied
    self.last_setup = dict(apply_constraints=False,
                           min_num_obs=None,
                           how=how)

    if not apply_constraints or from_ts_type is None:
        freq = to_ts_type.to_pandas_freq()
        if not isinstance(how, str):
            raise ValueError('Temporal resampling without constraints can '
                             'only use string type argument how (e.g. '
                             'how=mean). Got {}'.format(how))
        return self.fun(self.input_data, freq=freq, how=how, **kwargs)

    if isinstance(from_ts_type, str):
        from_ts_type = TsType(from_ts_type)

    if not isinstance(from_ts_type, TsType):
        # format the input itself: it is not a TsType here and accessing
        # .val would raise AttributeError instead of this ValueError
        raise ValueError('Invalid input for from_ts_type: {}. Need valid '
                         'str or TsType. Input arg from_ts_type is '
                         'required if resampling using hierarchical '
                         'constraints (arg apply_constraints) is activated'
                         .format(from_ts_type))

    if to_ts_type > from_ts_type:
        raise TemporalResolutionError('Cannot resample time-series from {} '
                                      'to {}'
                                      .format(from_ts_type, to_ts_type))
    elif to_ts_type == from_ts_type:
        const.logger.info('Input time frequency {} equals current frequency '
                          'of data. Resampling will be applied anyways '
                          'which will introduce NaN values at missing '
                          'time stamps'.format(to_ts_type.val))
        freq = to_ts_type.to_pandas_freq()
        # NOTE(review): this path always aggregates with 'mean', not with
        # input how -- confirm that this is intended
        return self.fun(self.input_data, freq=freq, how='mean', **kwargs)

    if min_num_obs is None:
        min_num_obs = self.SAMPLING_CONSTRAINTS

    _idx = self._gen_idx(from_ts_type, to_ts_type, min_num_obs, how)
    data = self.input_data
    # resample step by step through the frequencies provided by _gen_idx
    for step_ts_type, mno, rshow in _idx:
        const.logger.info('TO: {} ({}, {})'.format(step_ts_type, mno, rshow))
        freq = TsType(step_ts_type).to_pandas_freq()
        data = self.fun(data, freq=freq, how=rshow, min_num_obs=mno)

    self.last_setup = dict(apply_constraints=True,
                           min_num_obs=min_num_obs,
                           how=how)
    return data
def __init__(self, input_data=None):
    """Initialise attributes and assign input data

    Parameters
    ----------
    input_data : optional
        data object, assigned to :attr:`input_data`
    """
    self.last_setup = None
    self._input_data = None
    self.input_data = input_data

    # ts_types from const.GRID_IO.TS_TYPES with multiplication factor 1
    base_ts_types = []
    for ts_type in const.GRID_IO.TS_TYPES:
        if TsType(ts_type).mulfac == 1:
            base_ts_types.append(ts_type)
    self.valid_base_ts_types = base_ts_types
def resample(self,
             to_ts_type,
             input_data=None,
             from_ts_type=None,
             how='mean',
             apply_constraints=None,
             sampling_constraints=None,
             **kwargs):
    """Resample input data

    Parameters
    ----------
    to_ts_type : str or pyaerocom.tstype.TsType
        output resolution, must be one of :attr:`FREQS_SUPPORTED`
    input_data : pandas.Series or xarray.DataArray
        data to be resampled
    from_ts_type : str or pyaerocom.tstype.TsType, optional
        current resolution of the data, required if constraints are
        applied
    how : str
        string specifying how the data is to be aggregated, default is mean
    apply_constraints : bool, optional
        if True, hierarchical resampling is applied using input
        `sampling_constraints` (if provided) or else, using
        :attr:`SAMPLING_CONSTRAINTS`. If None, :attr:`APPLY_CONSTRAINTS`
        is used.
    sampling_constraints : dict
        nested dictionary specifying sampling constraints to be applied to
        data. For instance, if `input_data` is hourly and `to_ts_type` is
        monthly, you may specify something like::

            sampling_constraints =
                {'monthly'  :   {'daily'  : 7},
                 'daily'    :   {'hourly' : 6}}

        to require at least 6 hours per day and 7 days per month.
    **kwargs
        additional input arguments passed to resampling method (only used
        when resampling without constraints)

    Returns
    -------
    pandas.Series or xarray.DataArray
        resampled data object

    Raises
    ------
    NotImplementedError
        if the output frequency is not supported
    ValueError
        if no data is available or if `from_ts_type` is neither str nor
        TsType while constraints are applied
    """
    if not isinstance(to_ts_type, TsType):
        to_ts_type = TsType(to_ts_type)

    supported = self.FREQS_SUPPORTED
    if to_ts_type.val not in supported:
        raise NotImplementedError('Cannot resample to input frequency '
                                  '{}. Choose from: {}'.format(
                                      to_ts_type,
                                      self.FREQS_SUPPORTED.keys()))

    if input_data is not None:
        self.input_data = input_data
    if self.input_data is None:
        raise ValueError('Please provide data (Series or DataArray)')

    use_constraints = apply_constraints
    if use_constraints is None:
        use_constraints = self.APPLY_CONSTRAINTS

    if not use_constraints:
        # direct resampling to the output frequency
        return self.fun(self.input_data,
                        freq=to_ts_type.val,
                        how=how,
                        **kwargs)

    if isinstance(from_ts_type, str):
        from_ts_type = TsType(from_ts_type)
    if not isinstance(from_ts_type, TsType):
        raise ValueError(
            'Invalid input for from_ts_type: {}. Need valid '
            'str or TsType. Input arg from_ts_type is '
            'required if resampling using hierarchical '
            'constraints (arg apply_constraints) is activated'.format(
                from_ts_type))

    if sampling_constraints is None:
        sampling_constraints = self.SAMPLING_CONSTRAINTS

    steps = self._gen_idx(from_ts_type, to_ts_type, sampling_constraints)
    result = self.input_data
    # resample step by step through the frequencies provided by _gen_idx
    for step_freq, step_min_num_obs in steps:
        result = self.fun(result,
                          freq=step_freq,
                          how=how,
                          min_num_obs=step_min_num_obs)
    return result
def colocate_gridded_ungridded(gridded_data,
                               ungridded_data,
                               ts_type=None,
                               start=None,
                               stop=None,
                               filter_name=None,
                               regrid_res_deg=None,
                               remove_outliers=True,
                               vert_scheme=None,
                               harmonise_units=True,
                               regrid_scheme='areaweighted',
                               var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None,
                               colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False,
                               resample_how=None,
                               **kwargs):
    """Colocate gridded with ungridded data (low level method)

    For high-level colocation see
    :class:`pyaerocom.colocation_auto.Colocator` and
    :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData`
    object (since these objects only contain a single variable). If this
    variable is not contained in observation data (or contained but using a
    different variable name) you may specify the obs variable to be used via
    input arg `var_ref`

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly.).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then ``const.DEFAULT_REG_FILTER`` is used.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to
        that resolution, if input is dict, use keys `lat_res_deg` and
        `lon_res_deg` to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before
        colocation, else not. Outlier ranges can be specified via input
        args `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises
        Exception if True and units cannot be harmonised).
    regrid_scheme : str
        regridding scheme, only used if `regrid_res_deg` is specified.
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to
        be compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input
        variable. Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the
        base year of the time dimension to a value other than the specified
        input start / stop time this may be used to update the time in
        order to make colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that
        should be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided
        via :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than
        desired time resolution (`ts_type`), then both datasets are
        colocated in time *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be
        analysed, even if `remove_outliers` is True. That is because for
        model evaluation often only outliers are supposed to be removed in
        the observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference
        dataset, even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in
        time. Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum
        value to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data. Stations for which the colocated time
        series could not be assigned contain only NaN values.

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more
        than one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if not var_ref in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref,
                                       ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        # desired frequency is unspecified or not available in gridded data
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)

    # crop time
    gridded_data = regfilter.apply(gridded_data)
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:
        # called twice if used via Colocator, this should go out here
        ungridded_data.remove_outliers(var_ref,
                                       inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)
        obs_start = start
        obs_stop = stop

    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))

    # make sure the gridded data is in the right dimension
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if not vert_scheme in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    # initialise with NaN (and not with np.empty), so that stations that
    # are skipped in the loop below do not end up with arbitrary values
    coldata = np.full((2, len(time_idx), len(obs_stat_data)), np.nan)

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing to keep ts_type_src_ref (this was probably
        # introduced for EBAS were the original data frequency is not constant
        # but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if not obs_stat['ts_type_src'] in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:
                # variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if not unit == ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data '
                'object contains different units ({})'.format(var_ref))

        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless. Hence, unit check in remove_outliers works only
            # if the variable name (and unit) corresponds to AeroCom default)
            grid_stat.remove_outliers(var,
                                      low=low,
                                      high=high,
                                      check_unit=True)

        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var,
            var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as temporary fix
        # from v0.10.0 -> v0.10.1 as a result of multi-weekly obsdata (EBAS)
        # that can end up resulting in incorrect number of timestamps after
        # resampling (the error was discovered using EBASMC, concpm10, 2019
        # and colocation frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            const.print_log.warning(
                f'Failed to colocate time for station {obs_stat.station_name}. '
                f'This station will be skipped (error: {e})')

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata,
                         coords=coords,
                         dims=dims,
                         name=var,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs[
        'standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    if col_freq != str(ts_type):
        # colocation was done in a different frequency than requested
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data
def resample_time_dataarray(arr, freq, how='mean', min_num_obs=None):
    """Resample the time dimension of a :class:`xarray.DataArray`

    Note
    ----
    The dataarray must have a dimension coordinate named "time"

    Parameters
    ----------
    arr : DataArray
        data array to be resampled
    freq : str
        new temporal resolution (can be pandas freq. string, or pyaerocom
        ts_type)
    how : str
        aggregation method. If ``'median'``, the median of each resampling
        period is used; for any other value (including the default
        ``'mean'``) the mean is used.
    min_num_obs : :obj:`int`, optional
        minimum number of observations required per period (when
        downsampling). E.g. if input is in daily resolution and freq is
        monthly and min_num_obs is 10, then all months that have less than
        10 days of data are set to nan.

    Returns
    -------
    DataArray
        resampled data array object

    Raises
    ------
    IOError
        if data input `arr` is not an instance of :class:`DataArray`
    DataDimensionError
        if time dimension is not available in dataset
    ValueError
        if `min_num_obs` is provided and no xarray time grouper is
        registered for the requested output frequency
    """
    if not isinstance(arr, xray.DataArray):
        raise IOError('Invalid input for arr: need DataArray, got {}'.format(
            type(arr)))
    elif 'time' not in arr.dims:
        raise DataDimensionError('Cannot resample time: input DataArray has '
                                 'no time dimension')

    # local imports, kept inside the function as in the original code
    from pyaerocom.tstype import TsType
    from pyaerocom.time_config import XARR_TIME_GROUPERS

    to = TsType(freq)
    pd_freq = to.to_pandas()

    invalid = None
    if min_num_obs is not None:
        if pd_freq not in XARR_TIME_GROUPERS:
            raise ValueError(
                'Cannot infer xarray grouper for ts_type {}'.format(to.val))
        gr = XARR_TIME_GROUPERS[pd_freq]
        # boolean mask flagging groups with fewer than min_num_obs counts
        # NOTE(review): the assignment below assumes this mask has the same
        # shape as the resampled array - confirm for multi-year input
        invalid = arr.groupby(
            'time.{}'.format(gr)).count(dim='time') < min_num_obs

    # only the label offset is needed here, the frequency string used for
    # resampling is pd_freq
    _, loffset = _get_pandas_freq_and_loffset(freq)
    resampler = arr.resample(time=pd_freq, loffset=loffset)
    # honour the documented `how` option; anything but 'median' falls back
    # to the mean, which was the only aggregation applied previously
    if how == 'median':
        arr = resampler.median(dim='time')
    else:
        arr = resampler.mean(dim='time')

    if invalid is not None:
        arr.data[invalid.data] = np.nan
    return arr
def check_time_coordOLD(cube, ts_type, year):
    """Check the time coordinate of an iris Cube (legacy implementation)

    A handful of time stamps of the cube are compared against the values
    that are expected for the given resolution and year. Nothing is
    corrected here, the cube is only inspected.

    Parameters
    ----------
    cube : Cube
        cube containing data
    ts_type : str
        temporal resolution of data (e.g. "hourly", "daily"), passed to
        :class:`TsType`
    year : int
        year of the data, e.g. 2017

    Returns
    -------
    bool
        True, if all checks passed, False if any of them raised an
        exception (in which case a warning is logged)
    """
    ts_type = TsType(ts_type)
    # 7, since last accessible index in a 3hourly dataset of one day is 7
    sample_idx = [0, 1, 2, 7]
    try:
        try:
            tcoord = cube.coord("time")
        except Exception:
            raise AttributeError("Cube does not contain time dimension")
        if not isinstance(tcoord, iris.coords.DimCoord):
            raise AttributeError("Time is not a DimCoord instance")
        try:
            cftime_to_datetime64(0, cfunit=tcoord.units)
        except Exception:
            raise ValueError("Could not convert time unit string")

        td_unit = ts_type.timedelta64_str
        # NOTE(review): "datetime64_str_str" looks like a typo. If TsType
        # has no such attribute, the AttributeError is caught below and
        # this function returns False - confirm the intended attribute.
        dt_unit = ts_type.datetime64_str_str

        first_of_year = datetime64(
            "{}-01-01 00:00:00".format(year)).astype(dt_unit)
        offsets = asarray(sample_idx)
        expected = first_of_year + offsets.astype(td_unit)
        expected_steps = expected[1:] - expected[:-1]

        found = cftime_to_datetime64(tcoord[sample_idx].points,
                                     cfunit=tcoord.units).astype(dt_unit)
        found_steps = found[1:] - found[:-1]

        if not all(found == expected):
            raise ValueError(
                "Time match error, nominal dates for test array"
                "%s (unit=%s): %s\nReceived values after "
                "conversion: %s" % (offsets, tcoord.units.origin,
                                    expected, found))
        elif not all(found_steps == expected_steps):
            raise ValueError(
                "Time match error, time steps for test array"
                "%s (unit=%s): %s\nReceived values after "
                "conversion: %s" % (offsets, tcoord.units.origin,
                                    expected_steps, found_steps))
    except Exception as e:
        logger.warning("Invalid time dimension.\n"
                       "Error message: {}".format(repr(e)))
        return False
    return True
def test_to_pandas_freq():
    """TsType.to_pandas_freq returns the expected pandas frequency string"""
    expected = {'3hourly': '3H', 'daily': 'D'}
    for ts_type, pd_freq in expected.items():
        assert TsType(ts_type).to_pandas_freq() == pd_freq
def resample(self, to_ts_type, input_data=None, from_ts_type=None,
             how='mean', apply_constraints=False, min_num_obs=None,
             **kwargs):
    """Resample input data

    Parameters
    ----------
    to_ts_type : str or pyaerocom.tstype.TsType
        output resolution
    input_data : pandas.Series or xarray.DataArray, optional
        data to be resampled. If provided, it is stored in
        :attr:`input_data`, else the currently stored data is used.
    from_ts_type : str or pyaerocom.tstype.TsType, optional
        current temporal resolution of the data. Required in order to
        apply resampling constraints; if None, the data is resampled
        without constraints (and a warning is logged).
    how : str
        string specifying how the data is to be aggregated, default is mean
    apply_constraints : bool, optional
        if True, hierarchical resampling is applied using input
        `min_num_obs` (if provided) or else, using
        :attr:`SAMPLING_CONSTRAINTS`. If None, :attr:`APPLY_CONSTRAINTS`
        is used.
    min_num_obs : dict or int, optional
        integer or nested dictionary specifying minimum number of
        observations required to resample from higher to lower frequency.
        For instance, if `input_data` is hourly and `to_ts_type` is
        monthly, you may specify something like::

            min_num_obs = {'monthly' : {'daily' : 7},
                           'daily' : {'hourly' : 6}}

        to require at least 6 hours per day and 7 days per month.
    **kwargs
        additional input arguments passed to resampling method (only used
        when resampling without constraints)

    Returns
    -------
    pandas.Series or xarray.DataArray
        resampled data object

    Raises
    ------
    NotImplementedError
        if `to_ts_type` is not in :attr:`FREQS_SUPPORTED`
    ValueError
        if no data is available, or if `from_ts_type` is neither a string
        nor a :class:`TsType` while constraints are to be applied
    TemporalResolutionError
        if constraints are to be applied and `to_ts_type` is a higher
        resolution than `from_ts_type`
    """
    if not isinstance(to_ts_type, TsType):
        to_ts_type = TsType(to_ts_type)

    if to_ts_type.val not in self.FREQS_SUPPORTED:
        raise NotImplementedError('Cannot resample to input frequency '
                                  '{}. Choose from: {}'
                                  .format(to_ts_type,
                                          self.FREQS_SUPPORTED.keys()))

    if input_data is not None:
        self.input_data = input_data
    if self.input_data is None:
        raise ValueError('Please provide data (Series or DataArray)')

    if apply_constraints is None:
        apply_constraints = self.APPLY_CONSTRAINTS

    def _resample_unconstrained():
        # single-step resampling without any min_num_obs requirements
        self.last_setup = dict(apply_constraints=False, min_num_obs=None)
        return self.fun(self.input_data, freq=to_ts_type.val,
                        how=how, **kwargs)

    if not apply_constraints:
        return _resample_unconstrained()
    elif from_ts_type is None:
        # Logger.warn is a deprecated alias (removed in Python 3.13)
        const.print_log.warning('Cannot apply time resampling constraints, '
                                'since input from_ts_type is None. Applying '
                                'resampling to {} without any constraints'
                                .format(to_ts_type))
        return _resample_unconstrained()

    if isinstance(from_ts_type, str):
        from_ts_type = TsType(from_ts_type)

    if not isinstance(from_ts_type, TsType):
        raise ValueError('Invalid input for from_ts_type: {}. Need valid '
                         'str or TsType. Input arg from_ts_type is '
                         'required if resampling using hierarchical '
                         'constraints (arg apply_constraints) is activated'
                         .format(from_ts_type))

    if to_ts_type > from_ts_type:
        raise TemporalResolutionError('Cannot resample time-series from {} '
                                      'to {}'
                                      .format(from_ts_type, to_ts_type))
    elif to_ts_type == from_ts_type:
        const.logger.info('Input time frequency equals current frequency '
                          'of data, ignoring any resampling constraints')
        return _resample_unconstrained()

    if min_num_obs is None:
        min_num_obs = self.SAMPLING_CONSTRAINTS

    # sequence of (frequency, min. number of obs) resampling steps
    _idx = self._gen_idx(from_ts_type, to_ts_type, min_num_obs)
    data = self.input_data
    # dedicated loop variable, so that the to_ts_type argument is not
    # overwritten while iterating over the intermediate frequencies
    for step_freq, mno in _idx:
        data = self.fun(data, freq=step_freq, how=how, min_num_obs=mno)
    self.last_setup = dict(apply_constraints=True, min_num_obs=min_num_obs)
    return data
def colocate_gridded_gridded(gridded_data, gridded_data_ref, ts_type=None,
                             start=None, stop=None, filter_name=None,
                             regrid_res_deg=None, remove_outliers=True,
                             vert_scheme=None, harmonise_units=True,
                             regrid_scheme='areaweighted',
                             var_outlier_ranges=None,
                             var_ref_outlier_ranges=None,
                             update_baseyear_gridded=None,
                             apply_time_resampling_constraints=None,
                             min_num_obs=None, colocate_time=False,
                             var_keep_outliers=True,
                             var_ref_keep_outliers=False,
                             resample_how=None, **kwargs):
    """Colocate 2 gridded data objects

    Todo
    ----
    - think about vertical dimension (vert_scheme input not used at the
      moment)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    gridded_data_ref : GriddedData
        reference dataset that is used to evaluate
        :attr:`gridded_data` (e.g. gridded observation data)
    ts_type : str
        desired temporal resolution of colocated data (must be valid
        AeroCom ts_type str such as daily, monthly, yearly..)
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        objects will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to
        that resolution, if input is dict, use keys `lat_res_deg` and
        `lon_res_deg` to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before
        colocation, else not.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises
        Exception if True and units cannot be harmonised).
    regrid_scheme : str
        iris scheme used for regridding (defaults to area weighted
        regridding)
    var_outlier_ranges : :obj:`dict`, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None,
        then the pyaerocom default outlier ranges are used for the input
        variable. Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to redefine the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the
        base year of the time dimension to a value other than the
        specified input start / stop time this may be used to update the
        time in order to make colocation possible.
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided
        via :attr:`min_num_obs` or if that one is unspecified, as defined
        in :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is
        used (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than
        desired time resolution (`ts_type`), then both datasets are
        colocated in time *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be
        analysed, even if `remove_outliers` is True. That is because for
        model evaluation often only outliers are supposed to be removed in
        the observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference
        dataset, even if `remove_outliers` is True.
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in
        time. Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum
        value to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args, forwarded to the final time resampling of
        the colocated data object (included such that factory class can
        handle different methods with different inputs)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    NotImplementedError
        if `vert_scheme` is not None
    DataUnitError
        if units are to be harmonised and the unit of the reference data
        cannot be converted to the unit of `gridded_data`
    TimeMatchError
        if the requested time range does not overlap with the data
    ColocationError
        if both data objects differ in shape after cropping and filtering
    """
    if vert_scheme is not None:
        raise NotImplementedError(
            f'This type of colocation is not implemented '
            f'for gridded / gridded colocation... ({vert_scheme})')

    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    if harmonise_units and gridded_data.var_info.has_unit:
        if not gridded_data.units == gridded_data_ref.units:
            try:
                gridded_data_ref.convert_unit(gridded_data.units)
            except Exception as err:
                # first placeholder is the reference unit, second one the
                # unit of the data object that is evaluated
                raise DataUnitError('Failed to merge data unit of reference '
                                    'gridded data object ({}) to data unit '
                                    'of gridded data object ({})'.format(
                                        gridded_data_ref.units,
                                        gridded_data.units)) from err

    var, var_ref = gridded_data.var_name, gridded_data_ref.var_name
    aerocom_var = gridded_data.var_name_aerocom
    _check_var_registered(var, aerocom_var, gridded_data)

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]
        if not var_keep_outliers:
            gridded_data.remove_outliers(low, high, inplace=True)
        if not var_ref_keep_outliers:
            gridded_data_ref.remove_outliers(low_ref, high_ref, inplace=True)

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    if regrid_res_deg is not None:
        gridded_data_ref = _regrid_gridded(gridded_data_ref,
                                           regrid_scheme,
                                           regrid_res_deg)

    # perform regridding: the object with the smaller lon_res value is
    # regridded onto the grid of the other one
    if gridded_data.lon_res < gridded_data_ref.lon_res:
        # obs has lower resolution
        gridded_data = gridded_data.regrid(gridded_data_ref,
                                           scheme=regrid_scheme)
    else:
        gridded_data_ref = gridded_data_ref.regrid(gridded_data,
                                                   scheme=regrid_scheme)

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_start_ref = to_pandas_timestamp(gridded_data_ref.start)
    grid_stop_ref = to_pandas_timestamp(gridded_data_ref.stop)

    # time resolution of dataset to be analysed
    grid_ts_type = grid_ts_type_src = gridded_data.ts_type
    ref_ts_type = ref_ts_type_src = gridded_data_ref.ts_type
    if ref_ts_type != grid_ts_type:
        # ref data is in higher resolution
        if TsType(ref_ts_type) > TsType(grid_ts_type):
            gridded_data_ref = gridded_data_ref.resample_time(
                grid_ts_type,
                apply_constraints=apply_time_resampling_constraints,
                min_num_obs=min_num_obs,
                how=resample_how)
        else:
            gridded_data = gridded_data.resample_time(
                ref_ts_type,
                apply_constraints=apply_time_resampling_constraints,
                min_num_obs=min_num_obs,
                how=resample_how)
            grid_ts_type = ref_ts_type
    # now both are in same temporal resolution

    # input ts_type is not specified or model is in lower resolution
    # than input ts_type -> use model frequency to colocate
    if ts_type is None or TsType(grid_ts_type) < TsType(ts_type):
        ts_type = grid_ts_type

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    # restrict time range to what is covered by the reference data
    if grid_start_ref > start:
        start = grid_start_ref
    if grid_stop_ref < stop:
        stop = grid_stop_ref

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))

    gridded_data = gridded_data.crop(time_range=(start, stop))
    gridded_data_ref = gridded_data_ref.crop(time_range=(start, stop))

    # perform region extraction (if applicable)
    regfilter = Filter(name=filter_name)
    gridded_data = regfilter(gridded_data)
    gridded_data_ref = regfilter(gridded_data_ref)

    if not gridded_data.shape == gridded_data_ref.shape:
        raise ColocationError('Shape mismatch between two colocated data '
                              'arrays, please debug')

    files_ref = [os.path.basename(x) for x in gridded_data_ref.from_files]
    files = [os.path.basename(x) for x in gridded_data.from_files]

    # metadata: list entries are ordered (reference, data)
    meta = {
        'data_source': [gridded_data_ref.data_id, gridded_data.data_id],
        'var_name': [var_ref, var],
        'ts_type': grid_ts_type,
        'filter_name': filter_name,
        'ts_type_src': [ref_ts_type_src, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [str(gridded_data_ref.units), str(gridded_data.units)],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': gridded_data_ref.data_revision,
        'from_files': files,
        'from_files_ref': files_ref,
        'colocate_time': colocate_time,
        'obs_is_clim': False,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs
    }

    meta.update(regfilter.to_dict())

    # masked values are replaced with NaN before stacking both arrays
    data = gridded_data.grid.data
    if isinstance(data, np.ma.core.MaskedArray):
        data = data.filled(np.nan)
    data_ref = gridded_data_ref.grid.data
    if isinstance(data_ref, np.ma.core.MaskedArray):
        data_ref = data_ref.filled(np.nan)
    arr = np.asarray((data_ref, data))

    time_idx = gridded_data.time_stamps().astype('datetime64[ns]')
    lats = gridded_data.latitude.points
    lons = gridded_data.longitude.points

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'latitude': lats,
        'longitude': lons
    }

    dims = ['data_source', 'time', 'latitude', 'longitude']

    data = ColocatedData(data=arr, coords=coords, dims=dims,
                         name=gridded_data.var_name, attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs['standard_name'] = (
        gridded_data.longitude.standard_name)
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    if grid_ts_type != ts_type:
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data
def test_cf_base_unit():
    """TsType.cf_base_unit returns the expected CF base unit"""
    expected = [('daily', 'days'),
                ('monthly', 'days'),
                ('hourly', 'hours')]
    for ts_type, base_unit in expected:
        assert TsType(ts_type).cf_base_unit == base_unit
def check_time_coord(cube, ts_type, year):
    """Check the time coordinate of an iris Cube

    Verifies that the cube has a usable time dimension whose number of
    time stamps and first / last time stamp are consistent with the
    provided resolution and year. Nothing is corrected here; if a check
    fails, an exception is raised.

    Parameters
    ----------
    cube : Cube
        cube containing data
    ts_type : str or TsType
        pyaerocom ts_type
    year
        year of data

    Returns
    -------
    None
        the function returns nothing if all checks pass

    Raises
    ------
    AttributeError
        if the cube has no time coordinate, or if it is not a DimCoord
    ValueError
        if the time unit cannot be converted, or if the first (last) time
        stamp does not lie in the first (last) expected period
    UnresolvableTimeDefinitionError
        if the number of time stamps does not match the expected number
        (and this cannot be attributed to leap year handling)
    """
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)

    try:
        tcoord = cube.coord("time")
    except Exception:
        raise AttributeError("Cube does not contain time dimension")
    if not isinstance(tcoord, iris.coords.DimCoord):
        raise AttributeError("Time is not a DimCoord instance")
    try:
        cftime_to_datetime64(0, cfunit=tcoord.units)
    except Exception:
        raise ValueError("Could not convert time unit string")

    freq = ts_type.to_pandas_freq()
    expected_idx = make_datetimeindex_from_year(freq, year)

    num_expected = len(expected_idx)
    num_found = len(tcoord.points)

    if num_found != num_expected:
        # a mismatch is only tolerated for leap years, and only if the
        # dedicated leap year check accepts it
        tolerated = (expected_idx[0].is_leap_year and
                     _check_leap_year(num_found, num_expected, ts_type))
        if not tolerated:
            raise UnresolvableTimeDefinitionError(
                'Expected {} timestamps but '
                'data has {}'.format(len(expected_idx), num_found))

    # ToDo: check why MS is not working for period conversion
    if freq == 'MS':
        freq = 'M'

    # convert first and last timestamps of index array into periods
    # (e.g. January and December for monthly data)
    first_period = expected_idx[0].to_period(freq)
    last_period = expected_idx[-1].to_period(freq)

    # first and last timestamp in data
    first_ts, last_ts = cftime_to_datetime64(
        [tcoord.points[0], tcoord.points[-1]], cfunit=tcoord.units)

    if not first_period.start_time <= first_ts <= first_period.end_time:
        raise ValueError('First timestamp of data {} does not lie in first '
                         'period: {}'.format(first_ts, first_period))
    if not last_period.start_time <= last_ts <= last_period.end_time:
        raise ValueError('Last timestamp of data {} does not lie in last '
                         'period: {}'.format(last_ts, last_period))
def test_to_numpy_freq():
    """TsType.to_numpy_freq returns the expected numpy frequency string"""
    expected = {'3hourly': '3h', 'daily': '1D'}
    for ts_type, np_freq in expected.items():
        assert TsType(ts_type).to_numpy_freq() == np_freq
def _run_gridded_ungridded(self, var_name=None):
    """Analysis method for gridded vs. ungridded data

    For each matching pair of model and observation variable, the model
    data is read, the observation data is read (one variable at a time)
    and both are colocated via :func:`colocate_gridded_ungridded`.

    Parameters
    ----------
    var_name : str, optional
        passed on to :func:`_find_var_matches` when determining the
        variable combinations to be colocated

    Returns
    -------
    dict
        colocated data objects, keys are the model variable names

    Raises
    ------
    DataCoverageError
        if no supported observation variables are found
    Exception
        if reading of model data or the colocation fails and
        :attr:`raise_exceptions` is True
    """
    print_log.info('PREPARING colocation of {} vs. {}'.format(
        self.model_id, self.obs_id))

    model_reader = self.instantiate_gridded_reader(what='model')
    obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)

    obs_vars = obs_reader.get_vars_supported(self.obs_id, self.obs_vars)

    if len(obs_vars) == 0:
        raise DataCoverageError(
            'No observation variable matches found for '
            '{}'.format(self.obs_id))

    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)

    print_log.info(
        'The following variable combinations will be colocated\n'
        'MODEL-VAR\tOBS-VAR')

    for key, val in var_matches.items():
        print_log.info('{}\t{}'.format(key, val))

    # get list of unique observation variables
    obs_vars = np.unique(list(var_matches.values())).tolist()

    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)

    # additional keyword args for reading of ungridded data
    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}

    data_objs = {}
    if self.start is None:
        self._infer_start_stop(model_reader)

    start, stop = start_stop(self.start, self.stop)

    for model_var, obs_var in var_matches.items():

        # ToDo: consider removing outliers already here.
        #if 'obs_filters' in self:
        # reset to the configured frequency for each variable pair, since
        # ts_type may be updated below
        ts_type = self.ts_type
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start,
                                            stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = (
                'Failed to load gridded data: {} / {}. Reason {}'.format(
                    self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')

            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                # skip this variable pair and proceed with the next one
                continue

        ts_type_src = model_data.ts_type
        rshow = self._eval_resample_how(model_var, obs_var)
        if ts_type is None:
            # if colocation frequency is not specified
            ts_type = ts_type_src

        # station names to be ignored; may be specified per observation
        # variable via a dictionary
        ignore_stats = None
        if self.ignore_station_names is not None:
            ignore_stats = self.ignore_station_names
            if isinstance(ignore_stats, dict):
                if obs_var in ignore_stats:
                    ignore_stats = ignore_stats[obs_var]
                else:
                    ignore_stats = None

        #ts_type_src = model_data.ts_type
        if TsType(ts_type_src) < TsType(
                ts_type):  # < all_ts_types.index(ts_type_src):
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in model {})'.format(
                               ts_type, ts_type_src, self.model_id))
            ts_type = ts_type_src

        really_do_reanalysis = True
        if self.save_coldata:
            really_do_reanalysis = False
            savename = self._coldata_savename(model_data,
                                              start,
                                              stop,
                                              ts_type,
                                              var_name=model_var)

            file_exists = self._check_coldata_exists(
                model_data.data_id, savename)

            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                        print_log.info('Skip {} (file already '
                                       'exists)'.format(savename))
                        self.file_status[savename] = 'skipped'
                    continue
                else:
                    really_do_reanalysis = True
                    print_log.info(
                        'Deleting and recomputing existing '
                        'colocated data file {}'.format(savename))
                    print_log.info('REMOVE: {}\n'.format(savename))
                    os.remove(os.path.join(out_dir, savename))
            else:
                really_do_reanalysis = True

        if really_do_reanalysis:
            #Reading obs data only if the co-located data file does
            #not already exist.
            #This part of the method has been changed by @hansbrenna to work better with
            #large observational data sets. Only one variable is loaded into
            # the UngriddedData object at a time. Currently the variable is
            #re-read a lot of times, which is a weakness.
            obs_data = obs_reader.read(vars_to_retrieve=obs_var,
                                       only_cached=self._obs_cache_only,
                                       **ropts)

            # ToDo: consider removing outliers already here.
            if 'obs_filters' in self:
                remaining_filters = self._eval_obs_filters()
                obs_data = obs_data.apply_filters(**remaining_filters)

            try:
                try:
                    by = self.update_baseyear_gridded
                    # NOTE(review): this rebinding of `stop` persists into
                    # later loop iterations (where it is passed to
                    # _read_gridded and _coldata_savename) - confirm that
                    # this is intended
                    stop = None
                except AttributeError:
                    by = None
                if self.model_use_climatology:
                    # climatology: use year of start time as base year
                    by = start.year
                coldata = colocate_gridded_ungridded(
                    gridded_data=model_data,
                    ungridded_data=obs_data,
                    ts_type=ts_type,
                    start=start,
                    stop=stop,
                    var_ref=obs_var,
                    filter_name=self.filter_name,
                    regrid_res_deg=self.regrid_res_deg,
                    remove_outliers=self.remove_outliers,
                    vert_scheme=self.vert_scheme,
                    harmonise_units=self.harmonise_units,
                    var_outlier_ranges=self.var_outlier_ranges,
                    var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                    update_baseyear_gridded=by,
                    ignore_station_names=ignore_stats,
                    apply_time_resampling_constraints=self.
                    apply_time_resampling_constraints,
                    min_num_obs=self.min_num_obs,
                    colocate_time=self.colocate_time,
                    var_keep_outliers=self.model_keep_outliers,
                    var_ref_keep_outliers=self.obs_keep_outliers,
                    use_climatology_ref=self.obs_use_climatology,
                    resample_how=rshow)

                if self.model_to_stp:
                    coldata = correct_model_stp_coldata(coldata)
                if self.save_coldata:
                    self._save_coldata(coldata, savename, out_dir,
                                       model_var, model_data, obs_var)
                data_objs[model_var] = coldata
            except Exception:
                # log the failure including traceback; only re-raise if
                # requested, else proceed with the next variable pair
                msg = ('Colocation between model {} / {} and obs {} / {} '
                       'failed.\nTraceback:\n{}'.format(
                           self.model_id, model_var, self.obs_id, obs_var,
                           traceback.format_exc()))
                const.print_log.warning(msg)
                self._write_log(msg + '\n')
                if self.raise_exceptions:
                    self._close_log()
                    raise Exception(msg)

    return data_objs