def __init__(self, reader, vars_to_retrieve=None, start=None, stop=None,
             **kwargs):
    self.reader = reader
    self.vars_to_retrieve = vars_to_retrieve
    self._start = start
    self._stop = stop

    self.loaded_data = None
    #: Flag that is set True if pyaerocom server can be accessed. If this
    #: is the case, potentially pickled files are checked against latest
    #: revision and file available in database, else, pickled match is
    #: loaded without double-checking whether the database has changed
    self.connection_established = False
    self.newest_file_in_read_dir = None
    self.newest_file_date_in_read_dir = None

    try:
        # get latest file in data directory of obs network ...
        newest = max(glob.iglob(os.path.join(self.data_dir, '*')),
                     key=os.path.getctime)
        self.newest_file_in_read_dir = newest
        # ... and corresponding file date
        self.newest_file_date_in_read_dir = os.path.getctime(newest)
        self.connection_established = True
    except IOError as e:
        logger.exception('Failed to establish connection with Aerocom '
                         'server. Error: {}'.format(repr(e)))
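# --- Standalone sketch of the freshness check used above (illustrative;
# `data_dir` is assumed to point at an existing, non-empty directory):
import glob
import os

def newest_file_and_ctime(data_dir):
    """Return path and creation time of the newest file in data_dir."""
    newest = max(glob.iglob(os.path.join(data_dir, '*')),
                 key=os.path.getctime)
    return newest, os.path.getctime(newest)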
def decode(self):
    flags = np.zeros((len(self.raw_data), 3)).astype(int)
    mask = self.raw_data.astype(bool)
    not_ok = self.raw_data[mask]
    if len(not_ok) > 0:
        decoded = []
        for flag in not_ok:
            item = "{:.9f}".format(flag).split(".")[1]
            decoded.append([int(item[:3]), int(item[3:6]), int(item[6:9])])
        try:
            flags[mask] = np.asarray(decoded)
        except Exception:
            logger.exception('Failed to decode flag')
    self.flags = flags
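# --- Worked example of the decoding scheme above (illustrative): each
# non-zero raw flag packs three 3-digit codes into nine decimal digits,
# e.g. 0.110456789 -> (110, 456, 789); zero flags ("ok") are masked out.
import numpy as np

raw = np.array([0.0, 0.110456789])
item = "{:.9f}".format(raw[1]).split(".")[1]            # -> '110456789'
codes = [int(item[:3]), int(item[3:6]), int(item[6:9])]
print(codes)  # [110, 456, 789]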
def write(self, data):
    """Write instance of UngriddedData to cache

    Parameters
    ----------
    data : UngriddedData
        object containing the data
    """
    if not self.connection_established:
        # TODO: may be updated in the future
        raise AerocomConnectionError('Cannot write cache file, connection '
                                     'to Aerocom database could not be '
                                     'established (required for checking '
                                     'revision)')
    if not isinstance(data, UngriddedData):
        raise TypeError('Invalid input, need instance of UngriddedData, '
                        'got {}'.format(type(data)))
    logger.info('Writing cache file: {}'.format(self.file_path))
    success = True
    # OutHandle = gzip.open(c__cache_file, 'wb') # takes too much time
    out_handle = open(self.file_path, 'wb')

    try:
        pickle.dump(self.newest_file_in_read_dir, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.newest_file_date_in_read_dir, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.reader.data_revision, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.reader.__version__, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(UngriddedData.__version__, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.__version__, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(data, out_handle, pickle.HIGHEST_PROTOCOL)
    except Exception:
        logger.exception('Failed to write cache')
        success = False
    finally:
        out_handle.close()

    if not success:
        os.remove(self.file_path)
    else:
        logger.info('Success!')
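# --- Note on the pickle layout (illustrative): `write` dumps six header
# entries followed by the data object, so any reader must unpickle them in
# exactly the same order (cf. LEN_CACHE_HEAD in `check_and_load` below,
# which would be 6 here); `cache_file` is a hypothetical path:
import pickle

with open(cache_file, 'rb') as in_handle:
    head = [pickle.load(in_handle) for _ in range(6)]  # six header entries
    data = pickle.load(in_handle)                      # the UngriddedData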
def check_and_load(self):
    if not os.path.isfile(self.file_path):
        logger.info('No cache file available for query of dataset '
                    '{}'.format(self.dataset_to_read))
        return False

    delete_existing = False
    in_handle = open(self.file_path, 'rb')
    # read meta information about file
    if self.connection_established:
        try:
            use_cache_file = self._check_pkl_head_vs_database(in_handle)
        except Exception as e:
            use_cache_file = False
            delete_existing = True
            logger.exception('File error in cached data file {}. File will '
                             'be removed and data reloaded. '
                             'Error: {}'.format(self.file_path, repr(e)))
        if not use_cache_file:
            # TODO: Should we delete the cache file if it is outdated ???
            logger.info('Aborting reading cache file {}. Aerocom database '
                        'has changed compared to cached version'
                        .format(self.file_name))
            in_handle.close()
            if delete_existing:  # something was wrong
                os.remove(self.file_path)
            return False
    else:
        # no connection: skip the cache header without validating it
        for k in range(self.LEN_CACHE_HEAD):
            logger.debug(pickle.load(in_handle))

    # everything is okay
    data = pickle.load(in_handle)
    in_handle.close()
    if not isinstance(data, UngriddedData):
        raise TypeError('Unexpected data type stored in cache file, need '
                        'instance of UngriddedData, got {}'
                        .format(type(data)))

    self.loaded_data = data
    logger.info('Successfully loaded data for {} from cache'
                .format(self.dataset_to_read))
    return True
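# --- Usage sketch (illustrative; the enclosing class name is not shown in
# this revision, so a cache-handler instance `cache` is assumed): the
# typical read-through pattern combining `check_and_load` and `write`:
if cache.check_and_load():
    data = cache.loaded_data
else:
    data = cache.reader.read()  # slow path: read from the database
    cache.write(data)           # refresh the cache for next time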
def colocate_gridded_ungridded(gridded_data, ungridded_data, ts_type=None,
                               start=None, stop=None, filter_name=None,
                               regrid_res_deg=None, remove_outliers=True,
                               vert_scheme=None, harmonise_units=True,
                               var_ref=None, var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None, colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False, **kwargs):
    """Colocate gridded with ungridded data

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData`
    object (since these objects only contain a single variable)

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    ungridded_data : UngriddedData
        ungridded data (e.g. observations)
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly). The colocation itself
        is done in the highest available resolution and resampling to
        `ts_type` is done afterwards. You may change this behaviour by
        setting input param `resample_first=True` (default is False).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains). Use
        WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : :obj:`int`, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (BETA feature).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before
        colocation, else not.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises
        Exception if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to
        be compared. If None, then the same variable is used (i.e.
        `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer=[-0.05, 10], ang4487aer=[0, 4])). If None, then
        the pyaerocom default outlier ranges are used for the input
        variable. Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the
        base year of the time dimension to a value other than the specified
        input start / stop time, this may be used to update the time in
        order to make colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that
        should be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided
        via :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than
        desired time resolution (`ts_type`), then both datasets are
        colocated in time *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be
        analysed, even if `remove_outliers` is True. That is because for
        model evaluation often only outliers are supposed to be removed in
        the observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference
        dataset, even if `remove_outliers` is True.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more
        than one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = 'WORLD-wMOUNTAINS'

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom
    if var_ref is None:
        var_ref = aerocom_var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in '
                                   'ungridded data (which contains {})'
                                   .format(var_ref,
                                           ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` '
                             'of UngriddedData object to extract single '
                             'datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_ts_type = gridded_data.ts_type

    if ts_type is None:
        ts_type = grid_ts_type
    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'
                             .format(start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans,
    # and may also be linked with other data object, e.g. if data is only
    # supposed to be used if other data object exceeds a certain
    # threshold... but for now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter(ungridded_data)

    # crop time
    gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        lons = gridded_data.longitude.points
        lats = gridded_data.latitude.points

        lons_new = np.arange(lons.min(), lons.max(), regrid_res_deg)
        lats_new = np.arange(lats.min(), lats.max(), regrid_res_deg)

        gridded_data = gridded_data.interpolate(latitude=lats_new,
                                                longitude=lons_new)

    ungridded_freq = None  # keeps ungridded data in original resolution
    if not colocate_time:
        gridded_data = gridded_data.resample_time(to_ts_type=ts_type)
        # converts ungridded data directly to desired resolution
        ungridded_freq = ts_type

    # ts_type that is used for colocation
    col_ts_type = gridded_data.ts_type

    # pandas frequency string that corresponds to col_ts_type
    col_freq = TS_TYPE_TO_PANDAS_FREQ[col_ts_type]

    if remove_outliers and not var_ref_keep_outliers:
        ungridded_data.remove_outliers(var_ref, inplace=True, low=low_ref,
                                       high=high_ref)

    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=start,
        stop=stop,
        freq=ungridded_freq,
        by_station_name=True,
        ignore_index=ignore_station_names,
        apply_constraints=apply_time_resampling_constraints,
        min_num_obs=min_num_obs,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in '
                                   'specified time interval ({}-{})'
                                   .format(var_ref, start, stop))

    # resampling constraints may have been altered in case input was None,
    # thus overwrite
    vi = obs_stat_data[0]['var_info'][var_ref]
    if 'apply_constraints' in vi:
        apply_time_resampling_constraints = vi['apply_constraints']
        min_num_obs = vi['min_num_obs']

    # make sure the gridded data is in the right dimension order
    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError('Vertical scheme {} is not supported'
                             .format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    # generate time index of ColocatedData object
    time_idx = pd.DatetimeIndex(start=start, end=stop, freq=col_freq)

    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:
                # variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if not unit == ungridded_unit:
            raise ValueError('Cannot perform colocation. Ungridded data '
                             'object contains different units ({})'
                             .format(var_ref))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model data corresponding to station
        grid_stat = grid_stat_data[i]

        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom
            # default and unitless). Hence, unit check in remove_outliers
            # works only if the variable name (and unit) corresponds to
            # AeroCom default.
            grid_stat.remove_outliers(var, low=low, high=high,
                                      check_unit=True)

        # get grid and obs timeseries data (that may be sampled in
        # arbitrary time resolution, particularly the obs data)
        grid_ts = grid_stat[var]
        obs_ts = obs_stat[var_ref]

        # resample to the colocation frequency
        obs_ts1 = obs_ts.resample(col_freq).mean()
        grid_ts1 = grid_ts.resample(col_freq).mean()

        # fill up missing time stamps
        _df = pd.concat([obs_ts1, grid_ts1], axis=1, keys=['o', 'm'])

        # assign the unified timeseries data to the colocated data array
        coldata[0, :, i] = _df['o'].values
        coldata[1, :, i] = _df['m'].values

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_ts_type,
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']

    data = ColocatedData(data=coldata, coords=coords, dims=dims, name=var,
                         attrs=meta)

    if colocate_time and grid_ts_type != ts_type:
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=True,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            **kwargs)

    return data
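# --- Usage sketch (illustrative; `model` and `obs` are hypothetical
# GriddedData / UngriddedData objects, e.g. loaded via pyaerocom readers):
coldata = colocate_gridded_ungridded(model, obs,
                                     ts_type='monthly',
                                     start='2010-1-1', stop='2011-1-1',
                                     filter_name='WORLD-noMOUNTAINS',
                                     var_ref='od550aer',
                                     remove_outliers=True)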
def colocate_gridded_ungridded(gridded_data, ungridded_data, ts_type=None,
                               start=None, stop=None, filter_name=None,
                               regrid_res_deg=None, remove_outliers=True,
                               vert_scheme=None, harmonise_units=True,
                               regrid_scheme='areaweighted', var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None, colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False, resample_how=None,
                               **kwargs):
    """Colocate gridded with ungridded data (low-level method)

    For high-level colocation see :class:`pyaerocom.colocation_auto.Colocator`
    and :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData`
    object (since these objects only contain a single variable). If this
    variable is not contained in observation data (or contained but using a
    different variable name), you may specify the obs variable to be used
    via input arg `var_ref`.

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be valid
        AeroCom ts_type str such as daily, monthly, yearly).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used.
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter`
        for details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains). Use
        WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to
        that resolution, if input is dict, use keys `lat_res_deg` and
        `lon_res_deg` to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before
        colocation, else not. Outlier ranges can be specified via input
        args `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises
        Exception if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to
        be compared. If None, then the same variable is used (i.e.
        `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer=[-0.05, 10], ang4487aer=[0, 4])). If None, then
        the pyaerocom default outlier ranges are used for the input
        variable. Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time
        dimension in the gridded data object to be analysed. E.g., if the
        data object is a climatology (one year of data) that has set the
        base year of the time dimension to a value other than the specified
        input start / stop time, this may be used to update the time in
        order to make colocation possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that
        should be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided
        via :attr:`min_num_obs` or if that one is unspecified, as defined
        in :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True !!).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than
        desired time resolution (`ts_type`), then both datasets are
        colocated in time *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be
        analysed, even if `remove_outliers` is True. That is because for
        model evaluation often only outliers are supposed to be removed in
        the observations but not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference
        dataset, even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in
        time. Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly': 'max'}} would use the maximum
        value to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more
        than one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in '
                                   'ungridded data (which contains {})'
                                   .format(var_ref,
                                           ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` '
                             'of UngriddedData object to extract single '
                             'datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop

    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'
                             .format(start, stop, grid_start, grid_stop))

    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans,
    # and may also be linked with other data object, e.g. if data is only
    # supposed to be used if other data object exceeds a certain
    # threshold... but for now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)
    gridded_data = regfilter.apply(gridded_data)

    # crop time
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:
        # called twice if used via Colocator; this should go out here
        ungridded_data.remove_outliers(var_ref, inplace=True, low=low_ref,
                                       high=high_ref)

    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)
        obs_start = start
        obs_stop = stop

    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in '
                                   'specified time interval ({}-{})'
                                   .format(var_ref, start, stop))

    # make sure the gridded data has the right dimensionality
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError('Vertical scheme {} is not supported'
                             .format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing the logic to keep ts_type_src_ref (this
        # was probably introduced for EBAS where the original data
        # frequency is not constant but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:
                # variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if not unit == ungridded_unit:
            raise ValueError('Cannot perform colocation. Ungridded data '
                             'object contains different units ({})'
                             .format(var_ref))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]

        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if not grid_unit == obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom
            # default and unitless). Hence, unit check in remove_outliers
            # works only if the variable name (and unit) corresponds to
            # AeroCom default.
            grid_stat.remove_outliers(var, low=low, high=high,
                                      check_unit=True)

        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var,
            var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as temporary
        # fix from v0.10.0 -> v0.10.1 as a result of multi-weekly obsdata
        # (EBAS) that can end up resulting in incorrect number of
        # timestamps after resampling (the error was discovered using
        # EBASMC, concpm10, 2019 and colocation frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            const.print_log.warning(
                f'Failed to colocate time for station '
                f'{obs_stat.station_name}. This station will be skipped '
                f'(error: {e})')

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']

    data = ColocatedData(data=coldata, coords=coords, dims=dims, name=var,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)
    data.longitude.attrs['standard_name'] = gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    if col_freq != str(ts_type):
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)

    return data
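# --- Usage sketch (illustrative, same hypothetical `model` / `obs` inputs
# as above; the nested min_num_obs form follows the resample_how docstring
# convention but is an assumption here): the newer signature adds per-axis
# regridding and nested resampling control:
coldata = colocate_gridded_ungridded(
    model, obs,
    ts_type='monthly',
    regrid_res_deg={'lat_res_deg': 5, 'lon_res_deg': 10},
    resample_how={'monthly': {'daily': 'max'}},
    min_num_obs={'monthly': {'daily': 21}})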
class CacheHandlerUngridded(object):
    """Interface for reading and writing of cache files

    Cache filename mask is <dataset_to_read>_<var>.pkl, e.g.
    EBASMC_scatc550aer.pkl

    Attributes
    ----------
    reader : ReadUngriddedBase
        reading class for dataset
    loaded_data : dict
        dictionary containing successfully loaded instances of single
        variable :class:`UngriddedData` objects (keys are variable names)
    """
    __version__ = '1.00'

    #: Directory of cache files
    try:
        CACHE_DIR = const.CACHEDIR
    except Exception:
        CACHE_DIR = None
        logger.exception('Pyaerocom cache directory is not defined')

    #: Cache file header keys that are checked (and required unchanged)
    #: when reading a cache file
    CACHE_HEAD_KEYS = ['pyaerocom_version',
                       'newest_file_in_read_dir',
                       'newest_file_date_in_read_dir',
                       'data_revision',
                       'reader_version',
                       'ungridded_data_version',
                       'cacher_version']

    def __init__(self, reader=None, cache_dir=None, **kwargs):
        self._reader = None
        self.reader = reader

        self.loaded_data = {}
        self._cache_dir = cache_dir

    @property
    def reader(self):
        """Instance of reader class"""
        if self._reader is None:
            raise AttributeError('No reader class assigned to cache object')
        return self._reader

    @reader.setter
    def reader(self, val):
        from pyaerocom.io import ReadUngriddedBase
        if not isinstance(val, ReadUngriddedBase):
            try:
                val = val.get_reader()
                if not isinstance(val, ReadUngriddedBase):
                    raise TypeError('Invalid input for reader')
            except Exception:
                raise TypeError('Invalid input for reader')
        self._reader = val
        self.loaded_data = {}

    @property
    def cache_dir(self):
        """Directory where cached files are stored"""
        if self._cache_dir is not None:
            return self._cache_dir
        if self.CACHE_DIR is None or not os.path.exists(self.CACHE_DIR):
            raise FileNotFoundError('Cache directory does not exist: {}'
                                    .format(self.CACHE_DIR))
        return self.CACHE_DIR

    @cache_dir.setter
    def cache_dir(self, val):
        if not isinstance(val, str) or not os.path.exists(val):
            raise FileNotFoundError('Input directory does not exist: {}'
                                    .format(val))
        self._cache_dir = val

    @property
    def dataset_to_read(self):
        """Data ID of the associated dataset"""
        return self.reader.dataset_to_read

    @property
    def data_dir(self):
        """Data directory of the associated dataset"""
        return self.reader.DATASET_PATH

    def file_name(self, var_name):
        """File name of cache file"""
        name = '_'.join([self.dataset_to_read, var_name])
        return name + '.pkl'

    def file_path(self, var_name):
        """File path of cache file"""
        return os.path.join(self.cache_dir, self.file_name(var_name))

    def _check_pkl_head_vs_database(self, in_handle):
        current = self.cache_meta_info()
        head = pickle.load(in_handle)
        if not isinstance(head, dict):
            raise CacheReadError('Invalid cache file')
        for k, v in head.items():
            if k not in current:
                raise CacheReadError('Invalid cache header key: {}'
                                     .format(k))
            elif not v == current[k]:
                const.print_log.info('{} is outdated (value: {}). Current '
                                     'value: {}'.format(k, v, current[k]))
                return False
        return True

    def cache_meta_info(self):
        """Dictionary containing relevant caching meta-info"""
        try:
            newest = max(glob.iglob(os.path.join(self.data_dir, '*')),
                         key=os.path.getctime)
            newest_date = os.path.getctime(newest)
        except Exception as e:
            raise AerocomConnectionError('Failed to establish connection '
                                         'to data server. Reason: {}'
                                         .format(repr(e)))
        d = dict.fromkeys(self.CACHE_HEAD_KEYS)
        from pyaerocom import __version__

        d['pyaerocom_version'] = __version__
        d['newest_file_in_read_dir'] = newest
        d['newest_file_date_in_read_dir'] = newest_date
        d['data_revision'] = self.reader.data_revision
        d['reader_version'] = self.reader.__version__
        d['ungridded_data_version'] = UngriddedData.__version__
        d['cacher_version'] = self.__version__
        return d

    def check_and_load(self, var_name):
        """Check if cache file exists and load

        Note
        ----
        If a cache file exists for this database, but cannot be loaded or
        is outdated against pyaerocom updates, then it will be removed (the
        latter only if :attr:`pyaerocom.const.RM_CACHE_OUTDATED` is True).

        Returns
        -------
        bool
            True, if cache file exists and could be successfully loaded,
            else False. Note: if import is successful, the corresponding
            data object (instance of :class:`pyaerocom.UngriddedData`) can
            be accessed via :attr:`loaded_data`.

        Raises
        ------
        TypeError
            if cached file is not an instance of
            :class:`pyaerocom.UngriddedData` class (which should not
            happen)
        """
        try:
            fp = self.file_path(var_name)
        except FileNotFoundError as e:
            logger.warning(repr(e))
            return False

        if not os.path.isfile(fp):
            logger.info('No cache file available for {}, {}'
                        .format(self.dataset_to_read, var_name))
            return False

        delete_existing = const.RM_CACHE_OUTDATED

        in_handle = open(fp, 'rb')

        try:
            ok = self._check_pkl_head_vs_database(in_handle)
        except Exception as e:
            ok = False
            delete_existing = True
            logger.exception('File error in cached data file {}. File will '
                             'be removed and data reloaded. '
                             'Error: {}'.format(fp, repr(e)))
        if not ok:
            # TODO: Should we delete the cache file if it is outdated ???
            logger.info('Aborting reading cache file {}. Aerocom database '
                        'or pyaerocom version has changed compared to '
                        'cached version'
                        .format(self.file_name(var_name)))
            in_handle.close()
            if delete_existing:  # something was wrong
                const.print_log.info('Deleting outdated cache file: {}'
                                     .format(fp))
                os.remove(self.file_path(var_name))
            return False

        # everything is okay
        data = pickle.load(in_handle)
        in_handle.close()
        if not isinstance(data, UngriddedData):
            raise TypeError('Unexpected data type stored in cache file, '
                            'need instance of UngriddedData, got {}'
                            .format(type(data)))

        self.loaded_data[var_name] = data
        logger.info('Successfully loaded data for {} from cache'
                    .format(self.dataset_to_read))
        return True

    def write(self, data, var_name=None):
        """Write single-variable instance of UngriddedData to cache

        Parameters
        ----------
        data : UngriddedData
            object containing the data (possibly containing multiple
            variables)
        var_name : str, optional
            name of variable that is supposed to be stored (only required
            if input `data` contains more than one variable)
        """
        meta = self.cache_meta_info()

        if not isinstance(data, UngriddedData):
            raise TypeError('Invalid input, need instance of UngriddedData, '
                            'got {}'.format(type(data)))
        if len(data.contains_datasets) > 1:
            raise CacheWriteError('Input UngriddedData object contains '
                                  'datasets: {}. Can only write single '
                                  'dataset objects'
                                  .format(data.contains_datasets))
        if var_name is None:
            if len(data.contains_vars) > 1:
                raise CacheWriteError('Input UngriddedData object for {} '
                                      'contains more than one variable: {}. '
                                      'Please specify which variable should '
                                      'be cached'
                                      .format(self.reader.data_id,
                                              data.contains_vars))
            var_name = data.contains_vars[0]
        elif var_name not in data.contains_vars:
            raise CacheWriteError('Cannot write cache file: variable {} '
                                  'does not exist in input UngriddedData '
                                  'object'.format(var_name))

        if len(data.contains_vars) > 1:
            data = data.extract_var(var_name)

        fp = self.file_path(var_name)
        logger.info('Writing cache file: {}'.format(fp))
        success = True
        # OutHandle = gzip.open(c__cache_file, 'wb') # takes too much time
        out_handle = open(fp, 'wb')

        try:
            # write cache header
            pickle.dump(meta, out_handle, pickle.HIGHEST_PROTOCOL)
            # write data
            pickle.dump(data, out_handle, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            from pyaerocom import print_log
            print_log.exception('Failed to write cache: {}'
                                .format(repr(e)))
            success = False
        finally:
            out_handle.close()

        if not success:
            os.remove(fp)
        else:
            logger.info('Successfully wrote {} data ({}) to disk!'
                        .format(var_name, self.reader.data_id))

    def __str__(self):
        return 'Cache handler for {}'.format(self.reader.data_id)
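# --- Usage sketch (illustrative; `reader` is assumed to be an instance of
# a ReadUngriddedBase subclass, e.g. the EBAS reader): cache files are
# keyed per (dataset, variable), e.g. EBASMC_scatc550aer.pkl:
ch = CacheHandlerUngridded(reader=reader)
if ch.check_and_load('scatc550aer'):
    data = ch.loaded_data['scatc550aer']
else:
    data = reader.read(vars_to_retrieve='scatc550aer')  # slow path
    ch.write(data, var_name='scatc550aer')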