Example #1
 def __init__(self, reader, vars_to_retrieve=None, 
              start=None, stop=None, **kwargs):
 
     self.reader = reader
    
     self.vars_to_retrieve = vars_to_retrieve
     self._start = start
     self._stop = stop
     
     self.loaded_data = None
     
     #: Flag that is set True if the pyaerocom server can be accessed. If
     #: this is the case, existing pickled files are checked against the
     #: latest revision and file available in the database; else, a pickled
     #: match is loaded without double-checking whether the database changed
     self.connection_established = False
     self.newest_file_in_read_dir = None
     self.newest_file_date_in_read_dir = None
     
     try:
         # get latest file in data directory of obs network ...
         newest = max(glob.iglob(os.path.join(self.data_dir, '*')),
                      key=os.path.getctime)
         self.newest_file_in_read_dir = newest
         
         # ... and corresponding file date
         self.newest_file_date_in_read_dir = os.path.getctime(newest)
         self.connection_established = True
     except (IOError, ValueError) as e:
         # ValueError: max() over an empty directory listing
         logger.exception('Failed to establish connection with Aerocom '
                          'server. Error: {}'.format(repr(e)))
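
The ctime-based freshness probe above is easy to exercise in isolation. A minimal sketch, assuming a plain local directory stands in for the AeroCom read directory (`newest_file_info` is a hypothetical helper, not part of the class):

import glob
import os

def newest_file_info(data_dir):
    # mirrors the check in Example #1: newest file in data_dir by creation
    # time, plus its ctime; note that max() raises ValueError if the
    # directory is empty, so callers should catch more than IOError
    newest = max(glob.iglob(os.path.join(data_dir, '*')),
                 key=os.path.getctime)
    return newest, os.path.getctime(newest)

# usage: path, ctime = newest_file_info('/path/to/obs/network/dir')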
Example #2
 def decode(self):
     flags = np.zeros((len(self.raw_data), 3), dtype=int)
     mask = self.raw_data.astype(bool)
     not_ok = self.raw_data[mask]
     if len(not_ok) > 0:
         decoded = []
         for flag in not_ok:
             item = "{:.9f}".format(flag).split(".")[1]
             decoded.append([int(item[:3]), int(item[3:6]), int(item[6:9])])
         try:
             flags[mask] = np.asarray(decoded)
         except Exception:
             logger.exception('Failed to decode flag')
     self.flags = flags
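
The decoder relies on each raw flag packing up to three 3-digit flags into the first nine fractional digits of a float. A standalone sketch of that digit-splitting step (the sample values are illustrative):

def decode_flag(raw):
    # "{:.9f}" renders exactly nine fractional digits, which are split
    # into three 3-digit integers, e.g. 0.110456471 -> [110, 456, 471]
    digits = "{:.9f}".format(raw).split(".")[1]
    return [int(digits[:3]), int(digits[3:6]), int(digits[6:9])]

print(decode_flag(0.110456471))  # [110, 456, 471]
print(decode_flag(0.247))        # [247, 0, 0] (absent flags pad to zero)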
Example #3
 def write(self, data):
     """Write instance of UngriddedData to cache
     
     Parameters
     ----------
     data : UngriddedData
         object containing the data
     """
     if not self.connection_established:
         # TODO: may be updated in the future
         raise AerocomConnectionError('Cannot write Cache file, connection '
                                      'to Aerocom database could not be '
                                      'established (required for checking '
                                      'revision)')
     if not isinstance(data, UngriddedData):
         raise TypeError('Invalid input, need instance of UngriddedData, '
                         'got {}'.format(type(data)))
     logger.info('Writing cache file: {}'.format(self.file_path))
     success = True
     # OutHandle = gzip.open(c__cache_file, 'wb') # takes too much time
     out_handle = open(self.file_path, 'wb')
     try:
         pickle.dump(self.newest_file_in_read_dir, out_handle, 
                     pickle.HIGHEST_PROTOCOL)
         pickle.dump(self.newest_file_date_in_read_dir, out_handle, 
                     pickle.HIGHEST_PROTOCOL)
         pickle.dump(self.reader.data_revision, out_handle, 
                     pickle.HIGHEST_PROTOCOL)
         pickle.dump(self.reader.__version__, out_handle, 
                     pickle.HIGHEST_PROTOCOL)
         pickle.dump(UngriddedData.__version__, out_handle, 
                     pickle.HIGHEST_PROTOCOL)
         pickle.dump(self.__version__, out_handle, 
                     pickle.HIGHEST_PROTOCOL)
         pickle.dump(data, out_handle, pickle.HIGHEST_PROTOCOL)
     except Exception:
         logger.exception('Failed to write cache')
         success = False
     finally:
         out_handle.close()
         if not success:
             os.remove(self.file_path)

     if success:
         logger.info('Success!')
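
The seven pickle.dump calls define the cache layout: six header entries followed by the data payload, and any consumer has to mirror that order (Example #4 does so via LEN_CACHE_HEAD). A minimal sketch of the read side (`read_cache` is a hypothetical helper):

import pickle

def read_cache(file_path):
    # six header entries (newest file, file date, data revision, reader
    # version, UngriddedData version, cacher version) precede the payload
    with open(file_path, 'rb') as in_handle:
        head = [pickle.load(in_handle) for _ in range(6)]
        data = pickle.load(in_handle)
    return head, data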
Example #4
 def check_and_load(self):
     if not os.path.isfile(self.file_path):
         logger.info('No cache file available for query of dataset '
                     '{}'.format(self.dataset_to_read))
         return False
     
     delete_existing = False
     in_handle = open(self.file_path, 'rb')
     # read meta information about file
     if self.connection_established:
         try:
             use_cache_file = self._check_pkl_head_vs_database(in_handle)
         except Exception as e:
             use_cache_file = False
             delete_existing = True
             logger.exception('File error in cached data file {}. File will '
                              'be removed and data reloaded. '
                              'Error: {}'.format(self.file_path, repr(e)))
         if not use_cache_file:
             # TODO: Should we delete the cache file if it is outdated ???
             logger.info('Aborting reading cache file {}. Aerocom database '
                         'has changed compared to cached version'
                         .format(self.file_name))
             in_handle.close()
             if delete_existing: #something was wrong
                 os.remove(self.file_path)
             return False
     else:
         for _ in range(self.LEN_CACHE_HEAD):
             logger.debug(pickle.load(in_handle))
     # everything is okay
     data = pickle.load(in_handle)
     in_handle.close()
     if not isinstance(data, UngriddedData):
         raise TypeError('Unexpected data type stored in cache file, need '
                         'instance of UngriddedData, got {}'.format(type(data)))
     self.loaded_data = data
     logger.info('Successfully loaded data for {} from Cache'.format(self.dataset_to_read))
     return True
Example #5
def colocate_gridded_ungridded(gridded_data,
                               ungridded_data,
                               ts_type=None,
                               start=None,
                               stop=None,
                               filter_name=None,
                               regrid_res_deg=None,
                               remove_outliers=True,
                               vert_scheme=None,
                               harmonise_units=True,
                               var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None,
                               colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               **kwargs):
    """Colocate gridded with ungridded data 
    
    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object 
    (since these objects only contain a single variable)
    
    Parameters
    ----------
    gridded_data : GriddedData
        gridded data (e.g. model results)
    ungridded_data : UngriddedData
        ungridded data (e.g. observations)
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly). By default, both
        datasets are resampled to `ts_type` before colocation; set input
        param `colocate_time=True` to colocate in the original resolution
        first and resample to `ts_type` afterwards.
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for
        details). If None, then it is set to 'WORLD-wMOUNTAINS', which 
        corresponds to no filtering (world with mountains). 
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : :obj:`int`, optional
        regrid resolution in degrees. If specified, the input gridded data 
        object will be regridded in lon / lat dimension to the input 
        resolution. (BETA feature)
    remove_outliers : bool
        if True, outliers are removed from model and obs data before colocation, 
        else not.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case 
        input grid data contains vertical dimension. Example schemes are 
        `mean, surface, altitude`, for details see 
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used 
        (i.e. `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time dimension
        in the gridded data object to be analysed. E.g., if the data object 
        is a climatology (one year of data) that has set the base year of the
        time dimension to a value other than the specified input start / stop 
        time this may be used to update the time in order to make colocation 
        possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that should
        be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via 
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time 
        *before* resampling to lower resolution. 
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed, 
        even if `remove_outliers` is True. That is because for model evaluation
        often only outliers are supposed to be removed in the observations but
        not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset, 
        even if `remove_outliers` is True.
    **kwargs
        additional keyword args (passed to 
        :func:`UngriddedData.to_station_data_all`)
        
    Returns
    -------
    ColocatedData
        instance of colocated data
        
    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches 
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = 'WORLD-wMOUNTAINS'

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom
    if var_ref is None:
        var_ref = aerocom_var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref, ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded
    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    grid_ts_type = gridded_data.ts_type

    if ts_type is None:
        ts_type = grid_ts_type
    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop
    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))
    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter(ungridded_data)

    #crop time
    gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:

        lons = gridded_data.longitude.points
        lats = gridded_data.latitude.points

        lons_new = np.arange(lons.min(), lons.max(), regrid_res_deg)
        lats_new = np.arange(lats.min(), lats.max(), regrid_res_deg)

        gridded_data = gridded_data.interpolate(latitude=lats_new,
                                                longitude=lons_new)

    ungridded_freq = None  # None keeps ungridded data in its original resolution

    if not colocate_time:
        gridded_data = gridded_data.resample_time(to_ts_type=ts_type)
        ungridded_freq = ts_type  # converts ungridded data directly to desired resolution

    # ts_type that is used for colocation
    col_ts_type = gridded_data.ts_type

    # pandas frequency string that corresponds to col_ts_type
    col_freq = TS_TYPE_TO_PANDAS_FREQ[col_ts_type]

    if remove_outliers and not var_ref_keep_outliers:
        ungridded_data.remove_outliers(var_ref,
                                       inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=start,
        stop=stop,
        freq=ungridded_freq,
        by_station_name=True,
        ignore_index=ignore_station_names,
        apply_constraints=apply_time_resampling_constraints,
        min_num_obs=min_num_obs,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))

    # resampling constraints may have been altered in case input was None,
    # thus overwrite (checked after the emptiness test above, since
    # obs_stat_data[0] would raise IndexError on an empty list)
    vi = obs_stat_data[0]['var_info'][var_ref]
    if 'apply_constraints' in vi:
        apply_time_resampling_constraints = vi['apply_constraints']
        min_num_obs = vi['min_num_obs']
    # make sure the gridded data is in the right dimension
    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    # Generate time index of ColocatedData object
    # note: the DatetimeIndex(start, end, freq) constructor args were removed
    # in pandas 0.25; pd.date_range is the supported equivalent
    time_idx = pd.date_range(start=start, end=stop, freq=col_freq)

    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
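        # track the set of original sampling frequencies in the obs data;
        # these may differ between sites (e.g. for EBAS), so they are
        # merged below into a ';'-separated ts_type_src string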

        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:  #variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if unit != ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data object contains '
                'different units for {} ({} vs {})'.format(var_ref, unit,
                                                           ungridded_unit))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model data corresponding to station
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if grid_unit != obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless). Hence, the unit check in remove_outliers works
            # only if the variable name (and unit) corresponds to the
            # AeroCom default.
            #chk_unit = not harmonise_units
            grid_stat.remove_outliers(var, low=low, high=high, check_unit=True)

        # get grid and obs timeseries data (that may be sampled in arbitrary
        # time resolution, particularly the obs data)
        grid_ts = grid_stat[var]
        obs_ts = obs_stat[var_ref]

        # resample to the colocation frequency
        obs_ts1 = obs_ts.resample(col_freq).mean()
        grid_ts1 = grid_ts.resample(col_freq).mean()

        # fill up missing time stamps
        _df = pd.concat([obs_ts1, grid_ts1], axis=1, keys=['o', 'm'])

        # assign the unified timeseries data to the colocated data array
        coldata[0, :, i] = _df['o'].values
        coldata[1, :, i] = _df['m'].values

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_ts_type,
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata,
                         coords=coords,
                         dims=dims,
                         name=var,
                         attrs=meta)

    if colocate_time and grid_ts_type != ts_type:
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=True,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            **kwargs)
    return data
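
A hedged usage sketch of the function above. The dataset IDs, variable name and reading entry points (ReadGridded / ReadUngridded) are assumptions about the surrounding pyaerocom API and may differ between versions:

import pyaerocom as pya

# hypothetical dataset IDs; replace with data available in your setup
model = pya.io.ReadGridded('ECMWF_CAMS_REAN').read_var('od550aer')
obs = pya.io.ReadUngridded().read('AeronetSunV3Lev2.daily',
                                  vars_to_retrieve='od550aer')

coldata = colocate_gridded_ungridded(model, obs, ts_type='monthly',
                                     filter_name='WORLD-noMOUNTAINS')
# coldata.data has shape (2, num_timestamps, num_stations):
# index 0 holds the observations, index 1 the model values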
Example #6
def colocate_gridded_ungridded(gridded_data,
                               ungridded_data,
                               ts_type=None,
                               start=None,
                               stop=None,
                               filter_name=None,
                               regrid_res_deg=None,
                               remove_outliers=True,
                               vert_scheme=None,
                               harmonise_units=True,
                               regrid_scheme='areaweighted',
                               var_ref=None,
                               var_outlier_ranges=None,
                               var_ref_outlier_ranges=None,
                               update_baseyear_gridded=None,
                               ignore_station_names=None,
                               apply_time_resampling_constraints=None,
                               min_num_obs=None,
                               colocate_time=False,
                               var_keep_outliers=True,
                               var_ref_keep_outliers=False,
                               use_climatology_ref=False,
                               resample_how=None,
                               **kwargs):
    """Colocate gridded with ungridded data (low level method)

    For high-level colocation see :class:`pyaerocom.colocation_auto.Colocator`
    and :class:`pyaerocom.colocation_auto.ColocationSetup`

    Note
    ----
    Uses the variable that is contained in input :class:`GriddedData` object
    (since these objects only contain a single variable). If this variable
    is not contained in observation data (or contained but using a different
    variable name) you may specify the obs variable to be used via input arg
    `var_ref`

    Parameters
    ----------
    gridded_data : GriddedData
        gridded data object (e.g. model results).
    ungridded_data : UngriddedData
        ungridded data object (e.g. observations).
    ts_type : str
        desired temporal resolution of colocated data (must be valid AeroCom
        ts_type str such as daily, monthly, yearly).
    start : :obj:`str` or :obj:`datetime64` or similar, optional
        start time for colocation, if None, the start time of the input
        :class:`GriddedData` object is used.
    stop : :obj:`str` or :obj:`datetime64` or similar, optional
        stop time for colocation, if None, the stop time of the input
        :class:`GriddedData` object is used
    filter_name : str
        string specifying filter used (cf. :class:`pyaerocom.filter.Filter` for
        details). If None, then it is set to 'WORLD-wMOUNTAINS', which
        corresponds to no filtering (world with mountains).
        Use WORLD-noMOUNTAINS to exclude mountain sites.
    regrid_res_deg : int or dict, optional
        regrid resolution in degrees. If specified, the input gridded data
        object will be regridded in lon / lat dimension to the input
        resolution (if input is integer, both lat and lon are regridded to that
        resolution, if input is dict, use keys `lat_res_deg` and `lon_res_deg`
        to specify regrid resolutions, respectively).
    remove_outliers : bool
        if True, outliers are removed from model and obs data before colocation,
        else not. Outlier ranges can be specified via input args
        `var_outlier_ranges` and `var_ref_outlier_ranges`.
    vert_scheme : str
        string specifying scheme used to reduce the dimensionality in case
        input grid data contains vertical dimension. Example schemes are
        `mean, surface, altitude`, for details see
        :func:`GriddedData.to_time_series`.
    harmonise_units : bool
        if True, units are attempted to be harmonised (note: raises Exception
        if True and units cannot be harmonised).
    var_ref : :obj:`str`, optional
        variable against which data in :attr:`gridded_data` is supposed to be
        compared. If None, then the same variable is used
        (i.e. `gridded_data.var_name`).
    var_outlier_ranges : dict, optional
        dictionary specifying outlier ranges for dataset to be analysed
        (e.g. dict(od550aer = [-0.05, 10], ang4487aer=[0,4])). If None, then
        the pyaerocom default outlier ranges are used for the input variable.
        Defaults to None.
    var_ref_outlier_ranges : dict, optional
        like `var_outlier_ranges` but for reference dataset.
    update_baseyear_gridded : int, optional
        optional input that can be set in order to re-define the time dimension
        in the gridded data object to be analysed. E.g., if the data object
        is a climatology (one year of data) that has set the base year of the
        time dimension to a value other than the specified input start / stop
        time this may be used to update the time in order to make colocation
        possible.
    ignore_station_names : str or list, optional
        station name or pattern or list of station names or patterns that should
        be ignored
    apply_time_resampling_constraints : bool, optional
        if True, then time resampling constraints are applied as provided via
        :attr:`min_num_obs` or if that one is unspecified, as defined in
        :attr:`pyaerocom.const.OBS_MIN_NUM_RESAMPLE`. If None, then
        :attr:`pyaerocom.const.OBS_APPLY_TIME_RESAMPLE_CONSTRAINTS` is used
        (which defaults to True).
    min_num_obs : int or dict, optional
        minimum number of observations for resampling of time
    colocate_time : bool
        if True and if original time resolution of data is higher than desired
        time resolution (`ts_type`), then both datasets are colocated in time
        *before* resampling to lower resolution.
    var_keep_outliers : bool
        if True, then no outliers will be removed from dataset to be analysed,
        even if `remove_outliers` is True. That is because for model evaluation
        often only outliers are supposed to be removed in the observations but
        not in the model.
    var_ref_keep_outliers : bool
        if True, then no outliers will be removed from the reference dataset,
        even if `remove_outliers` is True.
    use_climatology_ref : bool
        if True, climatological timeseries are used from observations
    resample_how : str or dict
        string specifying how data should be aggregated when resampling in time.
        Default is "mean". Can also be a nested dictionary, e.g.
        resample_how={'daily': {'hourly' : 'max'}} would use the maximum value
        to aggregate from hourly to daily, rather than the mean.
    **kwargs
        additional keyword args (passed to
        :func:`UngriddedData.to_station_data_all`)

    Returns
    -------
    ColocatedData
        instance of colocated data

    Raises
    ------
    VarNotAvailableError
        if grid data variable is not available in ungridded data object
    AttributeError
        if instance of input :class:`UngriddedData` object contains more than
        one dataset
    TimeMatchError
        if gridded data time range does not overlap with input time range
    ColocationError
        if none of the data points in input :class:`UngriddedData` matches
        the input colocation constraints
    """
    if var_outlier_ranges is None:
        var_outlier_ranges = {}
    if var_ref_outlier_ranges is None:
        var_ref_outlier_ranges = {}

    if filter_name is None:
        filter_name = const.DEFAULT_REG_FILTER

    try:
        gridded_data.check_dimcoords_tseries()
    except DimensionOrderError:
        gridded_data.reorder_dimensions_tseries()

    var = gridded_data.var_name
    aerocom_var = gridded_data.var_name_aerocom

    _check_var_registered(var, aerocom_var, gridded_data)

    if var_ref is None:
        if aerocom_var is not None:
            var_ref = aerocom_var
        else:
            var_ref = var

    if remove_outliers:
        low, high, low_ref, high_ref = None, None, None, None
        if var in var_outlier_ranges:
            low, high = var_outlier_ranges[var]
        if var_ref in var_ref_outlier_ranges:
            low_ref, high_ref = var_ref_outlier_ranges[var_ref]

    if var_ref not in ungridded_data.contains_vars:
        raise VarNotAvailableError('Variable {} is not available in ungridded '
                                   'data (which contains {})'.format(
                                       var_ref, ungridded_data.contains_vars))
    elif len(ungridded_data.contains_datasets) > 1:
        raise AttributeError('Colocation can only be performed with '
                             'ungridded data objects that only contain a '
                             'single dataset. Use method `extract_dataset` of '
                             'UngriddedData object to extract single datasets')

    dataset_ref = ungridded_data.contains_datasets[0]

    if update_baseyear_gridded is not None:
        # update time dimension in gridded data
        gridded_data.base_year = update_baseyear_gridded

    grid_ts_type_src = gridded_data.ts_type
    grid_ts_type = TsType(gridded_data.ts_type)
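    # note: TsType objects compare by sampling frequency (higher frequency
    # compares greater), so the block below caps the requested ts_type at
    # the model resolution and, if the model is finer and time colocation
    # is not requested, downsamples the model data right away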
    if isinstance(ts_type, str):
        ts_type = TsType(ts_type)
    if ts_type is None or grid_ts_type < ts_type:
        ts_type = grid_ts_type
    elif grid_ts_type > ts_type and not colocate_time:
        gridded_data = gridded_data.resample_time(
            str(ts_type),
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how)
        grid_ts_type = ts_type

    # get start / stop of gridded data as pandas.Timestamp
    grid_start = to_pandas_timestamp(gridded_data.start)
    grid_stop = to_pandas_timestamp(gridded_data.stop)

    if start is None:
        start = grid_start
    else:
        start = to_pandas_timestamp(start)
    if stop is None:
        stop = grid_stop
    else:
        stop = to_pandas_timestamp(stop)

    if start < grid_start:
        start = grid_start
    if stop > grid_stop:
        stop = grid_stop
    # check overlap
    if stop < grid_start or start > grid_stop:
        raise TimeMatchError('Input time range {}-{} does not '
                             'overlap with data range: {}-{}'.format(
                                 start, stop, grid_start, grid_stop))
    # create instance of Filter class (may, in the future, also include all
    # filter options, e.g. start, stop, variables, only land, only oceans, and
    # may also be linked with other data object, e.g. if data is only supposed
    # to be used if other data object exceeds a certain threshold... but for
    # now, only region and altitude range)
    regfilter = Filter(name=filter_name)

    # apply filter to data
    ungridded_data = regfilter.apply(ungridded_data)

    #crop time
    gridded_data = regfilter.apply(gridded_data)
    if start > grid_start or stop < grid_stop:
        gridded_data = gridded_data.crop(time_range=(start, stop))

    if regrid_res_deg is not None:
        gridded_data = _regrid_gridded(gridded_data, regrid_scheme,
                                       regrid_res_deg)

    if remove_outliers and not var_ref_keep_outliers:
        # note: when used via Colocator this is called twice; the call here
        # should eventually be removed
        ungridded_data.remove_outliers(var_ref,
                                       inplace=True,
                                       low=low_ref,
                                       high=high_ref)

    if use_climatology_ref:
        col_freq = 'monthly'
        obs_start = const.CLIM_START
        obs_stop = const.CLIM_STOP
    else:
        col_freq = str(grid_ts_type)  #TS_TYPE_TO_PANDAS_FREQ[grid_ts_type]
        obs_start = start
        obs_stop = stop

    latitude = gridded_data.latitude.points
    longitude = gridded_data.longitude.points
    lat_range = [np.min(latitude), np.max(latitude)]
    lon_range = [np.min(longitude), np.max(longitude)]
    ungridded_data = ungridded_data.filter_by_meta(latitude=lat_range,
                                                   longitude=lon_range)

    # get timeseries from all stations in provided time resolution
    # (time resampling is done below in main loop)
    all_stats = ungridded_data.to_station_data_all(
        vars_to_convert=var_ref,
        start=obs_start,
        stop=obs_stop,
        by_station_name=True,
        ignore_index=ignore_station_names,
        **kwargs)

    obs_stat_data = all_stats['stats']
    ungridded_lons = all_stats['longitude']
    ungridded_lats = all_stats['latitude']

    if len(obs_stat_data) == 0:
        raise VarNotAvailableError('Variable {} is not available in specified '
                                   'time interval ({}-{})'.format(
                                       var_ref, start, stop))
    # make sure the gridded data is in the right dimension
    if gridded_data.ndim > 3:
        if vert_scheme is None:
            vert_scheme = 'mean'
        if vert_scheme not in gridded_data.SUPPORTED_VERT_SCHEMES:
            raise ValueError(
                'Vertical scheme {} is not supported'.format(vert_scheme))

    grid_stat_data = gridded_data.to_time_series(longitude=ungridded_lons,
                                                 latitude=ungridded_lats,
                                                 vert_scheme=vert_scheme)

    pd_freq = TsType(col_freq).to_pandas_freq()
    time_idx = make_datetime_index(start, stop, pd_freq)

    coldata = np.empty((2, len(time_idx), len(obs_stat_data)))

    lons = []
    lats = []
    alts = []
    station_names = []

    ungridded_unit = None
    ts_type_src_ref = None
    if not harmonise_units:
        gridded_unit = str(gridded_data.units)
    else:
        gridded_unit = None

    # loop over all stations and append to colocated data object
    for i, obs_stat in enumerate(obs_stat_data):
        # ToDo: consider removing the ts_type_src_ref bookkeeping (this was
        # probably introduced for EBAS, where the original data frequency is
        # not constant but can vary from site to site)
        if ts_type_src_ref is None:
            ts_type_src_ref = obs_stat['ts_type_src']
        elif obs_stat['ts_type_src'] != ts_type_src_ref:
            spl = ts_type_src_ref.split(';')
            if obs_stat['ts_type_src'] not in spl:
                spl.append(obs_stat['ts_type_src'])
            ts_type_src_ref = ';'.join(spl)

        if ungridded_unit is None:
            try:
                ungridded_unit = obs_stat['var_info'][var_ref]['units']
            except KeyError as e:  #variable information or unit is not defined
                logger.exception(repr(e))
        try:
            unit = obs_stat['var_info'][var_ref]['units']
        except Exception:
            unit = None
        if unit != ungridded_unit:
            raise ValueError(
                'Cannot perform colocation. Ungridded data object contains '
                'different units for {} ({} vs {})'.format(var_ref, unit,
                                                           ungridded_unit))
        # get observations (Note: the index of the observation time series
        # is already in the specified frequency format, and thus, does not
        # need to be updated, for details (or if errors occur), cf.
        # UngriddedData.to_station_data, where the conversion happens)

        # get model station data
        grid_stat = grid_stat_data[i]
        if harmonise_units:
            grid_unit = grid_stat.get_unit(var)
            obs_unit = obs_stat.get_unit(var_ref)
            if grid_unit != obs_unit:
                grid_stat.convert_unit(var, obs_unit)
            if gridded_unit is None:
                gridded_unit = obs_unit

        if remove_outliers and not var_keep_outliers:
            # don't check if harmonise_units is active, because the
            # remove_outliers method checks units based on AeroCom default
            # variables, and a variable mapping might be active, i.e.
            # sometimes models use abs550aer for absorption coefficients
            # with units [m-1] and not for AAOD (which is the AeroCom default
            # and unitless). Hence, the unit check in remove_outliers works
            # only if the variable name (and unit) corresponds to the
            # AeroCom default.
            #chk_unit = not harmonise_units
            grid_stat.remove_outliers(var, low=low, high=high, check_unit=True)

        _df = _colocate_site_data_helper(
            stat_data=grid_stat,
            stat_data_ref=obs_stat,
            var=var,
            var_ref=var_ref,
            ts_type=col_freq,
            resample_how=resample_how,
            apply_time_resampling_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            use_climatology_ref=use_climatology_ref)

        # this try/except block was introduced on 23/2/2021 as a temporary
        # fix from v0.10.0 -> v0.10.1, after multi-weekly obsdata (EBAS)
        # was found to yield an incorrect number of timestamps after
        # resampling (the error was discovered using EBASMC, concpm10,
        # 2019 and colocation frequency monthly)
        try:
            # assign the unified timeseries data to the colocated data array
            coldata[0, :, i] = _df['ref'].values
            coldata[1, :, i] = _df['data'].values
        except ValueError as e:
            const.print_log.warning(
                f'Failed to colocate time for station {obs_stat.station_name}. '
                f'This station will be skipped (error: {e})')

        lons.append(obs_stat.longitude)
        lats.append(obs_stat.latitude)
        alts.append(obs_stat.altitude)
        station_names.append(obs_stat.station_name)

    try:
        revision = ungridded_data.data_revision[dataset_ref]
    except Exception:
        try:
            revision = ungridded_data._get_data_revision_helper(dataset_ref)
        except MetaDataError:
            revision = 'MULTIPLE'
        except Exception:
            revision = 'n/a'

    files = [os.path.basename(x) for x in gridded_data.from_files]

    meta = {
        'data_source': [dataset_ref, gridded_data.name],
        'var_name': [var_ref, var],
        'ts_type': col_freq,  # will be updated below if resampling
        'filter_name': filter_name,
        'ts_type_src': [ts_type_src_ref, grid_ts_type_src],
        'start_str': to_datestring_YYYYMMDD(start),
        'stop_str': to_datestring_YYYYMMDD(stop),
        'var_units': [ungridded_unit, gridded_unit],
        'vert_scheme': vert_scheme,
        'data_level': 3,
        'revision_ref': revision,
        'from_files': files,
        'from_files_ref': None,
        'stations_ignored': ignore_station_names,
        'colocate_time': colocate_time,
        'obs_is_clim': use_climatology_ref,
        'pyaerocom': pya_ver,
        'apply_constraints': apply_time_resampling_constraints,
        'min_num_obs': min_num_obs,
        'outliers_removed': remove_outliers
    }

    meta.update(regfilter.to_dict())

    # create coordinates of DataArray
    coords = {
        'data_source': meta['data_source'],
        'var_name': ('data_source', meta['var_name']),
        'var_units': ('data_source', meta['var_units']),
        'ts_type_src': ('data_source', meta['ts_type_src']),
        'time': time_idx,
        'station_name': station_names,
        'latitude': ('station_name', lats),
        'longitude': ('station_name', lons),
        'altitude': ('station_name', alts)
    }

    dims = ['data_source', 'time', 'station_name']
    data = ColocatedData(data=coldata,
                         coords=coords,
                         dims=dims,
                         name=var,
                         attrs=meta)

    # add correct units for lat / lon dimensions
    data.latitude.attrs['standard_name'] = gridded_data.latitude.standard_name
    data.latitude.attrs['units'] = str(gridded_data.latitude.units)

    data.longitude.attrs['standard_name'] = \
        gridded_data.longitude.standard_name
    data.longitude.attrs['units'] = str(gridded_data.longitude.units)

    if col_freq != str(ts_type):
        data = data.resample_time(
            to_ts_type=ts_type,
            colocate_time=colocate_time,
            apply_constraints=apply_time_resampling_constraints,
            min_num_obs=min_num_obs,
            how=resample_how,
            **kwargs)
    return data
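
Compared to Example #5, this version delegates frequency handling to TsType instead of the raw TS_TYPE_TO_PANDAS_FREQ lookup. A small sketch of the ordering it relies on, assuming `pyaerocom.tstype.TsType` (comparison semantics inferred from the code above):

from pyaerocom.tstype import TsType

grid = TsType('hourly')    # model resolution
want = TsType('monthly')   # requested colocation resolution

if grid < want:
    want = grid            # cannot upsample: cap request at model resolution
elif grid > want:
    print('model is finer; resample from {} to {}'.format(grid, want))

print(want.to_pandas_freq())  # pandas frequency string used for the index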
Example #7
 def check_and_load(self, var_name):
     """Check if cache file exists and load
     
     Note
     ----
     If a cache file exists for this database, but cannot be loaded or is
     outdated against pyaerocom updates, then it will be removed (the latter
     only if :attr:`pyaerocom.const.RM_CACHE_OUTDATED` is True).
     
     Returns
     -------
     bool
         True, if cache file exists and could be successfully loaded, else
         False. Note: if import is successful, the corresponding data object
          (instance of :class:`pyaerocom.UngriddedData`) can be accessed via
          :attr:`loaded_data`.
         
     Raises
     ------
     TypeError
         if cached file is not an instance of :class:`pyaerocom.UngriddedData` 
         class (which should not happen)
     """
     try:
         fp = self.file_path(var_name)
     except FileNotFoundError as e:
         logger.warning(repr(e))
         return False
     
     if not os.path.isfile(fp):
         logger.info('No cache file available for {}, {}'
                     .format(self.dataset_to_read, var_name))
         return False

     delete_existing = const.RM_CACHE_OUTDATED
     in_handle = open(fp, 'rb')

     try:
         ok = self._check_pkl_head_vs_database(in_handle)
     except Exception as e:
         ok = False
         delete_existing = True
         logger.exception('File error in cached data file {}. File will '
                          'be removed and data reloaded. '
                          'Error: {}'.format(fp, repr(e)))
     if not ok:
         # TODO: Should we delete the cache file if it is outdated ???
         logger.info('Aborting reading cache file {}. Aerocom database '
                     'or pyaerocom version has changed compared to '
                     'cached version'
                     .format(self.file_name(var_name)))
         in_handle.close()
         if delete_existing: #something was wrong
             const.print_log.info('Deleting outdated cache file: {}'
                                  .format(fp))
             os.remove(self.file_path(var_name))
         return False
     
     # everything is okay
     data = pickle.load(in_handle)
     in_handle.close()
     if not isinstance(data, UngriddedData):
         raise TypeError('Unexpected data type stored in cache file, need '
                         'instance of UngriddedData, got {}'
                         .format(type(data)))
         
     self.loaded_data[var_name] = data
     logger.info('Successfully loaded data for {} from Cache'
                 .format(self.dataset_to_read))
     return True
Example #8
class CacheHandlerUngridded(object):
    """Interface for reading and writing of cache files
    
    Cache filename mask is 
    
    <dataset_to_read>_<var>.pkl
    
    e.g. EBASMC_scatc550aer.pkl
    
    Attributes
    ----------
    reader : ReadUngriddedBase
        reading class for dataset
    loaded_data : dict
        dictionary containing successfully loaded instances of single variable
        :class:`UngriddedData` objects (keys are variable names)
    """
    __version__ = '1.00'
    #: Directory of cache files
    try:
        CACHE_DIR = const.CACHEDIR
    except Exception:
        CACHE_DIR = None
        logger.exception('Pyaerocom cache directory is not defined')
    #: Cache file header keys that are checked (and required unchanged) when
    #: reading a cache file
    CACHE_HEAD_KEYS = ['pyaerocom_version',
                       'newest_file_in_read_dir',
                       'newest_file_date_in_read_dir',
                       'data_revision', 
                       'reader_version', 
                       'ungridded_data_version', 
                       'cacher_version']
    
    def __init__(self, reader=None, cache_dir=None, **kwargs):
        self._reader = None
        self.reader = reader
        
        self.loaded_data = {}
        
        self._cache_dir = cache_dir
        
    @property
    def reader(self):
        """Instance of reader class"""
        if self._reader is None:
            raise AttributeError('No reader class assigned to cache object')
        return self._reader
    
    @reader.setter
    def reader(self, val):
        from pyaerocom.io import ReadUngriddedBase
        if not isinstance(val, ReadUngriddedBase):
            try:
                val = val.get_reader()
                if not isinstance(val, ReadUngriddedBase):
                    raise TypeError('Invalid input for reader')
            except Exception:
                raise TypeError('Invalid input for reader')
        self._reader = val
        self.loaded_data = {}
        
    @property
    def cache_dir(self):
        """Directory where cached files are stored"""
        if self._cache_dir is not None:
            return self._cache_dir
        if self.CACHE_DIR is None or not os.path.exists(self.CACHE_DIR):
            raise FileNotFoundError('Cache directory does not exist: {}'
                                    .format(self.CACHE_DIR))
        return self.CACHE_DIR
    
    @cache_dir.setter
    def cache_dir(self, val):
        if not isinstance(val, str) or not os.path.exists(val):
            raise FileNotFoundError('Input directory does not exist: {}'
                                    .format(val))
        self._cache_dir = val
        
    @property
    def dataset_to_read(self):
        """Data ID of the associated dataset"""
        return self.reader.dataset_to_read
    
    @property
    def data_dir(self):
        """Data directory of the associated dataset"""
        return self.reader.DATASET_PATH        
        
    def file_name(self, var_name):
        """File name of cache file"""
        name = '_'.join([self.dataset_to_read, var_name])
        return name + '.pkl'
    
    def file_path(self, var_name):
        """File path of cache file"""
        return os.path.join(self.cache_dir, self.file_name(var_name))
    
    def _check_pkl_head_vs_database(self, in_handle):
        current = self.cache_meta_info()
        head = pickle.load(in_handle)
        if not isinstance(head, dict):
            raise CacheReadError('Invalid cache file')
        for k, v in head.items():
            if k not in current:
                raise CacheReadError('Invalid cache header key: {}'.format(k))
            elif v != current[k]:
                const.print_log.info('{} is outdated (value: {}). Current '
                                     'value: {}'.format(k, v, current[k]))
                return False
        return True
    
    def cache_meta_info(self):
        """Dictionary containing relevant caching meta-info"""
        try:
            newest = max(glob.iglob(os.path.join(self.data_dir, '*')), 
                         key=os.path.getctime)
            newest_date = os.path.getctime(newest)
        except Exception as e:
            raise AerocomConnectionError('Failed to establish connection to '
                                         'data server. Reason: {}'
                                         .format(repr(e)))
        d = dict.fromkeys(self.CACHE_HEAD_KEYS)
        from pyaerocom import __version__
        
        d['pyaerocom_version'] = __version__
        d['newest_file_in_read_dir'] = newest
        d['newest_file_date_in_read_dir'] = newest_date
        d['data_revision'] = self.reader.data_revision
        d['reader_version'] = self.reader.__version__
        d['ungridded_data_version'] = UngriddedData.__version__ 
        d['cacher_version'] = self.__version__
        return d
    
    def check_and_load(self, var_name):
        """Check if cache file exists and load
        
        Note
        ----
        If a cache file exists for this database, but cannot be loaded or is
        outdated against pyaerocom updates, then it will be removed (the latter
        only if :attr:`pyaerocom.const.RM_CACHE_OUTDATED` is True).
        
        Returns
        -------
        bool
            True, if cache file exists and could be successfully loaded, else
            False. Note: if import is successful, the corresponding data object
            (instance of :class:`pyaerocom.UngriddedData`) can be accessed via
            :attr:`loaded_data`.
            
        Raises
        ------
        TypeError
            if cached file is not an instance of :class:`pyaerocom.UngriddedData` 
            class (which should not happen)
        """
        try:
            fp = self.file_path(var_name)
        except FileNotFoundError as e:
            logger.warning(repr(e))
            return False
        
        if not os.path.isfile(fp):
            logger.info('No cache file available for {}, {}'
                        .format(self.dataset_to_read, var_name))
            return False

        delete_existing = const.RM_CACHE_OUTDATED
        in_handle = open(fp, 'rb')

        try:
            ok = self._check_pkl_head_vs_database(in_handle)
        except Exception as e:
            ok = False
            delete_existing = True
            logger.exception('File error in cached data file {}. File will '
                             'be removed and data reloaded. '
                             'Error: {}'.format(fp, repr(e)))
        if not ok:
            # TODO: Should we delete the cache file if it is outdated ???
            logger.info('Aborting reading cache file {}. Aerocom database '
                        'or pyaerocom version has changed compared to '
                        'cached version'
                        .format(self.file_name(var_name)))
            in_handle.close()
            if delete_existing: #something was wrong
                const.print_log.info('Deleting outdated cache file: {}'
                                     .format(fp))
                os.remove(self.file_path(var_name))
            return False
        
        # everything is okay
        data = pickle.load(in_handle)
        in_handle.close()
        if not isinstance(data, UngriddedData):
            raise TypeError('Unexpected data type stored in cache file, need '
                            'instance of UngriddedData, got {}'
                            .format(type(data)))
            
        self.loaded_data[var_name] = data
        logger.info('Successfully loaded data for {} from Cache'
                    .format(self.dataset_to_read))
        return True
    
    def write(self, data, var_name=None):
        """Write single-variable instance of UngriddedData to cache
        
        Parameters
        ----------
        data : UngriddedData
            object containing the data (possibly containing multiple variables)
        var_name : str, optional
            name of variable that is supposed to be stored (only required if
            input `data` contains more than one variable)
        """
        meta = self.cache_meta_info()
        
        if not isinstance(data, UngriddedData):
            raise TypeError('Invalid input, need instance of UngriddedData, '
                            'got {}'.format(type(data)))
        if len(data.contains_datasets) > 1:
            raise CacheWriteError('Input UngriddedData object contains '
                                  'datasets: {}. Can only write single '
                                  'dataset objects'
                                  .format(data.contains_datasets))
        if var_name is None:
            if len(data.contains_vars) > 1:
                raise CacheWriteError('Input UngriddedData object for {} contains '
                                      'more than one variable: {}. Please '
                                      'specify which variable should be '
                                      'cached'
                                      .format(self.reader.data_id,
                                              data.contains_vars))
            var_name = data.contains_vars[0]
        
        elif var_name not in data.contains_vars:
            raise CacheWriteError('Cannot write cache file: variable {} does '
                                  'not exist in input UngriddedData object'
                                  .format(var_name))
            
        if len(data.contains_vars) > 1:
            data = data.extract_var(var_name)
            
        fp = self.file_path(var_name)
        logger.info('Writing cache file: {}'.format(fp))
        success = True
        # OutHandle = gzip.open(c__cache_file, 'wb') # takes too much time
        out_handle = open(fp, 'wb')
        
        try:
            # write cache header
            pickle.dump(meta, out_handle, pickle.HIGHEST_PROTOCOL)
            # write data
            pickle.dump(data, out_handle, pickle.HIGHEST_PROTOCOL)
    
        except Exception as e:
            from pyaerocom import print_log
            print_log.exception('Failed to write cache: {}'.format(repr(e)))
            success = False
        finally:
            out_handle.close()
            if not success:
                os.remove(fp)
        if success:
            logger.info('Successfully wrote {} data ({}) to disk!'
                        .format(var_name, self.reader.data_id))
        
    def __str__(self):
        return 'Cache handler for {}'.format(self.reader.data_id)
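
A hedged usage sketch of the cache handler. `ReadEbas` is one concrete ReadUngriddedBase reader in pyaerocom.io; the variable name is illustrative:

from pyaerocom.io import ReadEbas

reader = ReadEbas()
cache = CacheHandlerUngridded(reader)

if cache.check_and_load('scatc550aer'):
    data = cache.loaded_data['scatc550aer']          # cache hit
else:
    data = reader.read(vars_to_retrieve='scatc550aer')
    cache.write(data, var_name='scatc550aer')        # refresh for next run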