def get_vars_supported(self, obs_id, vars_desired):
    """
    Filter input list of variables by supported ones for a certain data ID

    Parameters
    ----------
    obs_id : str
        ID of observation network
    vars_desired : list
        List of variables that are desired

    Returns
    -------
    list
        list of variables that can be read through the input network
    """
    obs_vars = []
    if isinstance(vars_desired, str):
        vars_desired = [vars_desired]
    if obs_id in self.post_compute:
        # check if all required input variables are accessible
        postinfo = self.post_compute[obs_id]
        for var in varlist_aerocom(vars_desired):
            if var not in postinfo['vars_supported']:
                continue
            requires = postinfo['aux_requires'][var]
            all_good = True
            for ds, vars_required in requires.items():
                if isinstance(vars_required, str):
                    vars_required = [vars_required]
                vars_avail = self.get_vars_supported(ds, vars_required)
                if not len(vars_required) == len(vars_avail):
                    all_good = False
                    break
            if all_good:
                obs_vars.append(var)
    else:
        # check if variable can be read from a dataset on disk
        _oreader = self.get_reader(obs_id)
        for var in varlist_aerocom(vars_desired):
            if _oreader.var_supported(var):
                obs_vars.append(var)
    return obs_vars
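# Usage sketch (illustrative, not part of the class): assuming a configured
# ReadUngridded instance with access to the obs database; the obs ID and
# variable names below are examples.
#
#   from pyaerocom.io import ReadUngridded
#   reader = ReadUngridded()
#   avail = reader.get_vars_supported('AeronetSunV3Lev2.daily',
#                                     ['od550aer', 'ang4487aer'])
#   # -> subset of the requested variables that can either be read directly
#   #    from that network or post-computed from it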
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, file_pattern=None, common_meta=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded.
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file in
        the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    file_pattern : str, optional
        string pattern for file search (cf. :func:`get_file_list`)
    common_meta : dict, optional
        dictionary that contains additional metadata shared for this
        network (assigned to each metadata block of the
        :class:`UngriddedData` object that is returned)

    Returns
    -------
    UngriddedData
        data object
    """
    if common_meta is None:
        common_meta = {}
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(pattern=file_pattern)
        files = self.files

    if file_pattern is None:
        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)

        files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_vars = len(vars_to_retrieve)
    num_files = len(files)
    print_log.info('Reading AERONET data')
    for i in tqdm(range(num_files)):
        _file = files[i]
        station_data = self.read_file(_file,
                                      vars_to_retrieve=vars_to_retrieve)
        # Fill the metadata dict. Note that the location in the dataset is
        # time-step dependent! Use the lat location here since we have to
        # choose one location in the time series plot.
        meta = od()
        meta['var_info'] = od()
        meta.update(station_data.get_meta())
        #metadata[meta_key].update(station_data.get_station_coords())
        meta['data_id'] = self.data_id
        meta['ts_type'] = self.TS_TYPE
        #meta['variables'] = vars_to_retrieve
        if ('instrument_name' in station_data and
                station_data['instrument_name'] is not None):
            instr = station_data['instrument_name']
        else:
            instr = self.INSTRUMENT_NAME
        meta['instrument_name'] = instr
        meta['data_revision'] = self.data_revision
        meta['filename'] = _file

        meta.update(**common_meta)
        # this is a list with indices of this station for each variable;
        # not sure yet if we really need that or if it speeds up things
        meta_idx[meta_key] = od()

        num_times = len(station_data['dtime'])

        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all AeroCom data files are of type timeseries)
        times = np.float64(station_data['dtime'])

        totnum = num_times * num_vars

        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)

        for var_idx, var in enumerate(vars_to_retrieve):
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times

            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop, data_obj._LATINDEX] = \
                station_data['latitude']
            data_obj._data[start:stop, data_obj._LONINDEX] = \
                station_data['longitude']
            data_obj._data[start:stop, data_obj._ALTITUDEINDEX] = \
                station_data['altitude']
            data_obj._data[start:stop, data_obj._METADATAKEYINDEX] = \
                meta_key

            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

            meta_idx[meta_key][var] = np.arange(start, stop)

            if var in station_data['var_info']:
                if 'units' in station_data['var_info'][var]:
                    u = station_data['var_info'][var]['units']
                elif 'unit' in station_data['var_info'][var]:
                    from pyaerocom.exceptions import MetaDataError
                    raise MetaDataError('Metadata attr unit is deprecated, '
                                        'please use units')
                else:
                    u = self.DEFAULT_UNIT
            elif var in self.UNITS:
                u = self.UNITS[var]
            else:
                u = self.DEFAULT_UNIT

            meta['var_info'][var] = od(units=u)
            if var not in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx

        idx += totnum
        metadata[meta_key] = meta
        meta_key += 1.

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    #data_obj.data_revision[self.data_id] = self.data_revision
    self.data = data_obj

    return data_obj
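# Usage sketch (illustrative): this read method belongs to the AERONET
# reading base class; ReadAeronetSunV3 is one concrete pyaerocom reader,
# used here as an assumed example.
#
#   from pyaerocom.io import ReadAeronetSunV3
#   reader = ReadAeronetSunV3()
#   data = reader.read(vars_to_retrieve='od550aer',
#                      first_file=0, last_file=10)
#   # -> UngriddedData holding od550aer from the first ten station files,
#   #    with one metadata block (and meta_idx entry) per station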
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, pattern=None, check_time=True, **kwargs):
    """Read data files into :class:`UngriddedData` object

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded.
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used.
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file in
        the list is used.
    pattern : str, optional
        string pattern for file search (cf. :func:`get_file_list`)
    check_time : bool
        if True, the time stamps read from each file are validated against
        the start / stop times encoded in the file name
    **kwargs
        additional keyword args passed on to :func:`read_file`

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    # make sure to use AeroCom variable names in output data
    vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

    vars_to_read, vars_to_compute = self.check_vars_to_retrieve(
        vars_to_retrieve)

    if files is None:
        files = self.get_file_list(vars_to_read, pattern=pattern)
    elif isinstance(files, str):
        files = [files]

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    data_obj = UngriddedData(num_points=1000000)

    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    var_count_glob = -1
    rename = self.var_names_data_inv

    from tqdm import tqdm
    for i in tqdm(range(len(files))):
        _file = files[i]
        metafile = self.get_meta_filename(_file)
        var_to_read = metafile['var_name']
        begin = metafile['start']
        end = metafile['stop']

        var_read = rename[var_to_read]
        stats = self.read_file(_file, var_to_read=var_to_read,
                               var_to_write=var_read, **kwargs)

        stats, added = self.compute_additional_vars(stats, vars_to_compute)
        if len(stats) == 0:
            const.logger.info('File {} does not contain any of the input '
                              'variables {}'
                              .format(_file, vars_to_retrieve))

        vars_avail = [var_read] + added
        vars_to_add = list(np.intersect1d(vars_to_retrieve, vars_avail))
        if len(vars_to_add) == 0:
            continue

        chunksize = 500000
        for stat in stats:
            meta_key += 1
            meta_idx[meta_key] = {}

            meta = stat['meta']
            vi = meta['var_info']
            meta['var_info'] = {}

            metadata[meta_key] = meta
            metadata[meta_key]['data_id'] = self.data_id
            # duplicate for now
            metadata[meta_key]['instrument_name'] = \
                meta['measuring_instrument_name']

            statname = metadata[meta_key]['station_name']
            if '/' in statname:
                statname = statname.replace('/', '-')
            metadata[meta_key]['station_name'] = statname

            times = stat['time'].astype('datetime64[s]')
            timenums = np.float64(times)

            if check_time and (begin > times[0] or end < times[-1]):
                raise ValueError('Time stamps in file {} are outside of '
                                 'the period specified in the file name'
                                 .format(_file))

            num_vars = len(vars_to_add)
            num_times = len(times)
            totnum = num_times * num_vars

            # check if size of data object needs to be extended
            if (idx + totnum) >= data_obj._ROWNO:
                # if totnum < data_obj._CHUNKSIZE, then the latter is used
                data_obj.add_chunk(chunksize)

            for j, var_to_write in enumerate(vars_to_add):
                values = stat[var_to_write]

                start = idx + j * num_times
                stop = start + num_times

                if var_to_write not in data_obj.var_idx:
                    var_count_glob += 1
                    var_idx = var_count_glob
                    data_obj.var_idx[var_to_write] = var_idx
                else:
                    var_idx = data_obj.var_idx[var_to_write]

                meta['var_info'][var_to_write] = vi[var_to_write]

                # write common meta info for this station (data lon, lat
                # and altitude are set to station locations)
                data_obj._data[start:stop, data_obj._LATINDEX] = \
                    meta['latitude']
                data_obj._data[start:stop, data_obj._LONINDEX] = \
                    meta['longitude']
                data_obj._data[start:stop, data_obj._ALTITUDEINDEX] = \
                    meta['altitude']
                data_obj._data[start:stop, data_obj._METADATAKEYINDEX] = \
                    meta_key

                # write data to data object
                data_obj._data[start:stop, data_obj._TIMEINDEX] = timenums
                data_obj._data[start:stop, data_obj._DATAINDEX] = values

                # add invalid measurements
                invalid = stat['data_flagged'][var_to_write]
                data_obj._data[start:stop, data_obj._DATAFLAGINDEX] = \
                    invalid

                data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

                meta_idx[meta_key][var_to_write] = np.arange(start, stop)

            idx += totnum

    data_obj._data = data_obj._data[:idx]
    data_obj._check_index()
    return data_obj
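# Usage sketch (illustrative; the reader class name is an assumption, based
# on the flag-column handling above which matches pyaerocom's GHOST-style
# interface):
#
#   reader = ReadGhost()  # hypothetical instantiation
#   data = reader.read(vars_to_retrieve='concpm10', last_file=5)
#   # flagged values end up in the _DATAFLAGINDEX column of data._data and
#   # can be masked out in post-processing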
def read_dataset(self, dataset_to_read, vars_to_retrieve=None,
                 only_cached=False, **kwargs):
    """Read dataset into an instance of :class:`ReadUngridded`

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : str or list
        variable or list of variables to be imported
    only_cached : bool
        if True, then nothing is reloaded but only data is loaded that is
        available as cached objects (not recommended to use but may be
        used if working offline without connection to database)
    **kwargs
        additional reading constraints. If any are provided, caching is
        deactivated and the data will be read from disk.

    Returns
    -------
    UngriddedData
        data object
    """
    _caching = None
    if len(kwargs) > 0:
        _caching = const.CACHING
        const.CACHING = False

        print_log.info('Received additional reading constraints, '
                       'ignoring caching')

    reader = self.get_reader(dataset_to_read)

    if vars_to_retrieve is not None:
        # Note: self.vars_to_retrieve may be None as well, in which case
        # the default variables of each network are read
        self.vars_to_retrieve = vars_to_retrieve

    if self.vars_to_retrieve is None:
        self.vars_to_retrieve = reader.PROVIDES_VARIABLES

    vars_to_retrieve = varlist_aerocom(self.vars_to_retrieve)

    # data_dir will be None in most cases, but can be specified when
    # creating the instance; by default, data_dir is inferred
    # automatically in the reading class, using the database location
    data_dir = self._get_data_dir(dataset_to_read)
    if data_dir is not None:
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                'Trying to read {} from specified data_dir {} failed. '
                'Directory does not exist'.format(dataset_to_read,
                                                  data_dir))
        reader._dataset_path = data_dir
        const.print_log.info(
            'Reading {} from specified data location: {}'.format(
                dataset_to_read, data_dir))

    # Since this interface enables loading of multiple datasets, each of
    # which supports a number of variables, only the variables that are
    # supported by the dataset are considered here
    vars_available = [
        var for var in vars_to_retrieve if reader.var_supported(var)
    ]
    if len(vars_available) == 0:
        raise DataRetrievalError('None of the input variables ({}) is '
                                 'supported by {} interface'.format(
                                     vars_to_retrieve, dataset_to_read))
    # initiate cache handler
    cache = CacheHandlerUngridded(reader)
    if not self.ignore_cache:
        for var in vars_available:
            try:
                cache.check_and_load(var, force_use_outdated=only_cached)
            except Exception:
                self.logger.exception(
                    'Fatal: compatibility error between old cache file '
                    'and current version of code')

    if not only_cached:
        vars_to_read = [
            v for v in vars_available if v not in cache.loaded_data
        ]
    else:
        vars_to_read = []

    data_read = None
    if len(vars_to_read) > 0:
        _loglevel = print_log.level
        print_log.setLevel(logging.INFO)
        data_read = reader.read(vars_to_read, **kwargs)
        print_log.setLevel(_loglevel)

        for var in vars_to_read:
            # write the cache file
            if not self.ignore_cache:
                try:
                    cache.write(data_read, var)
                except Exception as e:
                    _caching = False
                    print_log.warning(
                        'Failed to write to cache directory. '
                        'Error: {}. Deactivating caching in '
                        'pyaerocom'.format(repr(e)))

    if len(vars_to_read) == len(vars_available):
        data_out = data_read
    else:
        data_out = UngriddedData()
        for var in vars_available:
            if var in cache.loaded_data:
                data_out.append(cache.loaded_data[var])
        if data_read is not None:
            data_out.append(data_read)

    if _caching is not None:
        const.CACHING = _caching
    return data_out
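# Usage sketch (illustrative): reading one network through the high-level
# interface; a repeated call with identical arguments would then be served
# from the cache files written here (unless ignore_cache is set). The obs
# ID below is an example.
#
#   from pyaerocom.io import ReadUngridded
#   reader = ReadUngridded()
#   data = reader.read_dataset('AeronetSunV3Lev2.daily',
#                              vars_to_retrieve=['od550aer'])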
def check_vars_to_retrieve(self, vars_to_retrieve):
    """Separate variables that are in file from those that are computed

    Some of the variables provided by this interface are not included in
    the data files but are computed within this class during data import
    (e.g. od550aer, ang4487aer). The latter may require additional
    parameters to be retrieved from the file, which is specified in the
    class header (cf. attribute ``AUX_REQUIRES``).

    This function checks the input list that specifies all required
    variables and separates them into two lists, one that includes all
    variables that can be read from the files and a second list that
    specifies all variables that are computed in this class.

    Parameters
    ----------
    vars_to_retrieve : list
        all parameter names that are supposed to be loaded

    Returns
    -------
    tuple
        2-element tuple, containing

        - list: list containing all variables to be read
        - list: list containing all variables to be computed
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    # first, check if input variables are alias names, and replace
    vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

    repeat = True
    while repeat:
        repeat, vars_to_retrieve = self._add_additional_vars(
            vars_to_retrieve)

    # unique list containing all variables that are supposed to be read,
    # either because they are required to be retrieved, or because they
    # are needed to compute one of the output variables
    vars_to_retrieve = list(dict.fromkeys(vars_to_retrieve))

    # in the following, vars_to_retrieve is separated into two lists, one
    # containing all variables that can be read from the files, and the
    # second containing all variables that are computed
    vars_to_read = []
    vars_to_compute = []

    for var in vars_to_retrieve:
        if var not in self.PROVIDES_VARIABLES:
            raise ValueError('Invalid variable {}'.format(var))
        elif var in self.AUX_REQUIRES:
            vars_to_compute.append(var)
        else:
            vars_to_read.append(var)
    return (vars_to_read, vars_to_compute)
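# Usage sketch (illustrative): for a reader whose AUX_REQUIRES maps, e.g.,
# ang4487aer to two AOD input variables, the split could look like this
# (the exact read list depends on the resolution in _add_additional_vars):
#
#   to_read, to_compute = reader.check_vars_to_retrieve(
#       ['od550aer', 'ang4487aer'])
#   # to_read    -> e.g. ['od550aer', 'od440aer', 'od870aer']
#   # to_compute -> ['ang4487aer']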