def read(self, datasets_to_read=None, vars_to_retrieve=None, **kwargs):
    """Read observations

    Iterates over all datasets in :attr:`datasets_to_read`, calls
    :func:`read_dataset` and appends the result to the data object.

    Example
    -------
    >>> import pyaerocom.io.readungridded as pio
    >>> from pyaerocom import const
    >>> obj = pio.ReadUngridded(dataset_to_read=const.AERONET_SUN_V3L15_AOD_ALL_POINTS_NAME)
    >>> obj.read()
    >>> print(obj)
    >>> print(obj.metadata[0.]['latitude'])
    """
    if datasets_to_read is not None:
        self.datasets_to_read = datasets_to_read
    if vars_to_retrieve is not None:
        self.vars_to_retrieve = vars_to_retrieve

    data = UngriddedData()
    for ds in self.datasets_to_read:
        self.logger.info('Reading {} data'.format(ds))
        data.append(self.read_dataset(ds, vars_to_retrieve, **kwargs))
        self.logger.info('Successfully imported {} data'.format(ds))
    self.data = data
    return data
def read(self, datasets_to_read=None, vars_to_retrieve=None,
         only_cached=False, **kwargs):
    """Read observations

    Iterates over all datasets in :attr:`datasets_to_read`, calls
    :func:`read_dataset` and appends the result to the data object.

    Parameters
    ----------
    datasets_to_read : str or list
        data ID or list of all datasets to be imported
    vars_to_retrieve : str or list
        variable or list of variables to be imported
    only_cached : bool
        if True, then nothing is reloaded but only data is loaded that is
        available as cached objects (not recommended to use but may be
        used if working offline without connection to database)

    Example
    -------
    >>> import pyaerocom.io.readungridded as pio
    >>> from pyaerocom import const
    >>> obj = pio.ReadUngridded(dataset_to_read=const.AERONET_SUN_V3L15_AOD_ALL_POINTS_NAME)
    >>> obj.read()
    >>> print(obj)
    >>> print(obj.metadata[0.]['latitude'])
    """
    if datasets_to_read is not None:
        self.datasets_to_read = datasets_to_read
    if vars_to_retrieve is not None:
        self.vars_to_retrieve = vars_to_retrieve

    data = UngriddedData()
    for ds in self.datasets_to_read:
        read_vars = self._get_vars_to_retrieve(ds)
        self.logger.info('Reading {} data, variables: {}'.format(ds,
                                                                 read_vars))
        if ds in self.post_compute:
            data.append(self.read_dataset_post(ds, read_vars,
                                               only_cached=only_cached,
                                               **kwargs))
        else:
            data.append(self.read_dataset(ds, read_vars,
                                          only_cached=only_cached,
                                          **kwargs))
        self.logger.info('Successfully imported {} data'.format(ds))
    return data
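# --- Usage sketch (illustrative) ---
# A minimal example of driving the high-level read() above. The data ID
# 'AeronetSunV3Lev2.daily' and variable 'od550aer' are assumptions for
# illustration; any data ID known to pyaerocom works the same way.
from pyaerocom.io import ReadUngridded

reader = ReadUngridded()
data = reader.read(datasets_to_read='AeronetSunV3Lev2.daily',
                   vars_to_retrieve='od550aer')
print(data)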
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, file_pattern=None, common_meta=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    file_pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)
    common_meta : dict, optional
        dictionary that contains additional metadata shared for this
        network (assigned to each metadata block of the
        :class:`UngriddedData` object that is returned)

    Returns
    -------
    UngriddedData
        data object
    """
    if common_meta is None:
        common_meta = {}
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(pattern=file_pattern)
        files = self.files

    if file_pattern is None:
        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)
        files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_vars = len(vars_to_retrieve)
    num_files = len(files)

    print_log.info('Reading AERONET data')
    for i in tqdm(range(num_files)):
        _file = files[i]
        station_data = self.read_file(_file,
                                      vars_to_retrieve=vars_to_retrieve)
        # Fill the metadata dict.
        # Note: the location in the dataset is time-step dependent; use
        # the station location here since one location has to be chosen
        # for the time series plot.
        meta = od()
        meta['var_info'] = od()
        meta.update(station_data.get_meta())
        meta['data_id'] = self.data_id
        meta['ts_type'] = self.TS_TYPE
        if ('instrument_name' in station_data
                and station_data['instrument_name'] is not None):
            instr = station_data['instrument_name']
        else:
            instr = self.INSTRUMENT_NAME
        meta['instrument_name'] = instr
        meta['data_revision'] = self.data_revision
        meta['filename'] = _file
        meta.update(**common_meta)

        # this is a list with indices of this station for each variable
        # (not sure yet whether this is really needed or whether it
        # speeds things up)
        meta_idx[meta_key] = od()

        num_times = len(station_data['dtime'])

        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all AeroCom data files are of type timeseries)
        times = np.float64(station_data['dtime'])

        totnum = num_times * num_vars

        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)

        for var_idx, var in enumerate(vars_to_retrieve):
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times

            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['latitude']
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['longitude']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data['altitude']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key

            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

            meta_idx[meta_key][var] = np.arange(start, stop)

            if var in station_data['var_info']:
                if 'units' in station_data['var_info'][var]:
                    u = station_data['var_info'][var]['units']
                elif 'unit' in station_data['var_info'][var]:
                    from pyaerocom.exceptions import MetaDataError
                    raise MetaDataError('Metadata attr unit is deprecated, '
                                        'please use units')
                else:
                    u = self.DEFAULT_UNIT
            elif var in self.UNITS:
                u = self.UNITS[var]
            else:
                u = self.DEFAULT_UNIT

            meta['var_info'][var] = od(units=u)
            if not var in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx

        idx += totnum
        metadata[meta_key] = meta
        meta_key = meta_key + 1.

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    self.data = data_obj
    return data_obj
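# --- Usage sketch (illustrative) ---
# Minimal example of calling the AERONET read() above on a subset of the
# file list. The reader class name ReadAeronetSunV3 and the variable
# 'od550aer' are assumptions for illustration; any AERONET reading class
# exposing this read() signature works the same way.
from pyaerocom.io import ReadAeronetSunV3

reader = ReadAeronetSunV3()
data = reader.read(vars_to_retrieve='od550aer', first_file=0, last_file=10)
print(data)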
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, file_pattern=None, list_coda_paths=False,
         local_temp_dir=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    file_pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)
    list_coda_paths : bool
        if True, the coda data paths of the first file are printed and
        None is returned (no data is read)
    local_temp_dir : str, optional
        directory into which archive files are extracted temporarily.
        If None, :attr:`LOCAL_TMP_DIR` is used.

    Returns
    -------
    UngriddedData
        data object

    Example
    -------
    >>> import pyaerocom as pya
    >>> obj = pya.io.read_aeolus_l2a_data.ReadL2Data()
    >>> testfiles = []
    >>> testfiles.append('/lustre/storeB/project/fou/kl/admaeolus/data.rev.2A02/download/2018-12/01/AE_OPER_ALD_U_N_2A_20181201T033526026_005423993_001590_0001.TGZ')
    >>> data = obj.read(files=testfiles)
    >>> data = obj.read(files=testfiles, vars_to_retrieve='ec355aer')
    """
    import pathlib
    import tarfile
    import os
    import coda

    if local_temp_dir is None:
        local_temp_dir = self.LOCAL_TMP_DIR

    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(pattern=file_pattern)
        files = self.files

    if file_pattern is None:
        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)
        files = files[first_file:last_file]

    self.read_failed = []
    temp_files = {}

    data_obj = UngriddedData(num_points=self._COLNO,
                             chunksize=self._CHUNKSIZE)
    meta_key = 0.0
    idx = 0

    # check if the supplied file is a supported archive file (tar in this
    # case) and extract the files with supported suffixes to
    # const._cachedir
    non_archive_files = []
    for idx, _file in enumerate(sorted(files)):
        self.logger.info('file: {}'.format(_file))
        suffix = pathlib.Path(_file).suffix
        if suffix in self.SUPPORTED_ARCHIVE_SUFFIXES:
            temp = 'opening archive file; using {} as temp dir.'.format(
                local_temp_dir)
            self.logger.info(temp)
            # untar archive files first
            tarhandle = tarfile.open(_file)
            files_in_tar = tarhandle.getnames()
            for file_in_tar in files_in_tar:
                if pathlib.Path(file_in_tar).suffix in self.SUPPORTED_SUFFIXES:
                    # extract file to tmp path
                    member = tarhandle.getmember(file_in_tar)
                    temp = 'extracting file {}...'.format(member.name)
                    self.logger.info(temp)
                    tarhandle.extract(member, path=local_temp_dir,
                                      set_attrs=False)
                    extract_file = os.path.join(local_temp_dir, member.name)
                    non_archive_files.append(extract_file)
                    temp_files[extract_file] = True
            tarhandle.close()
        else:
            non_archive_files.append(_file)

    for idx, _file in enumerate(sorted(non_archive_files)):
        # list coda data paths in the 1st file in case the user asked for
        # that
        if idx == 0 and list_coda_paths:
            coda_handle = coda.open(_file)
            root_field_names = coda.get_field_names(coda_handle)
            for field in root_field_names:
                print(field)
            coda.close(coda_handle)
            data_obj = None
            return data_obj

        file_data = self.read_file(_file,
                                   vars_to_retrieve=vars_to_retrieve,
                                   loglevel=logging.INFO,
                                   return_as='numpy')
        self.logger.info('{} points read'.format(file_data.shape[0]))
        # the metadata dict is left empty for L2 data;
        # the location in the dataset is time-step dependent
        if idx == 0:
            data_obj._data = file_data
        else:
            data_obj._data = np.append(data_obj._data, file_data, axis=0)
        data_obj._idx = data_obj._data.shape[0] + 1
        file_data = None
        # remove the file if it was a temporary one
        if _file in temp_files:
            os.remove(_file)

    self.logger.info('size of data object: {}'.format(data_obj._idx - 1))
    return data_obj
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, pattern=None, check_time=True, **kwargs):
    """Read data files into :class:`UngriddedData` object

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used
    pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)
    check_time : bool
        if True, it is checked that the time stamps read from each file
        lie within the period encoded in the file name
    **kwargs
        additional keyword args passed on to :func:`read_file`

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    # make sure to use AeroCom variable names in output data
    vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

    vars_to_read, vars_to_compute = self.check_vars_to_retrieve(vars_to_retrieve)

    if files is None:
        files = self.get_file_list(vars_to_read, pattern=pattern)
    elif isinstance(files, str):
        files = [files]

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    data_obj = UngriddedData(num_points=1000000)

    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    var_count_glob = -1
    rename = self.var_names_data_inv
    from tqdm import tqdm
    for i in tqdm(range(len(files))):
        _file = files[i]
        metafile = self.get_meta_filename(_file)
        var_to_read = metafile['var_name']
        begin = metafile['start']
        end = metafile['stop']
        var_read = rename[var_to_read]
        stats = self.read_file(_file, var_to_read=var_to_read,
                               var_to_write=var_read, **kwargs)
        stats, added = self.compute_additional_vars(stats, vars_to_compute)
        if len(stats) == 0:
            const.logger.info('File {} does not contain any of the input '
                              'variables {}'.format(_file, vars_to_retrieve))
        vars_avail = [var_read] + added
        vars_to_add = list(np.intersect1d(vars_to_retrieve, vars_avail))
        if len(vars_to_add) == 0:
            continue
        chunksize = 500000
        for stat in stats:
            meta_key += 1
            meta_idx[meta_key] = {}

            meta = stat['meta']
            vi = meta['var_info']
            meta['var_info'] = {}

            metadata[meta_key] = meta
            metadata[meta_key]['data_id'] = self.data_id
            # duplicate for now
            metadata[meta_key]['instrument_name'] = meta['measuring_instrument_name']
            statname = metadata[meta_key]['station_name']
            if '/' in statname:
                statname = statname.replace('/', '-')
            metadata[meta_key]['station_name'] = statname

            times = stat['time'].astype('datetime64[s]')
            timenums = np.float64(times)
            if check_time and (begin > times[0] or end < times[-1]):
                raise ValueError('Something seems to be off with time '
                                 'dimension...')

            num_vars = len(vars_to_add)
            num_times = len(times)
            totnum = num_times * num_vars

            # check if size of data object needs to be extended
            if (idx + totnum) >= data_obj._ROWNO:
                # if totnum < data_obj._CHUNKSIZE, then the latter is used
                data_obj.add_chunk(chunksize)

            for j, var_to_write in enumerate(vars_to_add):
                values = stat[var_to_write]

                start = idx + j * num_times
                stop = start + num_times

                if not var_to_write in data_obj.var_idx:
                    var_count_glob += 1
                    var_idx = var_count_glob
                    data_obj.var_idx[var_to_write] = var_idx
                else:
                    var_idx = data_obj.var_idx[var_to_write]

                meta['var_info'][var_to_write] = vi[var_to_write]

                # write common meta info for this station (data lon, lat
                # and altitude are set to station locations)
                data_obj._data[start:stop,
                               data_obj._LATINDEX] = meta['latitude']
                data_obj._data[start:stop,
                               data_obj._LONINDEX] = meta['longitude']
                data_obj._data[start:stop,
                               data_obj._ALTITUDEINDEX] = meta['altitude']
                data_obj._data[start:stop,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[start:stop, data_obj._TIMEINDEX] = timenums
                data_obj._data[start:stop, data_obj._DATAINDEX] = values

                # add invalid measurements
                invalid = stat['data_flagged'][var_to_write]
                data_obj._data[start:stop,
                               data_obj._DATAFLAGINDEX] = invalid
                data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

                meta_idx[meta_key][var_to_write] = np.arange(start, stop)

            idx += totnum

    data_obj._data = data_obj._data[:idx]
    data_obj._check_index()
    return data_obj
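# --- Data-layout sketch (illustrative) ---
# The readers above all fill one flat 2D numpy array in which every row is
# a single measurement; the columns hold time, value, station coordinates
# and index keys. meta_idx maps (metadata key, variable name) to the rows
# of that station/variable, so a time series can be recovered directly.
# Here, data is any UngriddedData instance returned by one of the read()
# methods; the variable name 'od550aer' is an assumption for illustration.
rows = data.meta_idx[0.0]['od550aer']       # row indices of the first station
times = data._data[rows, data._TIMEINDEX]   # time stamps as float64
values = data._data[rows, data._DATAINDEX]  # measured values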
def read_dataset_post(self, dataset_to_read, vars_to_retrieve,
                      only_cached=False, **kwargs):
    """Read dataset into an instance of :class:`ReadUngridded`

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : list
        variable or list of variables to be imported
    only_cached : bool
        if True, then nothing is reloaded but only data is loaded that is
        available as cached objects (not recommended to use but may be
        used if working offline without connection to database)
    **kwargs
        additional reading constraints. If any are provided, caching is
        deactivated and the data will be read from disk.

    Returns
    -------
    UngriddedData
        data object
    """
    aux_info = self.post_compute[dataset_to_read]
    loaded = []
    for var in vars_to_retrieve:
        input_data_ids_vars = []
        aux_info_var = aux_info['aux_requires'][var]
        for aux_id, aux_vars in aux_info_var.items():
            if aux_id in self.post_compute:
                aux_data = self.read_dataset_post(dataset_to_read=aux_id,
                                                  vars_to_retrieve=aux_vars,
                                                  only_cached=only_cached,
                                                  **kwargs)
            else:
                aux_data = self.read_dataset(aux_id, aux_vars,
                                             only_cached=only_cached,
                                             **kwargs)
            for aux_var in aux_vars:
                input_data_ids_vars.append((aux_data, aux_id, aux_var))

        aux_merge_how = aux_info['aux_merge_how'][var]

        if var in aux_info['aux_units']:
            var_unit_out = aux_info['aux_units'][var]
        else:
            var_unit_out = None

        if aux_merge_how == 'eval':
            # function MUST be defined
            aux_fun = aux_info['aux_funs'][var]
        else:
            aux_fun = None

        merged_stats = combine_vardata_ungridded(
            data_ids_and_vars=input_data_ids_vars,
            merge_eval_fun=aux_fun,
            merge_how=aux_merge_how,
            var_name_out=var,
            var_unit_out=var_unit_out,
            data_id_out=aux_info['data_id'])

        loaded.append(UngriddedData.from_station_data(merged_stats))

    first = loaded[0]
    if len(loaded) == 1:
        return first
    for data in loaded[1:]:
        first.append(data)
    return first
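# --- Configuration sketch (illustrative) ---
# read_dataset_post() expects an entry in self.post_compute shaped roughly
# like the dict below. The data IDs, variable names and the eval expression
# are assumptions for illustration; the keys (data_id, aux_requires,
# aux_merge_how, aux_funs, aux_units) are the ones the method actually
# accesses.
post_compute_example = {
    'MyCombinedDataset': {
        'data_id': 'MyCombinedDataset',
        # which variables from which datasets are needed per output var
        'aux_requires': {
            'ratpm10pm25': {'EBASMC': ['concpm10', 'concpm25']}
        },
        # how to merge the inputs ('eval' requires an entry in aux_funs)
        'aux_merge_how': {'ratpm10pm25': 'eval'},
        'aux_funs': {
            'ratpm10pm25': '(EBASMC;concpm10)/(EBASMC;concpm25)'
        },
        'aux_units': {'ratpm10pm25': '1'},
    }
}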
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, file_pattern=None, list_coda_paths=False,
         local_temp_dir=None, return_as='numpy', apply_quality_flag=0.0):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    file_pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)
    list_coda_paths : bool
        if True, the coda data paths of the first file are printed and
        None is returned (no data is read)
    local_temp_dir : str, optional
        directory into which archive files are extracted temporarily.
        If None, :attr:`LOCAL_TMP_DIR` is used.
    return_as : str
        either 'numpy' (data is accumulated in one 2D array) or 'dict'
        (data is accumulated per field in a dict of arrays)
    apply_quality_flag : float
        if > 0, only data points whose quality flag is >= this value are
        supposed to be kept (note: the filtering is only partially
        implemented below)

    Returns
    -------
    UngriddedData
        data object

    Example
    -------
    >>> import pyaerocom as pya
    >>> obj = pya.io.read_aeolus_l2a_data.ReadL2Data()
    >>> testfiles = []
    >>> testfiles.append('/lustre/storeB/project/fou/kl/admaeolus/data.rev.2A02/download/2018-12/01/AE_OPER_ALD_U_N_2A_20181201T033526026_005423993_001590_0001.TGZ')
    >>> data = obj.read(files=testfiles)
    >>> data = obj.read(files=testfiles, vars_to_retrieve='ec355aer')
    """
    import pathlib
    import tarfile
    import os
    import coda

    if local_temp_dir is None:
        local_temp_dir = self.LOCAL_TMP_DIR

    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(pattern=file_pattern)
        files = self.files

    if file_pattern is None:
        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)
        files = files[first_file:last_file]

    self.read_failed = []
    temp_files = {}

    data_obj = UngriddedData(num_points=self._CHUNKSIZE)
    meta_key = 0.0
    idx = 0

    # check if the supplied file is a supported archive file (tar in this
    # case) and extract the files with supported suffixes to const.CACHEDIR
    non_archive_files = []
    for idx, _file in enumerate(sorted(files)):
        self.logger.info('file: {}'.format(_file))
        suffix = pathlib.Path(_file).suffix
        if suffix in self.SUPPORTED_ARCHIVE_SUFFIXES:
            temp = 'opening archive file; using {} as temp dir.'.format(
                local_temp_dir)
            self.logger.info(temp)
            # untar archive files first
            tarhandle = tarfile.open(_file)
            files_in_tar = tarhandle.getnames()
            for file_in_tar in files_in_tar:
                if pathlib.Path(file_in_tar).suffix in self.SUPPORTED_SUFFIXES:
                    # extract file to tmp path
                    member = tarhandle.getmember(file_in_tar)
                    temp = 'extracting file {}...'.format(member.name)
                    self.logger.info(temp)
                    tarhandle.extract(member, path=local_temp_dir,
                                      set_attrs=False)
                    extract_file = os.path.join(local_temp_dir, member.name)
                    non_archive_files.append(extract_file)
                    temp_files[extract_file] = True
            tarhandle.close()
        else:
            non_archive_files.append(_file)

    for idx, _file in enumerate(sorted(non_archive_files)):
        # list coda data paths in the 1st file in case the user asked for
        # that
        if idx == 0 and list_coda_paths:
            coda_handle = coda.open(_file)
            root_field_names = coda.get_field_names(coda_handle)
            for field in root_field_names:
                print(field)
            coda.close(coda_handle)
            data_obj = None
            return data_obj

        file_data = self.read_file(_file,
                                   vars_to_retrieve=vars_to_retrieve,
                                   loglevel=logging.INFO,
                                   return_as=return_as)
        if return_as == 'numpy':
            self.logger.info('{} points read'.format(file_data.shape[0]))
            # the metadata dict is left empty for L2 data;
            # the location in the dataset is time-step dependent
            if idx == 0:
                data_obj._data = file_data
            else:
                data_obj._data = np.append(data_obj._data, file_data,
                                           axis=0)
            data_obj._idx = data_obj._data.shape[0] + 1
            file_data = None
            # remove the file if it was a temporary one
            if _file in temp_files:
                os.remove(_file)
            self.logger.info('size of data object: {}'.format(
                data_obj._idx - 1))
        elif return_as == 'dict':
            if idx == 0:
                data_obj._data = {}
                shape_store = {}
                index_store = {}
                file_start_index_arr = [0]
                # apply quality flags
                if apply_quality_flag > 0.:
                    qflags = file_data[self._QANAME]
                    # np.where returns a tuple of index arrays
                    keep_indexes = np.where(qflags >= apply_quality_flag)[0]
                    elements_to_add = keep_indexes.size
                else:
                    keep_indexes = np.arange(0, len(file_data[self._QANAME]))
                    elements_to_add = file_data[self._QANAME].shape[0]

                for _key in file_data:
                    shape_store[_key] = file_data[_key].shape
                    index_store[_key] = file_data[_key].shape[0]
                    input_shape = list(file_data[_key].shape)
                    input_shape[0] = self._ROWNO
                    data_obj._data[_key] = np.empty(input_shape,
                                                    dtype=np.float64)
                    if len(input_shape) == 1:
                        data_obj._data[_key][0:file_data[_key].shape[0]] = \
                            file_data[_key]
                    elif len(input_shape) == 2:
                        data_obj._data[_key][0:file_data[_key].shape[0], :] = \
                            file_data[_key]
                    elif len(input_shape) == 3:
                        data_obj._data[_key][0:file_data[_key].shape[0], :, :] = \
                            file_data[_key]
                    elif len(input_shape) == 4:
                        data_obj._data[_key][0:file_data[_key].shape[0], :, :, :] = \
                            file_data[_key]
                    else:
                        pass
            else:
                # 2nd and subsequent files
                if apply_quality_flag > 0.:
                    qflags = file_data[self._QANAME]
                    keep_indexes = np.where(qflags >= apply_quality_flag)[0]
                    elements_to_add = keep_indexes.size

                file_start_index_arr.append(
                    file_data[self.TSSIZENAME].shape[0])
                for _key in file_data:
                    if _key in self.STATICFIELDNAMES:
                        print('key: {}'.format(_key))
                        continue
                    elements_to_add = file_data[_key].shape[0]
                    # extend data_obj._data[_key] if necessary
                    if (index_store[_key] + elements_to_add
                            > data_obj._data[_key].shape[0]):
                        current_shape = list(data_obj._data[_key].shape)
                        current_shape[0] = current_shape[0] + self._CHUNKSIZE
                        tmp_data = np.empty(current_shape, dtype=np.float64)
                        if len(current_shape) == 1:
                            tmp_data[0:data_obj._data[_key].shape[0]] = \
                                data_obj._data[_key]
                        elif len(current_shape) == 2:
                            tmp_data[0:data_obj._data[_key].shape[0], :] = \
                                data_obj._data[_key]
                        elif len(current_shape) == 3:
                            tmp_data[0:data_obj._data[_key].shape[0], :, :] = \
                                data_obj._data[_key]
                        elif len(current_shape) == 4:
                            tmp_data[0:data_obj._data[_key].shape[0], :, :, :] = \
                                data_obj._data[_key]
                        else:
                            pass
                        # swap in the grown array (without this, the
                        # extension above has no effect)
                        data_obj._data[_key] = tmp_data

                    input_shape = list(file_data[_key].shape)
                    if len(input_shape) == 1:
                        data_obj._data[_key][
                            index_store[_key]:index_store[_key]
                            + file_data[_key].shape[0]] = file_data[_key]
                    elif len(input_shape) == 2:
                        data_obj._data[_key][
                            index_store[_key]:index_store[_key]
                            + file_data[_key].shape[0], :] = file_data[_key]
                    elif len(input_shape) == 3:
                        data_obj._data[_key][
                            index_store[_key]:index_store[_key]
                            + file_data[_key].shape[0], :, :] = file_data[_key]
                    elif len(input_shape) == 4:
                        data_obj._data[_key][
                            index_store[_key]:index_store[_key]
                            + file_data[_key].shape[0], :, :, :] = file_data[_key]
                    else:
                        pass
                    index_store[_key] += elements_to_add

            file_data = None
            # remove the file if it was a temporary one
            if _file in temp_files:
                os.remove(_file)
        else:
            pass

    # now shorten the data dict to the necessary size
    if return_as == 'dict':
        for _key in data_obj._data:
            data_obj._data[_key] = data_obj._data[_key][:index_store[_key]]
        data_obj._data['file_indexes'] = file_start_index_arr

        # apply the quality flags
        if apply_quality_flag > 0.:
            # NOTE: quality-flag filtering is not implemented here yet
            pass

    return data_obj
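# --- Algorithm sketch (illustrative) ---
# The 'dict' branch above grows each per-field array chunk-wise instead of
# calling np.append per file (which reallocates the whole array on every
# call). A minimal standalone version of that strategy, with all names
# being assumptions:
import numpy as np

CHUNKSIZE = 1000

def append_chunked(store, used, new_block):
    """Append new_block rows to store, growing by CHUNKSIZE when full."""
    need = used + new_block.shape[0]
    while need > store.shape[0]:
        grown = np.empty((store.shape[0] + CHUNKSIZE,) + store.shape[1:],
                         dtype=store.dtype)
        grown[:used] = store[:used]
        store = grown
    store[used:need] = new_block
    return store, need

store = np.empty((CHUNKSIZE, 3))
used = 0
for _ in range(5):
    store, used = append_chunked(store, used, np.random.rand(400, 3))
store = store[:used]  # trim to the filled rows, as read() does at the end
print(store.shape)    # (2000, 3)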
def read_dataset(self, dataset_to_read, vars_to_retrieve=None,
                 only_cached=False, **kwargs):
    """Read dataset into an instance of :class:`ReadUngridded`

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : str or list
        variable or list of variables to be imported
    only_cached : bool
        if True, then nothing is reloaded but only data is loaded that is
        available as cached objects (not recommended to use but may be
        used if working offline without connection to database)
    **kwargs
        additional reading constraints. If any are provided, caching is
        deactivated and the data will be read from disk.

    Returns
    -------
    UngriddedData
        data object
    """
    _caching = None
    if len(kwargs) > 0:
        _caching = const.CACHING
        const.CACHING = False

        print_log.info('Received additional reading constraints, '
                       'ignoring caching')

    reader = self.get_reader(dataset_to_read)

    if vars_to_retrieve is not None:
        # Note: self.vars_to_retrieve may be None as well, then
        # default variables of each network are read
        self.vars_to_retrieve = vars_to_retrieve

    if self.vars_to_retrieve is None:
        self.vars_to_retrieve = reader.PROVIDES_VARIABLES

    vars_to_retrieve = varlist_aerocom(self.vars_to_retrieve)

    # data_dir will be None in most cases, but can be specified when
    # creating the instance; by default, data_dir is inferred
    # automatically in the reading class, using the database location
    data_dir = self._get_data_dir(dataset_to_read)
    if data_dir is not None:
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                'Trying to read {} from specified data_dir {} failed. '
                'Directory does not exist'.format(dataset_to_read,
                                                  data_dir))
        reader._dataset_path = data_dir
        const.print_log.info(
            'Reading {} from specified data location: {}'.format(
                dataset_to_read, data_dir))

    # Since this interface enables to load multiple datasets, each of
    # which supports a number of variables, only the variables that are
    # supported by the dataset are considered here
    vars_available = [var for var in vars_to_retrieve
                      if reader.var_supported(var)]
    if len(vars_available) == 0:
        raise DataRetrievalError('None of the input variables ({}) is '
                                 'supported by {} interface'
                                 .format(vars_to_retrieve, dataset_to_read))

    # initiate cache handler
    cache = CacheHandlerUngridded(reader)
    if not self.ignore_cache:
        for var in vars_available:
            try:
                cache.check_and_load(var, force_use_outdated=only_cached)
            except Exception:
                self.logger.exception('Fatal: compatibility error between '
                                      'old cache file for variable {} and '
                                      'current version of code'.format(var))

    if not only_cached:
        vars_to_read = [v for v in vars_available
                        if not v in cache.loaded_data]
    else:
        vars_to_read = []

    data_read = None
    if len(vars_to_read) > 0:
        _loglevel = print_log.level
        print_log.setLevel(logging.INFO)
        data_read = reader.read(vars_to_read, **kwargs)
        print_log.setLevel(_loglevel)

        for var in vars_to_read:
            # write the cache file
            if not self.ignore_cache:
                try:
                    cache.write(data_read, var)
                except Exception as e:
                    _caching = False
                    print_log.warning('Failed to write to cache directory. '
                                      'Error: {}. Deactivating caching in '
                                      'pyaerocom'.format(repr(e)))

    if len(vars_to_read) == len(vars_available):
        data_out = data_read
    else:
        data_out = UngriddedData()
        for var in vars_available:
            if var in cache.loaded_data:
                data_out.append(cache.loaded_data[var])
        if data_read is not None:
            data_out.append(data_read)

    if _caching is not None:
        const.CACHING = _caching
    return data_out
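# --- Usage sketch (illustrative) ---
# read_dataset() is normally invoked through read(), but can be called
# directly. The data ID 'EBASMC' and variable 'concpm10' are assumptions
# for illustration.
from pyaerocom.io import ReadUngridded

reader = ReadUngridded()
# first call reads from disk and writes a cache file per variable
data = reader.read_dataset('EBASMC', vars_to_retrieve='concpm10')
# subsequent calls load from cache; only_cached=True skips disk entirely
data = reader.read_dataset('EBASMC', vars_to_retrieve='concpm10',
                           only_cached=True)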
def read(self, vars_to_retrieve=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or :obj:`str`, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded

    Returns
    -------
    UngriddedData
        data object
    """
    files = self.get_file_list()

    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0
    varindex = -1

    # assign metadata object
    metadata = data_obj.metadata  # OrderedDict
    meta_idx = data_obj.meta_idx  # OrderedDict

    for file in files:
        filename = os.path.basename(file)
        if not filename in self.FILES_CONTAIN:
            raise IOError('Invalid file name {}, this should not '
                          'happen.'.format(filename))
        var_matches = [var for var in vars_to_retrieve
                       if var in self.FILES_CONTAIN[filename]]
        if len(var_matches) == 0:
            continue
        stat_list = self.read_file(file, vars_to_retrieve=var_matches)
        for stat in stat_list:
            metadata[meta_key] = OrderedDict()
            metadata[meta_key].update(stat.get_meta())
            metadata[meta_key].update(stat.get_station_coords())
            metadata[meta_key]['data_id'] = self.data_id
            metadata[meta_key]['ts_type'] = self.TS_TYPE
            if ('instrument_name' in stat
                    and stat['instrument_name'] is not None):
                instr = stat['instrument_name']
            else:
                instr = self.INSTRUMENT_NAME
            metadata[meta_key]['instrument_name'] = instr
            metadata[meta_key]['data_revision'] = self.data_revision

            # this is a list with indices of this station for each
            # variable (not sure yet whether this is really needed or
            # whether it speeds things up)
            meta_idx[meta_key] = OrderedDict()

            num_times = len(stat['dtime'])
            num_vars = len(stat['var_info'])
            temp_vars = list(stat['var_info'].keys())
            tconv = stat['dtime'].astype('datetime64[s]')
            times = np.float64(tconv)
            totnum = num_times * num_vars

            if (idx + totnum) >= data_obj._ROWNO:
                # note: this can raise an error since an empty array
                # cannot be multiplied with NaN
                data_obj.add_chunk(totnum)

            metadata[meta_key]['var_info'] = OrderedDict()
            for var_count, var in enumerate(temp_vars):
                values = stat[var]
                start = idx + var_count * num_times
                stop = start + num_times

                if not var in data_obj.var_idx:
                    varindex += 1
                    data_obj.var_idx[var] = varindex
                    var_idx = varindex
                else:
                    var_idx = data_obj.var_idx[var]

                metadata[meta_key]['var_info'][var] = stat['var_info'][var]

                data_obj._data[start:stop,
                               data_obj._LATINDEX] = stat['latitude']
                data_obj._data[start:stop,
                               data_obj._LONINDEX] = stat['longitude']
                data_obj._data[start:stop,
                               data_obj._ALTITUDEINDEX] = stat['altitude']
                data_obj._data[start:stop,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[start:stop, data_obj._TIMEINDEX] = times
                data_obj._data[start:stop, data_obj._DATAINDEX] = values
                data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

                meta_idx[meta_key][var] = np.arange(start, stop)

            meta_key += 1
            idx += totnum

    data_obj._data = data_obj._data[:idx]
    # sanity check
    data_obj._check_index()
    self.data = data_obj  # keep a reference on the reader instance
    return data_obj
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        List containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded.
    files : :obj:`list`, optional
        List of files to be read. If None, then the file list returned by
        :func:`get_file_list` is used.
    first_file : :obj:`int`, optional
        Index of the first file in `files` to be read. If None, the very
        first file in the list is used.
    last_file : :obj:`int`, optional
        Index of the last file in `files` to be read. If None, the very
        last file in the list is used.

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if files is None:
        if len(self.files) == 0:
            self.get_file_list()
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0

    # Assign metadata object and index
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    for i, _file in enumerate(files):
        station_data = self.read_file(_file,
                                      vars_to_retrieve=vars_to_retrieve)

        # only the variables in the file
        num_vars = len(station_data.var_info.keys())

        # Fill the metadata dict.
        # The location in the dataset is time-step dependent.
        metadata[meta_key] = od()
        metadata[meta_key].update(station_data.get_meta())
        metadata[meta_key].update(station_data.get_station_coords())
        metadata[meta_key]['variables'] = list(station_data.var_info.keys())
        if ('instrument_name' in station_data
                and station_data['instrument_name'] is not None):
            instr = station_data['instrument_name']
        else:
            instr = self.INSTRUMENT_NAME
        metadata[meta_key]['instrument_name'] = instr
        metadata[meta_key]['var_info'] = station_data['var_info']

        # List with indices of this station for each variable
        meta_idx[meta_key] = od()

        num_times = len(station_data['dtime'])
        totnum = num_times * num_vars

        # Check whether the size of the data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)

        for var_idx, var in enumerate(list(station_data.var_info)):
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times

            # Write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['latitude']
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['longitude']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data['altitude']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key
            data_obj._data[start:stop,
                           data_obj._DATAHEIGHTINDEX] = station_data['dataaltitude']
            data_obj._data[start:stop,
                           data_obj._DATAERRINDEX] = station_data['sd']
            data_obj._data[start:stop,
                           data_obj._DATAFLAGINDEX] = station_data['f']
            data_obj._data[start:stop,
                           data_obj._TIMEINDEX] = station_data['dtime']
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

            meta_idx[meta_key][var] = np.arange(start, stop)

            if not var in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx

        idx += totnum
        meta_key = meta_key + 1.

    # Shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    self.data = data_obj
    return data_obj
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, read_err=None, remove_outliers=True,
         pattern=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used
    read_err : bool
        if True, uncertainty data is also read (where available). If
        unspecified (None), then the default is used (cf. :attr:`READ_ERR`)
    remove_outliers : bool
        if True, outliers are removed while reading (passed on to
        :func:`read_file`)
    pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]
    if read_err is None:
        read_err = self.READ_ERR

    if files is None:
        if len(self.files) == 0:
            self.get_file_list(vars_to_retrieve, pattern=pattern)
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    col_idx = data_obj.index

    meta_key = -1.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_files = len(files)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1

    VAR_IDX = -1
    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print("Reading file {} of {} ({})".format(i + 1, num_files,
                                                      type(self).__name__))
        try:
            stat = self.read_file(_file,
                                  vars_to_retrieve=vars_to_retrieve,
                                  read_err=read_err,
                                  remove_outliers=remove_outliers)
            if not any([var in stat.vars_available
                        for var in vars_to_retrieve]):
                self.logger.info("Station {} contains none of the desired "
                                 "variables. Skipping station..."
                                 .format(stat.station_name))
                continue
            meta_key += 1

            # Fill the metadata dict.
            # Note: the location in the dataset is time-step dependent;
            # use the station location here since one location has to be
            # chosen for the time series plot.
            metadata[meta_key] = od()
            metadata[meta_key].update(stat.get_meta())
            for add_meta in self.KEEP_ADD_META:
                if add_meta in stat:
                    metadata[meta_key][add_meta] = stat[add_meta]
            metadata[meta_key]['data_revision'] = self.data_revision
            metadata[meta_key]['variables'] = []
            metadata[meta_key]['var_info'] = od()
            # this is a list with indices of this station for each
            # variable (not sure yet whether this is really needed or
            # whether it speeds things up)
            meta_idx[meta_key] = od()

            # single floating-point value
            time = stat.dtime[0]
            for var in stat.vars_available:
                if not var in data_obj.var_idx:
                    VAR_IDX += 1
                    data_obj.var_idx[var] = VAR_IDX

                var_idx = data_obj.var_idx[var]

                val = stat[var]
                metadata[meta_key]['var_info'][var] = vi = od()
                if isinstance(val, VerticalProfile):
                    altitude = val.altitude
                    data = val.data
                    add = len(data)
                    err = val.data_err
                    metadata[meta_key]['var_info']['altitude'] = via = od()

                    vi.update(val.var_info[var])
                    via.update(val.var_info['altitude'])
                else:
                    add = 1
                    altitude = np.nan
                    data = val
                    if var in stat.data_err:
                        err = stat.data_err[var]
                    else:
                        err = np.nan
                    vi.update(stat.var_info[var])
                stop = idx + add
                # check if size of data object needs to be extended
                if stop >= data_obj._ROWNO:
                    # if totnum < data_obj._CHUNKSIZE, then the latter
                    # is used
                    data_obj.add_chunk(add)

                # write common meta info for this station
                data_obj._data[idx:stop,
                               col_idx['latitude']] = stat['latitude']
                data_obj._data[idx:stop,
                               col_idx['longitude']] = stat['longitude']
                data_obj._data[idx:stop,
                               col_idx['altitude']] = stat['altitude']
                data_obj._data[idx:stop, col_idx['meta']] = meta_key

                # write data to data object
                data_obj._data[idx:stop, col_idx['time']] = time
                data_obj._data[idx:stop,
                               col_idx['stoptime']] = stat.stopdtime[0]
                data_obj._data[idx:stop, col_idx['data']] = data
                data_obj._data[idx:stop,
                               col_idx['dataaltitude']] = altitude
                data_obj._data[idx:stop, col_idx['varidx']] = var_idx

                if read_err:
                    data_obj._data[idx:stop, col_idx['dataerr']] = err

                if not var in meta_idx[meta_key]:
                    meta_idx[meta_key][var] = []
                meta_idx[meta_key][var].extend(list(range(idx, stop)))
                if not var in metadata[meta_key]['variables']:
                    metadata[meta_key]['variables'].append(var)

                idx += add

        except Exception as e:
            self.read_failed.append(_file)
            self.logger.exception('Failed to read file {} (ERR: {})'
                                  .format(os.path.basename(_file),
                                          repr(e)))

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    self.data = data_obj
    return data_obj
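# --- Data-layout note (illustrative) ---
# In the profile-capable reader above, a VerticalProfile variable occupies
# len(val.data) consecutive rows per time stamp (one per altitude level),
# while a scalar variable occupies a single row. A profile can be read
# back from the returned object like this ('data' is the returned
# UngriddedData; the variable name 'ec532aer' is an assumption):
rows = data.meta_idx[0.0]['ec532aer']
profile = data._data[rows, data.index['data']]         # one vertical profile
levels = data._data[rows, data.index['dataaltitude']]  # altitude levels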
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If None,
        all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used that
        is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    if files is None:
        if len(self.files) == 0:
            self.get_file_list()
        files = self.files

    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)

    files = files[first_file:last_file]

    self.read_failed = []

    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0

    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx

    num_vars = len(vars_to_retrieve)
    num_files = len(files)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1

    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print_log.info("Reading file {} of {} ({})".format(
                i, num_files, type(self).__name__))
        station_data = self.read_file(_file,
                                      vars_to_retrieve=vars_to_retrieve)
        # Fill the metadata dict.
        # Note: the location in the dataset is time-step dependent; use
        # the station location here since one location has to be chosen
        # for the time series plot.
        metadata[meta_key] = od()
        metadata[meta_key].update(station_data.get_meta())
        metadata[meta_key].update(station_data.get_station_coords())
        metadata[meta_key]['dataset_name'] = self.DATASET_NAME
        metadata[meta_key]['ts_type'] = self.TS_TYPE
        metadata[meta_key]['variables'] = vars_to_retrieve
        if ('instrument_name' in station_data
                and station_data['instrument_name'] is not None):
            instr = station_data['instrument_name']
        else:
            instr = self.INSTRUMENT_NAME
        metadata[meta_key]['instrument_name'] = instr

        # this is a list with indices of this station for each variable
        # (not sure yet whether this is really needed or whether it
        # speeds things up)
        meta_idx[meta_key] = od()

        num_times = len(station_data['dtime'])

        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all AeroCom data files are of type timeseries)
        times = np.float64(station_data['dtime'])

        totnum = num_times * num_vars

        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)

        for var_idx, var in enumerate(vars_to_retrieve):
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times

            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['stat_lat']
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['stat_lon']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data['stat_alt']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key

            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

            meta_idx[meta_key][var] = np.arange(start, stop)

            if not var in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx

        idx += totnum
        meta_key = meta_key + 1.

    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    data_obj.data_revision[self.DATASET_NAME] = self.data_revision
    self.data = data_obj
    return data_obj
def read_dataset(self, dataset_to_read, vars_to_retrieve=None, **kwargs):
    """Read dataset into an instance of :class:`ReadUngridded`

    Note
    ----
    This method does not assign the loaded data object to the class
    attribute :attr:`data` (only :func:`read` does)

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : list
        list of variables to be retrieved. If None (default), the default
        variables of each reading routine are imported

    Returns
    -------
    UngriddedData
        data object
    """
    _caching = None
    if len(kwargs) > 0:
        _caching = const.CACHING
        const.CACHING = False

        print_log.info('Received additional reading constraints, '
                       'ignoring caching')

    if vars_to_retrieve is None:
        # Note: self.vars_to_retrieve may be None as well, then
        # default variables of each network are read
        vars_to_retrieve = self.vars_to_retrieve

    reader = self.get_reader(dataset_to_read)

    if vars_to_retrieve is None:
        vars_to_retrieve = reader.PROVIDES_VARIABLES
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]

    # Since this interface enables to load multiple datasets, each of
    # which supports a number of variables, only the variables that are
    # supported by the dataset are considered here
    vars_available = [var for var in vars_to_retrieve
                      if var in reader.PROVIDES_VARIABLES]

    # initiate cache handler
    cache = CacheHandlerUngridded(reader)
    if not self.ignore_cache:
        for var in vars_available:
            try:
                cache.check_and_load(var_name=var)
            except Exception:
                self.logger.exception('Fatal: compatibility error between '
                                      'old cache file for variable {} and '
                                      'current version of code'.format(var))

    vars_to_read = [v for v in vars_available
                    if not v in cache.loaded_data]
    data_read = None
    if len(vars_to_read) > 0:
        _loglevel = print_log.level
        print_log.setLevel(logging.INFO)
        data_read = reader.read(vars_to_read, **kwargs)
        print_log.setLevel(_loglevel)

        for var in vars_to_read:
            # write the cache file
            if not self.ignore_cache:
                try:
                    cache.write(data_read, var)
                except Exception as e:
                    _caching = False
                    print_log.warning('Failed to write to cache directory. '
                                      'Error: {}. Deactivating caching in '
                                      'pyaerocom'.format(repr(e)))

    if len(vars_to_read) == len(vars_available):
        data_out = data_read
    else:
        data_out = UngriddedData()
        for var in vars_available:
            if var in cache.loaded_data:
                data_out.append(cache.loaded_data[var])
        if data_read is not None:
            data_out.append(data_read)

    if _caching is not None:
        const.CACHING = _caching
    return data_out
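# --- Usage note (illustrative) ---
# Passing any extra keyword argument through read_dataset() disables
# caching for that call, since cache files only represent an
# unconstrained read. Example (the data ID, variable and the constraint
# kwarg supported by the underlying network reader are assumptions here):
from pyaerocom.io import ReadUngridded

reader = ReadUngridded()
data = reader.read_dataset('AeronetSunV3Lev2.daily',
                           vars_to_retrieve='od550aer',
                           file_pattern='*Lev20*')  # triggers CACHING = False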