Example #1
    def get_vars_supported(self, obs_id, vars_desired):
        """
        Filter an input list of variables by those supported for a given data ID

        Parameters
        ----------
        obs_id : str
            ID of observation network
        vars_desired : list
            List of variables that are desired

        Returns
        -------
        list
            list of variables that can be read through the input network

        """
        obs_vars = []
        if isinstance(vars_desired, str):
            vars_desired = [vars_desired]
        if obs_id in self.post_compute:
            # check if all required are accessible
            postinfo = self.post_compute[obs_id]
            for var in varlist_aerocom(vars_desired):
                if var not in postinfo['vars_supported']:
                    continue
                requires = postinfo['aux_requires'][var]
                all_good = True
                for ds, vars_required in requires.items():
                    if isinstance(vars_required, str):
                        vars_required = [vars_required]
                    vars_avail = self.get_vars_supported(ds, vars_required)
                    if len(vars_required) != len(vars_avail):
                        all_good = False
                        break
                if all_good:
                    obs_vars.append(var)

        else:
            # check if variable can be read from a dataset on disk
            _oreader = self.get_reader(obs_id)
            for var in varlist_aerocom(vars_desired):
                if _oreader.var_supported(var):
                    obs_vars.append(var)
        return obs_vars
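
A self-contained sketch of the recursive fall-through used above: post-computed variables resolve their own requirements, everything else falls back to the on-disk reader. The dictionaries and the network/variable names below are hypothetical stand-ins for ``self.post_compute`` and the registered readers.

# Hypothetical stand-ins for self.post_compute and the on-disk readers
POST_COMPUTE = {
    'NetA': {'vars_supported': ['ratio'],
             'aux_requires': {'ratio': {'NetB': ['num', 'den']}}}
}
ON_DISK = {'NetB': {'num', 'den'}}

def vars_supported(obs_id, vars_desired):
    if obs_id in POST_COMPUTE:
        info = POST_COMPUTE[obs_id]
        out = []
        for var in vars_desired:
            if var not in info['vars_supported']:
                continue
            # a post-computed variable is supported only if every required
            # input variable is itself supported (recursive check)
            if all(len(vars_supported(ds, req)) == len(req)
                   for ds, req in info['aux_requires'][var].items()):
                out.append(var)
        return out
    # fallback: check what the on-disk reader can provide
    return [v for v in vars_desired if v in ON_DISK.get(obs_id, set())]

print(vars_supported('NetA', ['ratio', 'other']))  # ['ratio']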
Example #2
    def read(self, vars_to_retrieve=None, files=None, first_file=None,
             last_file=None, file_pattern=None, common_meta=None):
        """Method that reads list of files as instance of :class:`UngriddedData`

        Parameters
        ----------
        vars_to_retrieve : :obj:`list` or similar, optional
            list containing variable IDs that are supposed to be read. If None,
            all variables in :attr:`PROVIDES_VARIABLES` are loaded
        files : :obj:`list`, optional
            list of files to be read. If None, then the file list is used that
            is returned on :func:`get_file_list`.
        first_file : :obj:`int`, optional
            index of first file in file list to read. If None, the very first
            file in the list is used. Note: is ignored if input parameter
            `file_pattern` is specified.
        last_file : :obj:`int`, optional
            index of last file in list to read. If None, the very last file
            in the list is used. Note: is ignored if input parameter
            `file_pattern` is specified.
        file_pattern : str, optional
            string pattern for file search (cf :func:`get_file_list`)
        common_meta : dict, optional
            dictionary that contains additional metadata shared for this
            network (assigned to each metadata block of the
            :class:`UngriddedData` object that is returned)

        Returns
        -------
        UngriddedData
            data object
        """
        if common_meta is None:
            common_meta = {}
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        elif isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]
        vars_to_retrieve = varlist_aerocom(vars_to_retrieve)
        if files is None:
            if len(self.files) == 0:
                self.get_file_list(pattern=file_pattern)
            files = self.files

        if file_pattern is None:
            if first_file is None:
                first_file = 0
            if last_file is None:
                last_file = len(files)

            files = files[first_file:last_file]

        self.read_failed = []

        data_obj = UngriddedData()
        meta_key = 0.0
        idx = 0

        #assign metadata object
        metadata = data_obj.metadata
        meta_idx = data_obj.meta_idx

        num_vars = len(vars_to_retrieve)
        num_files = len(files)
        print_log.info('Reading AERONET data')
        for i in tqdm(range(num_files)):

            _file = files[i]
            station_data = self.read_file(_file,
                                          vars_to_retrieve=vars_to_retrieve)
            # Fill the metadata dict
            # the location in the data set is time step dependent!
            # use the lat location here since we have to choose one location
            # in the time series plot
            meta = od()
            meta['var_info'] = od()
            meta.update(station_data.get_meta())
            #metadata[meta_key].update(station_data.get_station_coords())
            meta['data_id'] = self.data_id
            meta['ts_type'] = self.TS_TYPE
            #meta['variables'] = vars_to_retrieve
            if 'instrument_name' in station_data and station_data['instrument_name'] is not None:
                instr = station_data['instrument_name']
            else:
                instr = self.INSTRUMENT_NAME
            meta['instrument_name'] = instr
            meta['data_revision'] = self.data_revision
            meta['filename'] = _file

            meta.update(**common_meta)
            # list of indices of this station for each variable
            # not sure yet if we really need this or whether it speeds things up
            meta_idx[meta_key] = od()

            num_times = len(station_data['dtime'])

            #access array containing time stamps
            # TODO: check using index instead (even though not a problem here
            # since all Aerocom data files are of type timeseries)
            times = np.float64(station_data['dtime'])

            totnum = num_times * num_vars

            #check if size of data object needs to be extended
            if (idx + totnum) >= data_obj._ROWNO:
                #if totnum < data_obj._CHUNKSIZE, then the latter is used
                data_obj.add_chunk(totnum)

            for var_idx, var in enumerate(vars_to_retrieve):
                values = station_data[var]
                start = idx + var_idx * num_times
                stop = start + num_times

                #write common meta info for this station (data lon, lat and
                #altitude are set to station locations)
                data_obj._data[start:stop,
                               data_obj._LATINDEX] = station_data['latitude']
                data_obj._data[start:stop,
                               data_obj._LONINDEX] = station_data['longitude']
                data_obj._data[start:stop,
                               data_obj._ALTITUDEINDEX] = station_data['altitude']
                data_obj._data[start:stop,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[start:stop, data_obj._TIMEINDEX] = times
                data_obj._data[start:stop, data_obj._DATAINDEX] = values
                data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

                meta_idx[meta_key][var] = np.arange(start, stop)

                if var in station_data['var_info']:
                    if 'units' in station_data['var_info'][var]:
                        u = station_data['var_info'][var]['units']
                    elif 'unit' in station_data['var_info'][var]:
                        from pyaerocom.exceptions import MetaDataError
                        raise MetaDataError('Metadata attr unit is deprecated, '
                                            'please use units')
                    else:
                        u = self.DEFAULT_UNIT
                elif var in self.UNITS:
                    u = self.UNITS[var]
                else:
                    u = self.DEFAULT_UNIT
                meta['var_info'][var] = od(units=u)
                if var not in data_obj.var_idx:
                    data_obj.var_idx[var] = var_idx

            idx += totnum
            metadata[meta_key] = meta
            meta_key += 1.0

        # shorten data_obj._data to the right number of points
        data_obj._data = data_obj._data[:idx]
        #data_obj.data_revision[self.data_id] = self.data_revision
        self.data = data_obj
        return data_obj
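
The storage pattern at the heart of this method, a flat numpy array with fixed column roles that is grown in chunks and trimmed once reading is done, shown in isolation. This is a minimal sketch; the column indices, chunk size and data are illustrative and do not reproduce the real ``UngriddedData`` layout.

import numpy as np

METAKEY, TIME, DATA, VAR = 0, 1, 2, 3    # illustrative column roles
CHUNK = 4                                # illustrative chunk size
data = np.full((CHUNK, 4), np.nan)
rowno, idx = CHUNK, 0

times = np.array(['2020-01-01', '2020-01-02'],
                 dtype='datetime64[s]').astype(np.float64)
values = np.array([0.1, 0.2])

start, stop = idx, idx + len(values)
if stop >= rowno:                        # grow in chunks, like add_chunk
    data = np.vstack([data, np.full((CHUNK, 4), np.nan)])
    rowno = data.shape[0]

data[start:stop, METAKEY] = 0.0          # all rows point to metadata block 0
data[start:stop, TIME] = times
data[start:stop, DATA] = values
data[start:stop, VAR] = 0                # index of the (single) variable
idx = stop

data = data[:idx]                        # trim unused rows, as done above
print(data.shape)                        # (2, 4)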
Example #3
    def read(self, vars_to_retrieve=None, files=None, first_file=None,
             last_file=None, pattern=None, check_time=True, **kwargs):
        """Read data files into `UngriddedData` object

        Parameters
        ----------
        vars_to_retrieve : :obj:`list` or similar, optional
            list containing variable IDs that are supposed to be read. If None,
            all variables in :attr:`PROVIDES_VARIABLES` are loaded
        files : :obj:`list`, optional
            list of files to be read. If None, then the file list is used that
            is returned on :func:`get_file_list`.
        first_file : :obj:`int`, optional
            index of first file in file list to read. If None, the very first
            file in the list is used
        last_file : :obj:`int`, optional
            index of last file in list to read. If None, the very last file
            in the list is used
        pattern : str, optional
            string pattern for file search (cf :func:`get_file_list`)
        check_time : bool
            if True, check that all timestamps in each file lie within the
            period encoded in the filename
        **kwargs
            additional keyword arguments passed to :func:`read_file`

        Returns
        -------
        UngriddedData
            data object
        """
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        elif isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]

        # make sure to use AeroCom variable names in output data
        vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

        vars_to_read, vars_to_compute = self.check_vars_to_retrieve(vars_to_retrieve)

        if files is None:
            files = self.get_file_list(vars_to_read, pattern=pattern)
        elif isinstance(files, str):
            files = [files]

        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)

        files = files[first_file:last_file]

        data_obj = UngriddedData(num_points=1000000)

        meta_key = -1.0
        idx = 0

        #assign metadata object
        metadata = data_obj.metadata
        meta_idx = data_obj.meta_idx
        var_count_glob = -1
        rename = self.var_names_data_inv
        from tqdm import tqdm
        for i in tqdm(range(len(files))):
            _file = files[i]
            metafile = self.get_meta_filename(_file)
            var_to_read = metafile['var_name']
            begin = metafile['start']
            end = metafile['stop']

            var_read = rename[var_to_read]
            stats = self.read_file(_file, var_to_read=var_to_read,
                                   var_to_write=var_read, **kwargs)

            stats, added = self.compute_additional_vars(stats, vars_to_compute)
            if len(stats) == 0:
                const.logger.info('File {} does not contain any of the input '
                                  'variables {}'
                                  .format(_file, vars_to_retrieve))
            vars_avail = [var_read] + added
            vars_to_add = list(np.intersect1d(vars_to_retrieve, vars_avail))
            if len(vars_to_add) == 0:
                continue
            chunksize = 500000
            for stat in stats:
                meta_key += 1
                meta_idx[meta_key] = {}

                meta = stat['meta']
                vi = meta['var_info']

                meta['var_info'] = {}

                metadata[meta_key] = meta
                metadata[meta_key]['data_id'] = self.data_id
                # duplicate for now
                metadata[meta_key]['instrument_name'] = meta['measuring_instrument_name']
                statname = metadata[meta_key]['station_name']
                if '/' in statname:
                    statname = statname.replace('/', '-')
                metadata[meta_key]['station_name'] = statname

                times = stat['time'].astype('datetime64[s]')
                timenums = np.float64(times)

                if check_time and (begin > times[0] or end < times[-1]):
                    raise ValueError('Timestamps in {} are outside the '
                                     'period encoded in the filename'
                                     .format(_file))

                num_vars = len(vars_to_add)
                num_times = len(times)

                totnum = num_times * num_vars

                #check if size of data object needs to be extended
                if (idx + totnum) >= data_obj._ROWNO:
                    # grow the data array by a fixed chunk of `chunksize` rows
                    data_obj.add_chunk(chunksize)

                for j, var_to_write in enumerate(vars_to_add):
                    values = stat[var_to_write]

                    start = idx + j*num_times
                    stop = start + num_times

                    if var_to_write not in data_obj.var_idx:
                        var_count_glob += 1
                        var_idx = var_count_glob
                        data_obj.var_idx[var_to_write] = var_idx
                    else:
                        var_idx = data_obj.var_idx[var_to_write]

                    meta['var_info'][var_to_write] = vi[var_to_write]
                    #write common meta info for this station (data lon, lat and
                    #altitude are set to station locations)
                    data_obj._data[start:stop,
                                   data_obj._LATINDEX] = meta['latitude']
                    data_obj._data[start:stop,
                                   data_obj._LONINDEX] = meta['longitude']
                    data_obj._data[start:stop,
                                   data_obj._ALTITUDEINDEX] = meta['altitude']
                    data_obj._data[start:stop,
                                   data_obj._METADATAKEYINDEX] = meta_key

                    # write data to data object
                    data_obj._data[start:stop, data_obj._TIMEINDEX] = timenums

                    data_obj._data[start:stop, data_obj._DATAINDEX] = values

                    # add invalid measurements
                    invalid = stat['data_flagged'][var_to_write]
                    data_obj._data[start:stop, data_obj._DATAFLAGINDEX] = invalid

                    data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

                    meta_idx[meta_key][var_to_write] = np.arange(start, stop)

                idx += totnum

        data_obj._data = data_obj._data[:idx]
        data_obj._check_index()
        return data_obj
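
A minimal sketch of the ``var_idx`` bookkeeping above: each variable gets a stable integer index the first time it appears, which is then written into the ``_VARINDEX`` column for all of its rows. The variable names are illustrative.

# Minimal sketch of the data_obj.var_idx registry used above
var_idx_map = {}
var_count_glob = -1

def get_var_idx(var_to_write):
    global var_count_glob
    if var_to_write not in var_idx_map:
        var_count_glob += 1
        var_idx_map[var_to_write] = var_count_glob
    return var_idx_map[var_to_write]

print(get_var_idx('concpm10'))  # 0
print(get_var_idx('concpm25'))  # 1
print(get_var_idx('concpm10'))  # 0 (stable across files)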
Example #4
    def read_dataset(self,
                     dataset_to_read,
                     vars_to_retrieve=None,
                     only_cached=False,
                     **kwargs):
        """Read dataset into an instance of :class:`ReadUngridded`

        Parameters
        ----------
        dataset_to_read : str
            name of dataset
        vars_to_retrieve : str or list
            variable or list of variables to be imported
        only_cached : bool
            if True, then nothing is reloaded but only data is loaded that is
            available as cached objects (not recommended to use but may be
            used if working offline without connection to database)
        **kwargs
            additional reading constraints. If any are provided, caching is
            deactivated and the data will be read from disk.

        Returns
        -------
        UngriddedData
            data object
        """
        _caching = None
        if len(kwargs) > 0:
            _caching = const.CACHING
            const.CACHING = False

            print_log.info('Received additional reading constraints, '
                           'ignoring caching')

        reader = self.get_reader(dataset_to_read)

        if vars_to_retrieve is not None:
            # Note: self.vars_to_retrieve may be None as well, then
            # default variables of each network are read
            self.vars_to_retrieve = vars_to_retrieve

        if self.vars_to_retrieve is None:
            self.vars_to_retrieve = reader.PROVIDES_VARIABLES

        vars_to_retrieve = varlist_aerocom(self.vars_to_retrieve)

        # data_dir will be None in most cases, but can be specified when
        # creating the instance, by default, data_dir is inferred automatically
        # in the reading class, using database location
        data_dir = self._get_data_dir(dataset_to_read)
        if data_dir is not None:
            if not os.path.exists(data_dir):
                raise FileNotFoundError(
                    'Trying to read {} from specified data_dir {} failed. '
                    'Directory does not exist'.format(dataset_to_read,
                                                      data_dir))
            reader._dataset_path = data_dir
            const.print_log.info(
                'Reading {} from specified data location: {}'.format(
                    dataset_to_read, data_dir))

        # Since this interface enables to load multiple datasets, each of
        # which support a number of variables, here, only the variables are
        # considered that are supported by the dataset
        vars_available = [
            var for var in vars_to_retrieve if reader.var_supported(var)
        ]
        if len(vars_available) == 0:
            raise DataRetrievalError('None of the input variables ({}) is '
                                     'supported by the {} interface'.format(
                                         vars_to_retrieve, dataset_to_read))
        cache = CacheHandlerUngridded(reader)
        if not self.ignore_cache:
            # try to load each variable from an existing cache file
            for var in vars_available:
                try:
                    cache.check_and_load(var, force_use_outdated=only_cached)
                except Exception:
                    self.logger.exception(
                        'Fatal: compatibility error between old cache file '
                        'for {} and current version of code'.format(var))

        if not only_cached:
            vars_to_read = [
                v for v in vars_available if v not in cache.loaded_data
            ]
        else:
            vars_to_read = []

        data_read = None
        if len(vars_to_read) > 0:

            _loglevel = print_log.level
            print_log.setLevel(logging.INFO)
            data_read = reader.read(vars_to_read, **kwargs)
            print_log.setLevel(_loglevel)

            for var in vars_to_read:
                # write the cache file
                if not self.ignore_cache:
                    try:
                        cache.write(data_read, var)
                    except Exception as e:
                        _caching = False
                        print_log.warning(
                            'Failed to write to cache directory. '
                            'Error: {}. Deactivating caching in '
                            'pyaerocom'.format(repr(e)))

        if len(vars_to_read) == len(vars_available):
            data_out = data_read
        else:
            data_out = UngriddedData()
            for var in vars_available:
                if var in cache.loaded_data:
                    data_out.append(cache.loaded_data[var])
            if data_read is not None:
                data_out.append(data_read)

        if _caching is not None:
            const.CACHING = _caching
        return data_out
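
The caching logic above reads only variables that were not found in cache files, then merges the two sources. A sketch of that merge, with plain dicts standing in for ``cache.loaded_data``, ``reader.read`` and ``UngriddedData.append``; the variable names and payloads are illustrative.

loaded_cache = {'od550aer': 'cached'}           # stands in for cache.loaded_data
vars_available = ['od550aer', 'ang4487aer']

vars_to_read = [v for v in vars_available if v not in loaded_cache]
data_read = {v: 'fresh' for v in vars_to_read}  # stands in for reader.read()

if len(vars_to_read) == len(vars_available):
    data_out = data_read                        # nothing usable was cached
else:
    data_out = {}                               # stands in for UngriddedData()
    for v in vars_available:                    # append cached variables ...
        if v in loaded_cache:
            data_out[v] = loaded_cache[v]
    data_out.update(data_read)                  # ... then freshly read ones

print(data_out)  # {'od550aer': 'cached', 'ang4487aer': 'fresh'}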
Example #5
    def check_vars_to_retrieve(self, vars_to_retrieve):
        """Separate variables that are in file from those that are computed

        Some of the variables provided by this interface are not included in
        the data files but are computed within this class during data import
        (e.g. od550aer, ang4487aer).

        The latter may require additional parameters to be retrieved from the
        file, as specified in the class header (cf. attribute
        ``AUX_REQUIRES``).

        This function checks the input list that specifies all required
        variables and separates them into two lists, one that includes all
        variables that can be read from the files and a second list that
        specifies all variables that are computed in this class.

        Parameters
        ----------
        vars_to_retrieve : list
            all parameter names that are supposed to be loaded

        Returns
        -------
        tuple
            2-element tuple, containing

            - list: list containing all variables to be read
            - list: list containing all variables to be computed
        """
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        elif isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]
        # first, check if input variables are alias names, and replace
        vars_to_retrieve = varlist_aerocom(vars_to_retrieve)

        repeat = True
        while repeat:
            repeat, vars_to_retrieve = self._add_additional_vars(
                vars_to_retrieve)

        # unique list of all variables to be read, either because they were
        # requested directly or because they are required to compute one of
        # the output variables
        vars_to_retrieve = list(dict.fromkeys(vars_to_retrieve))

        # in the following, vars_to_retrieve is separated into two arrays, one
        # containing all variables that can be read from the files, and the
        # second containing all variables that are computed
        vars_to_read = []
        vars_to_compute = []

        for var in vars_to_retrieve:
            if var not in self.PROVIDES_VARIABLES:
                raise ValueError("Invalid variable {}".format(var))
            elif var in self.AUX_REQUIRES:
                vars_to_compute.append(var)
            else:
                vars_to_read.append(var)
        return (vars_to_read, vars_to_compute)
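
A self-contained sketch of the expansion-and-partition logic above: the ``while repeat`` loop mirrors the fixed-point expansion done via ``_add_additional_vars``, after which variables listed in ``AUX_REQUIRES`` go to the compute list and all others to the read list. The ``AUX_REQUIRES`` and ``PROVIDES_VARIABLES`` values below are illustrative, not the attributes of any real reader.

AUX_REQUIRES = {'ang4487aer': ['od440aer', 'od870aer']}
PROVIDES_VARIABLES = ['od440aer', 'od870aer', 'ang4487aer']

def check_vars(vars_to_retrieve):
    out = list(vars_to_retrieve)
    repeat = True
    while repeat:                        # expand until no new requirements
        repeat = False
        for var in list(out):
            for req in AUX_REQUIRES.get(var, []):
                if req not in out:
                    out.append(req)
                    repeat = True
    out = list(dict.fromkeys(out))       # unique, order preserving
    for var in out:
        if var not in PROVIDES_VARIABLES:
            raise ValueError('Invalid variable {}'.format(var))
    vars_to_read = [v for v in out if v not in AUX_REQUIRES]
    vars_to_compute = [v for v in out if v in AUX_REQUIRES]
    return vars_to_read, vars_to_compute

print(check_vars(['ang4487aer']))
# (['od440aer', 'od870aer'], ['ang4487aer'])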