Exemplo n.º 1
0
def test_init_shape():
    """Check the array shape of UngriddedData after init and add_chunk."""
    # default construction: 10000 rows, 12 standard columns
    npt.assert_array_equal(UngriddedData().shape, (10000, 12))

    # two extra columns requested -> 14 columns total
    data = UngriddedData(num_points=2, add_cols=['bla', 'blub'])
    npt.assert_array_equal(data.shape, (2, 14))

    # adding a chunk grows the row dimension by the requested amount
    data.add_chunk(1112)
    npt.assert_array_equal(data.shape, (1114, 14))
Exemplo n.º 2
0
def test_coordinate_access():
    """Check per-station coordinate access properties of UngriddedData."""
    import string
    d = UngriddedData()

    stat_names = list(string.ascii_lowercase)
    num = len(stat_names)
    lons = np.arange(num)
    lats = np.arange(num) - 90
    alts = np.arange(num) * 13

    # fill one metadata block per station
    for i, (name, lat, lon, alt) in enumerate(zip(stat_names, lats,
                                                  lons, alts)):
        d.metadata[i] = dict(data_id='testcase',
                             station_name=name,
                             latitude=lat,
                             longitude=lon,
                             altitude=alt)

    import numpy.testing as npt

    npt.assert_array_equal(d.station_name, stat_names)
    npt.assert_array_equal(d.latitude, lats)
    npt.assert_array_equal(d.longitude, lons)
    npt.assert_array_equal(d.altitude, alts)

    # station 'a' has metadata but no variable data, so conversion to
    # StationData must raise DataCoverageError
    raised = False
    try:
        d.to_station_data('a')
    except DataCoverageError:
        raised = True

    assert raised

    coords = d.station_coordinates
    npt.assert_array_equal(coords['station_name'], stat_names)
    npt.assert_array_equal(coords['latitude'], lats)
    npt.assert_array_equal(coords['longitude'], lons)
    npt.assert_array_equal(coords['altitude'], alts)
Exemplo n.º 3
0
    def read(self, vars_to_retrieve=None, first_file=None, last_file=None):
        """Read files into a single :class:`UngriddedData` object.

        Parameters
        ----------
        vars_to_retrieve : str or list, optional
            variable name(s) to read; a single string is wrapped in a list
        first_file : int, optional
            index of the first file to read (defaults to 0)
        last_file : int, optional
            index after the last file to read (defaults to all files)

        Returns
        -------
        UngriddedData
            data object holding all stations read
        """
        # accept a single variable name for convenience
        if isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]

        all_files = self.get_file_list()
        start = 0 if first_file is None else first_file
        stop = len(all_files) if last_file is None else last_file

        stats = self._read_files(all_files[start:stop], vars_to_retrieve)
        return UngriddedData.from_station_data(stats)
Exemplo n.º 4
0
 def read(self, vars_to_retrieve=None, files=None, first_file=None, 
          last_file=None, read_err=None, remove_outliers=True,
          file_pattern=None):
     """Method that reads list of files as instance of :class:`UngriddedData`
     
     Parameters
     ----------
     vars_to_retrieve : :obj:`list` or similar, optional,
         list containing variable IDs that are supposed to be read. If None, 
         all variables in :attr:`PROVIDES_VARIABLES` are loaded
     files : :obj:`list`, optional
         list of files to be read. If None, then the file list is used that
         is returned on :func:`get_file_list`.
     first_file : :obj:`int`, optional
         index of first file in file list to read. If None, the very first
         file in the list is used
     last_file : :obj:`int`, optional
         index of last file in list to read. If None, the very last file 
         in the list is used
     read_err : bool
         if True, uncertainty data is also read (where available). If 
         unspecified (None), then the default is used (cf. :attr:`READ_ERR`)
     file_pattern : str, optional
         string pattern for file search (cf :func:`get_file_list`)
         
     Returns
     -------
     UngriddedData
         data object
     """
     if vars_to_retrieve is None:
         vars_to_retrieve = self.DEFAULT_VARS
     elif isinstance(vars_to_retrieve, str):
         vars_to_retrieve = [vars_to_retrieve]
     if read_err is None:
         read_err = self.READ_ERR
         
     if files is None:
         if len(self.files) == 0:
             self.get_file_list(vars_to_retrieve, file_pattern=file_pattern)
         files = self.files
 
     if first_file is None:
         first_file = 0
     if last_file is None:
         last_file = len(files)
     
     files = files[first_file:last_file]
     
     self.read_failed = []
     
     data_obj = UngriddedData()
     col_idx = data_obj.index
     meta_key = -1.0
     idx = 0
     
     # assign metadata object
     metadata = data_obj.metadata
     meta_idx = data_obj.meta_idx
     
     num_files = len(files)
     
     # display progress roughly every 10% of the files
     disp_each = int(num_files*0.1)
     if disp_each < 1:
         disp_each = 1
     
     VAR_IDX = -1
     for i, _file in enumerate(files):
         if i%disp_each == 0:
             print("Reading file {} of {} ({})".format(i+1, 
                              num_files, type(self).__name__))
         try:
             stat = self.read_file(_file, 
                                   vars_to_retrieve=vars_to_retrieve,
                                   read_err=read_err, 
                                   remove_outliers=remove_outliers)
             if not any([var in stat.contains_vars for var in 
                         vars_to_retrieve]):
                 self.logger.info("Station {} contains none of the desired "
                                  "variables. Skipping station..."
                                  .format(stat.station_name))
                 continue
             meta_key += 1
             # Fill the metadata dict; the location in the data set is time
             # step dependent, so the station location is used as single
             # reference location for this metadata block
             metadata[meta_key] = od()
             metadata[meta_key].update(stat.get_meta())
             for add_meta in self.KEEP_ADD_META:
                 if add_meta in stat:
                     metadata[meta_key][add_meta] = stat[add_meta]
             metadata[meta_key]['data_revision'] = self.data_revision
             metadata[meta_key]['variables'] = []
             metadata[meta_key]['var_info'] = od()
             # list with indices of this station for each variable
             meta_idx[meta_key] = od()
             
             # Is floating point single value
             time = stat.dtime[0]
             for var in stat.contains_vars:
                 # assign a global column index for each new variable
                 if not var in data_obj.var_idx:
                     VAR_IDX +=1
                     data_obj.var_idx[var] = VAR_IDX
                 
                 var_idx = data_obj.var_idx[var]
                     
                 val = stat[var]
                 metadata[meta_key]['var_info'][var] = vi = od()
                 if isinstance(val, VerticalProfile):
                     altitude = val.altitude
                     data = val.data
                     add = len(data)
                     err = val.data_err
                     metadata[meta_key]['var_info']['altitude'] = via = od()
                     
                     vi.update(val.var_info[var])
                     via.update(val.var_info['altitude'])
                 else:
                     add = 1
                     altitude = np.nan
                     data = val
                     # BUGFIX: read from stat.data_err (the mapping that is
                     # tested for membership); previously stat.err[var] was
                     # accessed, inconsistent with the membership check
                     if var in stat.data_err:
                         err = stat.data_err[var]
                     else:
                         err = np.nan
                 vi.update(stat.var_info[var])
                 stop = idx + add
                 # check if size of data object needs to be extended
                 if stop >= data_obj._ROWNO:
                     # if totnum < data_obj._CHUNKSIZE, then the latter is used
                     data_obj.add_chunk(add)
                 
                 # write common meta info for this station
                 data_obj._data[idx:stop, 
                                col_idx['latitude']] = stat['latitude']
                 data_obj._data[idx:stop, 
                                col_idx['longitude']] = stat['longitude']
                 data_obj._data[idx:stop, 
                                col_idx['altitude']] = stat['altitude']
                 data_obj._data[idx:stop, 
                                col_idx['meta']] = meta_key
                                
                 # write data to data object
                 data_obj._data[idx:stop, col_idx['time']] = time
                 data_obj._data[idx:stop, col_idx['stoptime']] = stat.stopdtime[0]
                 data_obj._data[idx:stop, col_idx['data']] = data
                 data_obj._data[idx:stop, col_idx['dataaltitude']] = altitude
                 data_obj._data[idx:stop, col_idx['varidx']] = var_idx
                 
                 if read_err:
                     data_obj._data[idx:stop, col_idx['dataerr']] = err
                 
                 if not var in meta_idx[meta_key]:
                     meta_idx[meta_key][var] = []
                 meta_idx[meta_key][var].extend(list(range(idx, stop)))
                 
                 if not var in metadata[meta_key]['variables']:
                     metadata[meta_key]['variables'].append(var)
                 
                 idx += add
                 
         except Exception as e:
             self.read_failed.append(_file)
             self.logger.exception('Failed to read file {} (ERR: {})'
                                   .format(os.path.basename(_file),
                                           repr(e)))
             
     # shorten data_obj._data to the right number of points
     data_obj._data = data_obj._data[:idx]
     self.data = data_obj
     return data_obj
Exemplo n.º 5
0
    def read(self,
             vars_to_retrieve=None,
             files=None,
             first_file=None,
             last_file=None):
        """Method that reads list of files as instance of :class:`UngriddedData`
        
        Parameters
        ----------
        vars_to_retrieve : :obj:`list` or similar, optional,
            list containing variable IDs that are supposed to be read. If None, 
            all variables in :attr:`PROVIDES_VARIABLES` are loaded
        files : :obj:`list`, optional
            list of files to be read. If None, then the file list is used that
            is returned on :func:`get_file_list`.
        first_file : :obj:`int`, optional
            index of first file in file list to read. If None, the very first
            file in the list is used
        last_file : :obj:`int`, optional
            index of last file in list to read. If None, the very last file 
            in the list is used
            
        Returns
        -------
        UngriddedData
            data object
        """
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        elif isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]

        if files is None:
            if len(self.files) == 0:
                self.get_file_list()
            files = self.files

        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)

        files = files[first_file:last_file]

        self.read_failed = []

        data_obj = UngriddedData()
        meta_key = -1.0
        idx = 0

        # assign metadata object
        metadata = data_obj.metadata
        meta_idx = data_obj.meta_idx

        last_stat_code = ''
        num_files = len(files)
        for i, _file in enumerate(files):
            self.logger.info('File {} ({})'.format(i, num_files))
            try:
                station_data = self.read_file(
                    _file, vars_to_retrieve=vars_to_retrieve)
                if not any([
                        var in station_data.contains_vars
                        for var in vars_to_retrieve
                ]):
                    self.logger.info("Station {} contains none of the desired "
                                     "variables. Skipping station...".format(
                                         station_data.station_name))
                    continue
                stat_code = station_data['stat_code']
                # one metadata block per station code; consecutive files of
                # the same station share the same block
                if last_stat_code != stat_code:
                    meta_key += 1
                    # Fill the metadata dict; the location in the data set is
                    # time step dependent, so the station coordinates are used
                    # as single reference location
                    metadata[meta_key] = od()
                    metadata[meta_key].update(station_data.get_meta())
                    metadata[meta_key].update(
                        station_data.get_station_coords())
                    metadata[meta_key]['dataset_name'] = self.DATASET_NAME
                    metadata[meta_key]['variables'] = []
                    # list with indices of this station for each variable
                    meta_idx[meta_key] = od()
                    last_stat_code = stat_code

                # Is floating point single value
                time = station_data.dtime
                for var_idx, var in enumerate(station_data.contains_vars):
                    val = station_data[var]
                    if isinstance(val, VerticalProfile):
                        add = len(val)
                        altitude = val.altitude
                        data = val.data
                    else:
                        add = 1
                        altitude = np.nan
                        data = val
                    stop = idx + add
                    # check if size of data object needs to be extended
                    if stop >= data_obj._ROWNO:
                        # if totnum < data_obj._CHUNKSIZE, then the latter is used
                        data_obj.add_chunk(add)

                    # write common meta info for this station
                    data_obj._data[
                        idx:stop,
                        data_obj._LATINDEX] = station_data['latitude']
                    data_obj._data[
                        idx:stop,
                        data_obj._LONINDEX] = station_data['longitude']
                    data_obj._data[
                        idx:stop,
                        data_obj._ALTITUDEINDEX] = station_data['altitude']
                    data_obj._data[idx:stop,
                                   data_obj._METADATAKEYINDEX] = meta_key

                    # write data to data object
                    data_obj._data[idx:stop, data_obj._TIMEINDEX] = time
                    data_obj._data[idx:stop, data_obj._DATAINDEX] = data
                    data_obj._data[idx:stop,
                                   data_obj._DATAHEIGHTINDEX] = altitude
                    data_obj._data[idx:stop, data_obj._VARINDEX] = var_idx

                    if not var in meta_idx[meta_key]:
                        meta_idx[meta_key][var] = []
                    meta_idx[meta_key][var].extend(list(range(idx, stop)))

                    if not var in metadata[meta_key]['variables']:
                        metadata[meta_key]['variables'].append(var)
                    if not var in data_obj.var_idx:
                        data_obj.var_idx[var] = var_idx
                    idx += add
            # BUGFIX: catch Exception instead of a bare except so that
            # KeyboardInterrupt / SystemExit are not swallowed
            except Exception:
                self.read_failed.append(_file)
                self.logger.exception('Failed to read file {}'.format(
                    os.path.basename(_file)))

        # shorten data_obj._data to the right number of points
        data_obj._data = data_obj._data[:idx]
        data_obj.data_revision[self.DATASET_NAME] = self.data_revision
        self.data = data_obj
        return data_obj
Exemplo n.º 6
0
    def read(self, vars_to_retrieve=None, first_file=None, last_file=None):
        """Method that reads list of files as instance of :class:`UngriddedData`
        
        Parameters
        ----------
        vars_to_retrieve : :obj:`list` or similar, optional,
            list containing variable IDs that are supposed to be read. If None, 
            all variables in :attr:`PROVIDES_VARIABLES` are loaded
        first_file : :obj:`int`, optional
            index of first file in file list to read. If None, the very first
            file in the list is used
        last_file : :obj:`int`, optional
            index of last file in list to read. If None, the very last file 
            in the list is used
            
        Returns
        -------
        UngriddedData
            data object
        """
        if vars_to_retrieve is None:
            vars_to_retrieve = self.DEFAULT_VARS
        elif isinstance(vars_to_retrieve, str):
            vars_to_retrieve = [vars_to_retrieve]

        if len(self.files) == 0:
            self.get_file_list(vars_to_retrieve)
        files = self.files

        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)

        files = files[first_file:last_file]
        files_contain = self.files_contain[first_file:last_file]
        self.read_failed = []

        data_obj = UngriddedData()
        meta_key = 0.0
        idx = 0

        # assign metadata object
        metadata = data_obj.metadata
        meta_idx = data_obj.meta_idx

        num_files = len(files)
        # display progress roughly every 10% of the files
        disp_each = int(num_files * 0.1)
        if disp_each < 1:
            disp_each = 1

        vars_to_read, vars_to_compute = self.check_vars_to_retrieve(
            vars_to_retrieve)

        self.files_failed = []
        for i, _file in enumerate(files):
            if i % disp_each == 0:
                print("Reading file {} of {} ({})".format(
                    i, num_files,
                    type(self).__name__))
            vars_to_read = files_contain[i]

            try:
                station_data = self.read_file(_file,
                                              _vars_to_read=vars_to_read,
                                              _vars_to_compute=vars_to_compute)
            except (NotInFileError, EbasFileError) as e:
                self.files_failed.append(_file)
                self.logger.warning('Failed to read file {}. '
                                    'Error: {}'.format(os.path.basename(_file),
                                                       repr(e)))
                continue

            # Fill the metadata dict; the location in the data set is time
            # step dependent, so the station coordinates are used as single
            # reference location
            metadata[meta_key] = od()
            metadata[meta_key].update(station_data.get_meta())
            metadata[meta_key].update(station_data.get_station_coords())
            metadata[meta_key]['dataset_name'] = self.DATASET_NAME
            metadata[meta_key]['ts_type'] = station_data['ts_type']
            metadata[meta_key]['instrument_name'] = station_data[
                'instrument_name']
            metadata[meta_key]['var_info'] = od()
            # list with indices of this station for each variable
            meta_idx[meta_key] = {}

            num_times = len(station_data['dtime'])

            # access array containing time stamps
            # TODO: check using index instead (even though not a problem here
            # since all Aerocom data files are of type timeseries)
            times = np.float64(station_data['dtime'])

            totnum = num_times * len(station_data.contains_vars)

            # check if size of data object needs to be extended
            if (idx + totnum) >= data_obj._ROWNO:
                # if totnum < data_obj._CHUNKSIZE, then the latter is used
                data_obj.add_chunk(totnum)

            vars_avail = station_data.contains_vars
            for var_idx, var in enumerate(vars_avail):
                if not var in data_obj.unit:
                    data_obj.unit[var] = station_data.unit[var]
                elif station_data.unit[var] != data_obj.unit[var]:
                    raise DataUnitError("Unit mismatch")
                values = station_data[var]
                start = idx + var_idx * num_times
                stop = start + num_times

                # write common meta info for this station (data lon, lat and
                # altitude are set to station locations)
                data_obj._data[start:stop,
                               data_obj._LATINDEX] = station_data['stat_lat']
                # BUGFIX: longitude column was filled with 'stat_lat'
                data_obj._data[start:stop,
                               data_obj._LONINDEX] = station_data['stat_lon']
                data_obj._data[
                    start:stop,
                    data_obj._ALTITUDEINDEX] = station_data['stat_alt']
                data_obj._data[start:stop,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[start:stop, data_obj._TIMEINDEX] = times
                data_obj._data[start:stop, data_obj._DATAINDEX] = values
                data_obj._data[start:stop, data_obj._VARINDEX] = var_idx

                meta_idx[meta_key][var] = np.arange(start, stop)

                var_info = station_data['var_info'][var]
                metadata[meta_key]['var_info'][var] = var_info.to_dict()
                if not var in data_obj.var_idx:
                    data_obj.var_idx[var] = var_idx
            metadata[meta_key]['variables'] = vars_avail
            idx += totnum
            meta_key = meta_key + 1.

        # shorten data_obj._data to the right number of points
        data_obj._data = data_obj._data[:idx]
        data_obj = data_obj.merge_common_meta()
        data_obj.data_revision[self.DATASET_NAME] = self.data_revision
        self.data = data_obj
        return data_obj
Exemplo n.º 7
0
def test_init_add_cols():
    """Extra columns requested at init must extend the column dimension."""
    data = UngriddedData(num_points=2, add_cols=['bla', 'blub'])
    # 12 standard columns plus the two additional ones
    npt.assert_array_equal(data.shape, (2, 14))
Exemplo n.º 8
0
def ungridded_empty():
    """Return a freshly constructed, empty UngriddedData object."""
    empty = UngriddedData()
    return empty
Exemplo n.º 9
0
def test_from_cache(aeronetsunv3lev2_subset, tempdir):
    """Reloading the cached subset must preserve its shape."""
    restored = UngriddedData.from_cache(
        data_dir=tempdir, file_name='ungridded_aeronet_subset.pkl')
    assert restored.shape == aeronetsunv3lev2_subset.shape
Exemplo n.º 10
0
def _make_ungridded_data():
    """Build a plain UngriddedData instance (test helper).

    NOTE: random data / metadata blocks are not added yet (placeholder).
    """
    return UngriddedData()
Exemplo n.º 11
0
    def read(self, vars_to_retrieve):
        """Template reader: load all files into an UngriddedData object.

        Parameters
        ----------
        vars_to_retrieve : list
            variable names to extract from each file

        Notes
        -----
        The populated :class:`UngriddedData` object is built in ``data_obj``;
        this template does not return it (callers should extend accordingly).
        """
        # create an empty data object
        data_obj = UngriddedData()

        # data index pointer into the numpy array
        index_pointer = 0

        # metadata key pointer, one key per file
        meta_key = 0.0

        # implemented in the base class; may be overridden if needed
        files = self.get_file_list()
        for f in files:
            # load data from individual file (returns e.g. dictionary, or
            # StationData)
            # BUGFIX: the file path was previously not passed to read_file
            file_data = self.read_file(f)

            # BUGFIX: the metadata entry must exist before items can be
            # assigned to it; station metadata is per file, not per variable,
            # so it is filled once, outside the variable loop
            data_obj.metadata[meta_key] = {}
            data_obj.metadata[meta_key]['longitude'] = file_data['longitude']
            data_obj.metadata[meta_key]['latitude'] = file_data['latitude']
            data_obj.metadata[meta_key]['altitude'] = file_data['altitude']

            for var_idx, var in enumerate(vars_to_retrieve):
                # now copy all data columns

                # time stamps, assuming array or list of numpy.datetime64
                time_stamps = file_data['dtime']

                # the actual data for this variable
                var_data = file_data[var]

                # the number of datapoints added to the UngriddedData object
                add_num = len(var_data)
                stop_idx = index_pointer + add_num
                if stop_idx >= data_obj._ROWNO:
                    # add_chunk adds a minimum of 1000 datapoints; it only
                    # uses add_num if add_num >= 1000
                    data_obj.add_chunk(add_num)

                # now add the variable to the data numpy array
                data_obj._data[index_pointer:stop_idx,
                               data_obj._LATINDEX] = file_data['latitude']
                data_obj._data[index_pointer:stop_idx,
                               data_obj._LONINDEX] = file_data['longitude']
                data_obj._data[index_pointer:stop_idx,
                               data_obj._ALTITUDEINDEX] = file_data['altitude']
                data_obj._data[index_pointer:stop_idx,
                               data_obj._METADATAKEYINDEX] = meta_key

                # write data to data object
                data_obj._data[index_pointer:stop_idx,
                               data_obj._TIMEINDEX] = np.float64(time_stamps)
                data_obj._data[index_pointer:stop_idx,
                               data_obj._DATAINDEX] = var_data
                data_obj._data[index_pointer:stop_idx,
                               data_obj._DATAHEIGHTINDEX] = file_data[
                                   'altitude']  # or per-sample data altitude
                data_obj._data[index_pointer:stop_idx,
                               data_obj._VARINDEX] = var_idx

                index_pointer += add_num

            meta_key += 1.