def get_metadata_header_values(self, elements=None):
    """
    Get metadata from a file in the format called "Header Values".

    Parameters
    ----------
    elements : dict, optional (default: None)
        Previously loaded elements can be passed here to avoid
        reading the file again.

    Returns
    -------
    metadata : MetaData
        Metadata information.
    depth : Depth
        Sensor depth, generated from the file header.
    """
    if elements:
        headr = elements["headr"]
        scnd = elements["scnd"]
        last = elements["last"]
        fname = elements["fname"]
    else:
        headr, scnd, last, fname = self.get_elements_from_file()

    if len(fname) > 9:
        instrument = "_".join(fname[6:len(fname) - 2])
    else:
        instrument = fname[6]

    if fname[3] in const.VARIABLE_LUT:
        variable = const.VARIABLE_LUT[fname[3]]
    else:
        variable = fname[3]

    timerange_from = pd.to_datetime(" ".join(scnd[:2]))
    timerange_to = pd.to_datetime(" ".join(last[:2]))

    depth = Depth(float(headr[6]), float(headr[7]))

    metadata = MetaData([
        MetaVar("network", headr[1]),
        MetaVar("station", headr[2]),
        MetaVar("variable", variable, depth),
        MetaVar("instrument", instrument, depth),
        MetaVar("timerange_from", timerange_from),
        MetaVar("timerange_to", timerange_to),
        MetaVar("latitude", float(headr[3])),
        MetaVar("longitude", float(headr[4])),
        MetaVar("elevation", float(headr[5])),
    ])

    return metadata, depth
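# Why " ".join(tokens[:2]) + pd.to_datetime above: the first two
# whitespace-separated tokens of a record are assumed to be date and time.
# A minimal self-contained sketch of that step; the sample tokens are
# invented, not taken from a real ISMN file:
import pandas as pd

line_tokens = ["2010/01/01", "00:30", "0.254", "G"]  # hypothetical record
timestamp = pd.to_datetime(" ".join(line_tokens[:2]))
print(timestamp)  # 2010-01-01 00:30:00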
def test_MetaVar(self):
    var = MetaVar('myvar', 1.1, Depth(0, 1))
    assert str(var) == "myvar (0.0 to 1.0 [m]): 1.1"
    assert tuple(var) == ('myvar', 1.1, 0, 1)
    assert var == var

    nvar = MetaVar('negmyvar', 1.1, Depth(0, -1))
    assert str(nvar) == "negmyvar (0.0 to -1.0 [m]): 1.1"
    assert tuple(nvar) == ('negmyvar', 1.1, -0, -1)
    assert nvar != var

    other = MetaVar('other', 99)
    assert str(other) == "other (no depth): 99"
    assert tuple(other) == ('other', 99, None, None)
    assert other != var
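# The csv loader further down (from_metadata_csv) relies on tuple(var) and
# MetaVar.from_tuple being inverses of each other. A hedged round-trip
# sketch; the ismn.meta import path is an assumption:
#
#   from ismn.meta import MetaVar, Depth
#
#   var = MetaVar("myvar", 1.1, Depth(0, 1))
#   assert MetaVar.from_tuple(tuple(var)) == var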
def get_metadata_ceop_sep(self, elements=None):
    """
    Get metadata from a file in the format called "CEOP in separate files".

    Parameters
    ----------
    elements : dict, optional (default: None)
        Previously loaded elements can be passed here to avoid
        reading the file again.

    Returns
    -------
    metadata : MetaData
        Metadata information.
    depth : Depth
        Sensor depth, generated from the file name.
    """
    if elements:
        headr = elements['headr']
        last = elements['last']
        fname = elements['fname']
    else:
        headr, _, last, fname = self.get_elements_from_file()

    if len(fname) > 9:
        instr = '_'.join(fname[6:len(fname) - 2])
    else:
        instr = fname[6]

    if fname[3] in const.VARIABLE_LUT:
        variable = const.VARIABLE_LUT[fname[3]]
    else:
        variable = fname[3]

    timerange_from = pd.to_datetime(' '.join(headr[:2]))
    timerange_to = pd.to_datetime(' '.join(last[:2]))

    depth = Depth(float(fname[4]), float(fname[5]))

    metadata = MetaData([
        MetaVar('network', fname[1]),
        MetaVar('station', fname[2]),
        MetaVar('variable', variable, depth),
        MetaVar('instrument', instr, depth),
        MetaVar('timerange_from', timerange_from),
        MetaVar('timerange_to', timerange_to),
        MetaVar('latitude', float(headr[7])),
        MetaVar('longitude', float(headr[8])),
        MetaVar('elevation', float(headr[9])),
    ])

    return metadata, depth
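# In the "CEOP in separate files" layout the sensor depth comes from the
# file name, not the header. A runnable sketch of that parsing with a
# made-up file name following the underscore convention the indices above
# imply (fname[4]/fname[5] = depth from/to, fname[6:-2] = sensor name):
fname = ("OZNET_OZNET_Alabama_sm_0.000000_0.050000_"
         "Stevens-Hydra-Probe_20100101_20201231.stm").split("_")

depth_from, depth_to = float(fname[4]), float(fname[5])
# sensor names containing underscores span several tokens:
instrument = "_".join(fname[6:len(fname) - 2]) if len(fname) > 9 else fname[6]
print(depth_from, depth_to, instrument)  # 0.0 0.05 Stevens-Hydra-Probe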
def test_MetaData(self):
    assert len(self.dat) == 4
    assert "second" in self.dat
    assert self.dat[1] in self.dat
    assert tuple(self.dat[0]) == ("first", 1, 0.0, 1.0)
    assert tuple(self.dat[1]) == ("second", 0, None, None)
    assert self.dat["dup"] == self.dat[3]
    assert self.dat.keys() == ["first", "second", "neg", "dup"]
    assert (MetaData([MetaVar.from_tuple(
        ("first", 1, 0.0, 1.0))]) == self.dat[["first"]])
def setUp(self) -> None:
    vars = [
        MetaVar("first", 1, Depth(0, 1)),
        MetaVar("second", 0),
        MetaVar("neg", -99, Depth(-0.25, -1)),
        MetaVar("dup", "3rd", Depth(1, 3)),
    ]
    self.dat = MetaData(vars)

    self.other = MetaData(
        [MetaVar("dup", "3rd", Depth(2, 4)), MetaVar("4", 4)])
def __read_field(data, fieldname: str, new_name=None) -> list:
    """
    Extract a field from the loaded csv metadata.
    """
    field_vars = []

    if fieldname in data.index:
        froms = np.atleast_1d(data.loc[fieldname]['depth_from[m]'])
        tos = np.atleast_1d(data.loc[fieldname]['depth_to[m]'])
        vals = np.atleast_1d(data.loc[fieldname]['value'])

        for d_from, d_to, val in zip(froms, tos, vals):
            d = Depth(d_from, d_to)
            name = new_name if new_name is not None else fieldname
            try:
                val = float(val)
            except ValueError:
                pass  # value is actually a string, that's ok
            field_vars.append(MetaVar(name, val, d))

    return field_vars
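# A self-contained sketch of the frame layout __read_field expects: indexed
# by quantity name, one row per (depth_from, depth_to, value) triple. The
# sample numbers are invented; column names follow the usage above:
import numpy as np
import pandas as pd

data = pd.DataFrame(
    {
        "depth_from[m]": [0.0, 0.3],
        "depth_to[m]": [0.3, 1.0],
        "value": ["32.1", "28.7"],
    },
    index=["clay fraction", "clay fraction"],
)

# np.atleast_1d keeps the loop working whether a field occurs once
# (scalar .loc result) or several times (array result):
froms = np.atleast_1d(data.loc["clay fraction"]["depth_from[m]"])
print(froms)  # [0.  0.3]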
def read_metadata(self) -> MetaData:
    """
    Read csv file containing static variables into data frame.

    Returns
    -------
    metadata : MetaData
        Static metadata read from csv file.
    """
    if self.root.zip:
        if not self.root.isopen:
            self.root.open()
        with TemporaryDirectory(
                prefix="ismn", dir=self.temp_root) as tempdir:
            extracted = self.root.extract_file(self.file_path, tempdir)
            data = self.__read_csv(extracted)
    else:
        data = self.__read_csv(self.root.path / self.file_path)

    # read landcover classifications
    lc = data.loc[["land cover classification"]][
        ["value", "quantity_source_name"]]

    lc_dict = {
        "CCI_landcover_2000": const.CSV_META_TEMPLATE["lc_2000"],
        "CCI_landcover_2005": const.CSV_META_TEMPLATE["lc_2005"],
        "CCI_landcover_2010": const.CSV_META_TEMPLATE["lc_2010"],
        "insitu": const.CSV_META_TEMPLATE["lc_insitu"],
    }

    cl_dict = {
        "koeppen_geiger_2007": const.CSV_META_TEMPLATE["climate_KG"],
        "insitu": const.CSV_META_TEMPLATE["climate_insitu"],
    }

    for key in lc_dict.keys():
        if key in lc["quantity_source_name"].values:
            if key != "insitu":
                # np.int was removed from numpy, use the builtin instead
                lc_dict[key] = int(
                    lc.loc[lc["quantity_source_name"] == key]
                    ["value"].values[0])
            else:
                lc_dict[key] = lc.loc[
                    lc["quantity_source_name"] == key]["value"].values[0]
                logging.info(
                    f"insitu land cover classification available: "
                    f"{self.file_path}")

    # read climate classifications
    try:
        cl = data.loc[["climate classification"]][
            ["value", "quantity_source_name"]]
        for key in cl_dict.keys():
            if key in cl["quantity_source_name"].values:
                cl_dict[key] = cl.loc[
                    cl["quantity_source_name"] == key]["value"].values[0]
                if key == "insitu":
                    logging.info(
                        f"insitu climate classification available: "
                        f"{self.file_path}")
    except KeyError:
        logging.info(f"No climate metadata found for {self.file_path}")

    metavars = [
        MetaVar("lc_2000", lc_dict["CCI_landcover_2000"]),
        MetaVar("lc_2005", lc_dict["CCI_landcover_2005"]),
        MetaVar("lc_2010", lc_dict["CCI_landcover_2010"]),
        MetaVar("lc_insitu", lc_dict["insitu"]),
        MetaVar("climate_KG", cl_dict["koeppen_geiger_2007"]),
        MetaVar("climate_insitu", cl_dict["insitu"]),
    ]

    static_meta = {
        "saturation": self.__read_field(data, "saturation"),
        "clay_fraction": self.__read_field(
            data, "clay fraction", const.VARIABLE_LUT["cl_h"]),
        "sand_fraction": self.__read_field(
            data, "sand fraction", const.VARIABLE_LUT["sa_h"]),
        "silt_fraction": self.__read_field(
            data, "silt fraction", const.VARIABLE_LUT["si_h"]),
        "organic_carbon": self.__read_field(
            data, "organic carbon", const.VARIABLE_LUT["oc_h"]),
    }

    for name, v in static_meta.items():
        if len(v) > 0:
            metavars += v
        else:
            metavars.append(MetaVar(name, const.CSV_META_TEMPLATE[name]))

    metadata = MetaData(metavars)

    return metadata
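# The loop at the end mirrors a simple fill pattern: any field the csv did
# not provide falls back to a template default, so every key is guaranteed
# to exist downstream. A standalone sketch with invented stand-ins for
# const.CSV_META_TEMPLATE and the parsed per-depth entries:
template = {"saturation": float("nan"), "clay_fraction": float("nan")}
parsed = {
    "saturation": [],
    "clay_fraction": [("clay_fraction", 32.1, 0.0, 0.3)],
}

merged = []
for name, default in template.items():
    # keep depth-resolved entries if the csv had any, else use the default
    merged += parsed[name] or [(name, default, None, None)]
print(merged)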
def _read_station_dir(
    root: Union[IsmnRoot, Path, str],
    stat_dir: Union[Path, str],
    temp_root: Path,
) -> (list, list):
    """
    Parallelizable function to read metadata for files in a station dir.
    """
    infos = []

    if not isinstance(root, IsmnRoot):
        proc_root = True
        root = IsmnRoot(root)
    else:
        proc_root = False

    csv = root.find_files(stat_dir, "*.csv")

    try:
        if len(csv) == 0:
            raise IsmnFileError(
                "Expected 1 csv file for station, found 0. "
                "Use empty static metadata.")
        else:
            if len(csv) > 1:
                infos.append(
                    f"Expected 1 csv file for station, found {len(csv)}. "
                    f"Use first file in dir.")
            static_meta_file = StaticMetaFile(
                root, csv[0], load_metadata=True, temp_root=temp_root)
            station_meta = static_meta_file.metadata
    except IsmnFileError as e:
        infos.append(f"Error loading static meta for station: {e}")
        station_meta = MetaData(
            [MetaVar(k, v) for k, v in CSV_META_TEMPLATE.items()])

    data_files = root.find_files(stat_dir, "*.stm")

    filelist = []

    for file_path in data_files:
        try:
            f = DataFile(root, file_path, temp_root=temp_root)
        except IOError as e:
            infos.append(f"Error loading ismn file: {e}")
            continue

        f.metadata.merge(station_meta, inplace=True)

        f.metadata = f.metadata.best_meta_for_depth(
            Depth(
                f.metadata["instrument"].depth.start,
                f.metadata["instrument"].depth.end,
            ))

        network = f.metadata["network"].val
        station = f.metadata["station"].val

        filelist.append((network, station, f))
        infos.append(f"Processed file {file_path}")

    if proc_root:
        root.close()

    return filelist, infos
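# Each call opens (and closes) its own IsmnRoot when given a plain path,
# so the function can be fanned out over station directories with a
# process pool. Hedged sketch; the archive path and station_dirs are
# placeholders:
#
#   from multiprocessing import Pool
#
#   args = [("/data/ismn_archive.zip", d, Path(gettempdir()))
#           for d in station_dirs]
#   with Pool(4) as pool:
#       results = pool.starmap(_read_station_dir, args)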
def from_metadata_csv(
    cls, data_root, meta_csv_file, network=None, temp_root=gettempdir()
):
    """
    Load a previously created and stored filelist from
    :func:`ismn.filecollection.IsmnFileCollection.to_metadata_csv`

    Parameters
    ----------
    data_root : IsmnRoot or str or Path
        Path where the ismn data is stored, can also be a zip file.
    meta_csv_file : str or Path
        Csv file where the metadata is stored.
    network : list, optional (default: None)
        List of networks that are considered.
        Filehandlers for other networks are set to None.
    temp_root : str or Path, optional (default: gettempdir())
        Temporary folder where extracted data is copied during reading
        from zip archive.
    """
    if network is not None:
        network = np.atleast_1d(network)

    if isinstance(data_root, IsmnRoot):
        root = data_root
    else:
        root = IsmnRoot(data_root)

    print(f"Found existing ismn metadata in {meta_csv_file}.")

    metadata_df = _load_metadata_df(meta_csv_file)

    filelist = OrderedDict([])

    all_networks = metadata_df["network"]["val"].values
    columns = np.array(list(metadata_df.columns))

    for i, row in enumerate(metadata_df.values):  # todo: slow!?? parallelise?
        this_nw = all_networks[i]

        if (network is not None) and not np.isin([this_nw], network)[0]:
            f = None
            continue
        else:
            vars = np.unique(columns[:-2][:, 0])
            vals = row[:-2].reshape(-1, 3)

            metadata = MetaData([
                MetaVar.from_tuple(
                    (vars[i], vals[i][2], vals[i][0], vals[i][1]))
                for i in range(len(vars))
            ])

            f = DataFile(
                root=root,
                file_path=str(PurePosixPath(row[-2])),
                load_metadata=False,
                temp_root=temp_root,
            )

            f.metadata = metadata
            f.file_type = row[-1]

            this_nw = f.metadata["network"].val

        if this_nw not in filelist.keys():
            filelist[this_nw] = []

        filelist[this_nw].append(f)

    if network is None:
        cls.metadata_df = metadata_df
    else:
        flags = np.isin(metadata_df["network"]["val"].values, network)
        cls.metadata_df = metadata_df.loc[flags]

    return cls(root, filelist=filelist)
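# Hedged usage sketch: rebuild a collection from a previously written
# metadata csv instead of re-scanning the whole archive. Paths and the
# network name are placeholders:
#
#   coll = IsmnFileCollection.from_metadata_csv(
#       "/data/ismn_archive.zip",
#       "/data/python_metadata/ismn_archive.csv",
#       network=["OZNET"])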