def _get_site_data(site: str, network: str) -> Tuple[float, float, float, List]:
    """Extract site location data from the site attributes file.

    Args:
        site: Site code
        network: Network name
    Returns:
        tuple: Latitude, longitude, site height (m a.s.l.) and inlet heights
    """
    from openghg.util import load_json

    network = network.upper()
    site = site.upper()

    site_info = load_json("acrg_site_info.json")

    try:
        site_data = site_info[site][network]

        latitude = float(site_data["latitude"])
        longitude = float(site_data["longitude"])
        site_height = float(site_data["height_station_masl"])
        inlet_heights = site_data["height_name"]
    except KeyError as e:
        raise KeyError(f"Incorrect site or network: {e}")

    return latitude, longitude, site_height, inlet_heights
def _altitude_to_ecmwf_pressure(measure_pressure: List[float]) -> List[str]:
    """Find out which pressure levels are required from ERA5.

    Args:
        measure_pressure: List of measurement pressures
    Returns:
        list: List of required ERA5 pressure levels (as strings)
    """
    from openghg.util import load_json

    ecmwf_metadata = load_json("ecmwf_dataset_info.json")
    dataset_metadata = ecmwf_metadata["datasets"]
    valid_levels = dataset_metadata["reanalysis_era5_pressure_levels"]["valid_levels"]

    # Available ERA5 pressure levels
    era5_pressure_levels = np.array(valid_levels)

    # Match each measurement pressure to the two closest ERA5 pressure levels
    ecmwf_pressure_indices = np.zeros(len(measure_pressure) * 2)
    for index, m in enumerate(measure_pressure):
        ecmwf_pressure_indices[(index * 2):(index * 2 + 2)] = _two_closest_values(m - era5_pressure_levels)

    desired_era5_pressure = era5_pressure_levels[np.unique(ecmwf_pressure_indices).astype(int)]

    pressure_levels: List = desired_era5_pressure.astype(str).tolist()

    return pressure_levels
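# `_two_closest_values` is called above but not defined in this section. The sketch
# below is an assumption about its behaviour, not the repository's implementation:
# given the signed differences between a measurement pressure and the ERA5 levels,
# it should return the indices of the two levels with the smallest absolute difference.
def _two_closest_values_sketch(differences):
    """Assumed behaviour: return indices of the two smallest absolute differences."""
    import numpy as np

    return np.argsort(np.abs(differences))[:2]

# Example: a measurement at 950 hPa compared against levels [900, 925, 950, 975, 1000]
# gives differences [50, 25, 0, -25, -50] and so selects indices [2, 1], i.e. the
# 950 hPa and 925 hPa levels.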
def verify_site(site: str) -> str:
    """Check if the passed site is a valid one and returns the three letter site code
    if found. Otherwise we use fuzzy text matching to suggest sites with similar names.

    Args:
        site: Three letter site code or site name
    Returns:
        str: Verified three letter site code if valid site
    """
    from openghg.util import load_json, remove_punctuation
    from openghg.types import InvalidSiteError

    site_data = load_json("site_lookup.json")

    if site.upper() in site_data:
        return site.lower()
    else:
        site = remove_punctuation(site)
        name_lookup: Dict[str, str] = {value["short_name"]: code for code, value in site_data.items()}

        try:
            return name_lookup[site].lower()
        except KeyError:
            long_names = {value["long_name"]: code for code, value in site_data.items()}
            message = find_matching_site(site_name=site, possible_sites=long_names)
            raise InvalidSiteError(message)
def _synonyms(species: str) -> str:
    """Check to see if there are other names that we should be using for
    a particular input. E.g. if CFC-11 or CFC11 was input, go on to use cfc-11,
    as this is used in species_info.json.

    Args:
        species: Input string that you're trying to match
    Returns:
        str: Matched species string
    """
    from openghg.util import load_json

    # Load in the species data
    species_data = load_json(filename="acrg_species_info.json")

    # First test whether the species matches a key (case insensitive)
    matched_strings = [k for k in species_data if k.upper() == species.upper()]

    # Used to access the alternative names in species_data
    alt_label = "alt"

    # If not found, search the synonyms
    if not matched_strings:
        for key in species_data:
            # Iterate over the alternative labels and check for a match
            matched_strings = [s for s in species_data[key][alt_label] if s.upper() == species.upper()]

            if matched_strings:
                matched_strings = [key]
                break

    if matched_strings:
        updated_species = str(matched_strings[0])
        return updated_species
    else:
        raise ValueError(f"Unable to find synonym for species {species}")
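# Hedged usage sketch for _synonyms; the exact contents of acrg_species_info.json are
# not shown here, so the "CFC11" alternative name below is an assumption:
#
#     _synonyms("CFC11")    # -> "CFC-11" if "CFC11" is listed under the "alt" key
#     _synonyms("ch4")      # -> "CH4" via the direct, case-insensitive key match
#     _synonyms("mystery")  # raises ValueError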
def _site_info_attributes(site: str, network: Optional[str] = None) -> Dict:
    """Reads site attributes from JSON

    Args:
        site: Site code
        network: Network name
    Returns:
        dict: Dictionary of site attributes
    """
    from openghg.util import load_json

    site = site.upper()

    # Read site info file
    data_filename = "acrg_site_info.json"
    site_params = load_json(filename=data_filename)

    if site not in site_params:
        raise ValueError(f"Invalid site {site} passed. Please use a valid site code such as BSD for Bilsdale")

    if network is None:
        network = list(site_params[site].keys())[0]
    else:
        network = network.upper()

    attributes_dict = {
        "longitude": "station_longitude",
        "latitude": "station_latitude",
        "long_name": "station_long_name",
        "height_station_masl": "station_height_masl",
    }

    attributes = {}
    for attr in attributes_dict:
        if attr in site_params[site][network]:
            attr_key = attributes_dict[attr]
            attributes[attr_key] = site_params[site][network][attr]

    return attributes
def multiple_inlets(site: str) -> bool:
    """Check if the passed site has more than one inlet

    Args:
        site: Three letter site code
    Returns:
        bool: True if multiple inlets
    """
    from openghg.util import load_json

    site_data = load_json("acrg_site_info.json")

    site = site.upper()
    network = next(iter(site_data[site]))

    try:
        heights = set(site_data[site][network]["height"])
    except KeyError:
        try:
            heights = set(site_data[site][network]["height_name"])
        except KeyError:
            return True

    return len(heights) > 1
def parse_btt(
    data_filepath: Union[str, Path],
    site: Optional[str] = "BTT",
    network: Optional[str] = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads BTT (NPL) data files and returns a dictionary of gas data ready
    to be assigned to Datasources.

    Args:
        data_filepath: Path of file to load
        site: Site name
        network: Network name
        inlet: Inlet height (currently unused)
        instrument: Instrument name (currently unused)
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from pandas import read_csv, Timestamp, to_timedelta, isnull
    from numpy import nan as np_nan
    from openghg.util import clean_string, load_json

    # TODO: Decide what to do about inputs which aren't used anywhere
    # at present - inlet, instrument, sampling_period, measurement_type
    data_filepath = Path(data_filepath)

    site = "BTT"

    # Rename these columns
    rename_dict = {"co2.cal": "CO2", "ch4.cal.ppb": "CH4"}
    # We only want these species
    species_extract = ["CO2", "CH4"]
    # Take std-dev measurements from these columns for these species
    species_sd = {"CO2": "co2.sd.ppm", "CH4": "ch4.sd.ppb"}

    param_data = load_json(filename="attributes.json")
    network_params = param_data["BTT"]

    sampling_period = int(network_params["sampling_period"])
    sampling_period_seconds = str(sampling_period) + "s"

    data = read_csv(data_filepath)
    data["time"] = Timestamp("2019-01-01 00:00") + to_timedelta(data["DOY"] - 1, unit="D")
    data["time"] = data["time"].dt.round(sampling_period_seconds)
    data = data[~isnull(data.time)]

    data = data.rename(columns=rename_dict)
    data = data.set_index("time")

    gas_data = {}
    for species in species_extract:
        processed_data = data.loc[:, [species]].sort_index()

        # Create a variability column
        species_stddev_label = species_sd[species]
        processed_data[f"{species} variability"] = data[species_stddev_label]

        # Replace any values below zero with NaNs
        processed_data[processed_data < 0] = np_nan
        # Drop NaNs
        processed_data = processed_data.dropna()
        # Convert to a Dataset
        processed_data = processed_data.to_xarray()

        site_attributes = network_params["global_attributes"]
        site_attributes["inlet_height_magl"] = network_params["inlet"]
        site_attributes["instrument"] = network_params["instrument"]
        site_attributes["sampling_period"] = sampling_period

        # TODO - add in better metadata reading
        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "BTT",
        }

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
def get_obs_surface(
    site: str,
    species: str,
    inlet: Optional[str] = None,
    start_date: Optional[Union[str, Timestamp]] = None,
    end_date: Optional[Union[str, Timestamp]] = None,
    average: Optional[str] = None,
    network: Optional[str] = None,
    instrument: Optional[str] = None,
    calibration_scale: Optional[str] = None,
    keep_missing: Optional[bool] = False,
    skip_ranking: Optional[bool] = False,
) -> ObsData:
    """Get measurements from one site.

    Args:
        site: Site of interest e.g. MHD for the Mace Head site.
        species: Species identifier e.g. ch4 for methane.
        start_date: Output start date in a format that Pandas can interpret
        end_date: Output end date in a format that Pandas can interpret
        inlet: Inlet label
        average: Averaging period for each dataset. Each value should be a string of
            the form e.g. "2H", "30min" (should match pandas offset aliases format).
        keep_missing: Keep missing data points or drop them.
        network: Network for the site/instrument (must match number of sites).
        instrument: Specific instrument for the site (must match number of sites).
        calibration_scale: Convert to this calibration scale
        skip_ranking: If True skip the ranking system
    Returns:
        ObsData: ObsData object
    """
    from pandas import Timestamp, Timedelta
    import numpy as np
    from xarray import concat as xr_concat
    from openghg.retrieve import search
    from openghg.store import recombine_datasets
    from openghg.util import clean_string, load_json, timestamp_tzaware

    site_info = load_json(filename="acrg_site_info.json")
    site = site.upper()

    if site not in site_info:
        raise ValueError(f"No site called {site}, please enter a valid site name.")

    # Find the correct synonym for the passed species
    species = clean_string(_synonyms(species))

    # Get the observation data
    obs_results = search(
        site=site,
        species=species,
        inlet=inlet,
        start_date=start_date,
        end_date=end_date,
        instrument=instrument,
        find_all=True,
        skip_ranking=skip_ranking,
    )

    if not obs_results:
        raise ValueError(f"Unable to find results for {species} at {site}")

    # TODO - for some reason mypy doesn't pick up the ObsData being returned here, look into this
    # GJ - 2021-07-19
    retrieved_data: ObsData = obs_results.retrieve(site=site, species=species, inlet=inlet)  # type: ignore

    data = retrieved_data.data

    if data.attrs["inlet"] == "multiple":
        data.attrs["inlet_height_magl"] = "multiple"
        retrieved_data.metadata["inlet"] = "multiple"

    if start_date is not None and end_date is not None:
        start_date_tzaware = timestamp_tzaware(start_date)
        end_date_tzaware = timestamp_tzaware(end_date)
        # Deduct 1 ns to make the end date exclusive
        end_date_tzaware_exclusive = end_date_tzaware - Timedelta(1, unit="nanosecond")

        # Slice the data to only cover the dates we're interested in
        data = data.sel(time=slice(start_date_tzaware, end_date_tzaware_exclusive))

    try:
        start_date_data = timestamp_tzaware(data.time[0].values)
        end_date_data = timestamp_tzaware(data.time[-1].values)
    except AttributeError:
        raise AttributeError("This dataset does not have a time attribute, unable to read date range")

    if average is not None:
        # GJ - 2021-03-09
        # TODO - check by RT
        # # Average the Dataset over a given period
        # if keep_missing is True:
        #     # Create a dataset with one element and NaNs to prepend or append
        #     ds_single_element = data[{"time": 0}]
        #     for v in ds_single_element.variables:
        #         if v != "time":
        #             ds_single_element[v].values = np.nan

        #     ds_concat = []
        #     # Pad with an empty entry at the start date
        #     if timestamp_tzaware(data.time.min()) > start_date:
        #         ds_single_element_start = ds_single_element.copy()
        #         ds_single_element_start.time.values = Timestamp(start_date)
        #         ds_concat.append(ds_single_element_start)

        #     ds_concat.append(data)

        #     # Pad with an empty entry at the end date
        #     if data.time.max() < Timestamp(end_date):
        #         ds_single_element_end = ds_single_element.copy()
        #         ds_single_element_end.time.values = Timestamp(end_date) - Timedelta("1ns")
        #         ds_concat.append(ds_single_element_end)

        #     data = xr_concat(ds_concat, dim="time")

        #     # Now sort to get everything in the right order
        #     data = data.sortby("time")

        # First do a mean resample on all variables
        ds_resampled = data.resample(time=average).mean(skipna=False, keep_attrs=True)
        # keep_attrs doesn't seem to work for some reason, so manually copy
        ds_resampled.attrs = data.attrs.copy()

        average_in_seconds = Timedelta(average).total_seconds()
        ds_resampled.attrs["averaged_period"] = average_in_seconds
        ds_resampled.attrs["averaged_period_str"] = average

        # For some variables, need a different type of resampling
        data_variables: List[str] = [str(v) for v in data.variables]
        for var in data_variables:
            if "repeatability" in var:
                ds_resampled[var] = (
                    np.sqrt((data[var] ** 2).resample(time=average).sum())
                    / data[var].resample(time=average).count()
                )

                # Copy over some attributes
                if "long_name" in data[var].attrs:
                    ds_resampled[var].attrs["long_name"] = data[var].attrs["long_name"]

                if "units" in data[var].attrs:
                    ds_resampled[var].attrs["units"] = data[var].attrs["units"]

        # Create a new variability variable, containing the standard deviation within the resampling period
        ds_resampled[f"{species}_variability"] = (
            data[species].resample(time=average).std(skipna=False, keep_attrs=True)
        )

        # If there are any periods where only one measurement was resampled, just use the median variability
        ds_resampled[f"{species}_variability"][
            ds_resampled[f"{species}_variability"] == 0.0
        ] = ds_resampled[f"{species}_variability"].median()

        # Create attributes for variability variable
        ds_resampled[f"{species}_variability"].attrs["long_name"] = f"{data.attrs['long_name']}_variability"
        ds_resampled[f"{species}_variability"].attrs["units"] = data[species].attrs["units"]

        # Resampling may introduce NaNs, so remove them unless keep_missing is set
        if keep_missing is False:
            ds_resampled = ds_resampled.dropna(dim="time")

        data = ds_resampled

    # Rename variables
    rename: Dict[str, str] = {}

    data_variables = [str(v) for v in data.variables]
    for var in data_variables:
        if var.lower() == species.lower():
            rename[var] = "mf"
        if "repeatability" in var:
            rename[var] = "mf_repeatability"
        if "variability" in var:
            rename[var] = "mf_variability"
        if "number_of_observations" in var:
            rename[var] = "mf_number_of_observations"
        if "status_flag" in var:
            rename[var] = "status_flag"
        if "integration_flag" in var:
            rename[var] = "integration_flag"

    data = data.rename_vars(rename)  # type: ignore

    data.attrs["species"] = species

    if "calibration_scale" in data.attrs:
        data.attrs["scale"] = data.attrs.pop("calibration_scale")

    if calibration_scale is not None:
        data = _scale_convert(data, species, calibration_scale)

    metadata = retrieved_data.metadata
    metadata.update(data.attrs)

    obs_data = ObsData(data=data, metadata=metadata)

    # It doesn't make sense to do this now as we've only got a single Dataset
    # # Now check if the units match for each of the observation Datasets
    # units = set((f.data.mf.attrs["units"] for f in obs_files))
    # scales = set((f.data.attrs["scale"] for f in obs_files))

    # if len(units) > 1:
    #     raise ValueError(
    #         f"Units do not match for these observation Datasets {[(f.mf.attrs['station_long_name'],f.attrs['units']) for f in obs_files]}"
    #     )

    # if len(scales) > 1:
    #     print(
    #         f"Scales do not match for these observation Datasets {[(f.mf.attrs['station_long_name'],f.attrs['units']) for f in obs_files]}"
    #     )
    #     print("Suggestion: set calibration_scale to convert scales")

    return obs_data
def _read_data(
    data_filepath: Path,
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Read the datafile passed in and extract the data we require.

    Args:
        data_filepath: Path to file
        site: Three letter site code
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Sampling period including the unit (using pandas frequency aliases like '1H' or '1min')
        measurement_type: Measurement type e.g. insitu, flask
    Returns:
        dict: Dictionary of gas data
    """
    from datetime import datetime
    from pandas import RangeIndex, Timedelta, read_csv, NaT
    import warnings
    from openghg.util import clean_string, load_json

    split_fname = data_filepath.stem.split(".")
    site = site.lower()

    try:
        site_fname = clean_string(split_fname[0])
        inlet_fname = clean_string(split_fname[3])
    except IndexError:
        raise ValueError("Error reading metadata from filename, we expect a form hfd.picarro.1minute.100m.dat")

    if site_fname != site:
        raise ValueError("Site mismatch between passed site code and that read from filename.")

    if "m" not in inlet_fname:
        raise ValueError("No inlet found, we expect filenames such as: bsd.picarro.1minute.108m.dat")

    if inlet is not None and inlet != inlet_fname:
        raise ValueError("Inlet mismatch between passed inlet and that read from filename.")
    else:
        inlet = inlet_fname

    # Function to parse the datetime format found in the datafile
    def parse_date(date: str):  # type: ignore
        try:
            return datetime.strptime(date, "%y%m%d %H%M%S")
        except ValueError:
            return NaT

    # Catch dtype warnings
    # TODO - look at setting dtypes - read header and data separately?
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data = read_csv(
            data_filepath,
            header=None,
            skiprows=1,
            sep=r"\s+",
            index_col=["0_1"],
            parse_dates=[[0, 1]],
            date_parser=parse_date,
        )

    data.index.name = "time"

    # Drop any rows with NaNs
    # This is now done before creating metadata
    data = data.dropna(axis="rows", how="any")

    # Get the number of gases in dataframe and number of columns of data present for each gas
    n_gases, n_cols = _gas_info(data=data)

    header = data.head(2)
    skip_cols = sum([header[column][0] == "-" for column in header.columns])

    metadata = _read_metadata(filepath=data_filepath, data=data)

    if network is not None:
        metadata["network"] = network

    if sampling_period is not None:
        # Compare the passed sampling period against the value extracted from the file
        file_sampling_period = Timedelta(seconds=metadata["sampling_period"])
        passed_sampling_period = Timedelta(sampling_period)

        comparison_seconds = abs(passed_sampling_period - file_sampling_period).total_seconds()
        tolerance_seconds = 1

        if comparison_seconds > tolerance_seconds:
            raise ValueError(
                f"Input sampling period {sampling_period} does not match to value "
                f"extracted from the file name of {metadata['sampling_period']} seconds."
            )

    # Read the scale from JSON
    # I'll leave this here for the possible future movement from class to functions
    network_metadata = load_json(filename="process_gcwerks_parameters.json")
    crds_metadata = network_metadata["CRDS"]

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    for n in range(n_gases):
        # Slice the columns
        gas_data = data.iloc[:, skip_cols + n * n_cols: skip_cols + (n + 1) * n_cols]

        # Reset the column numbers
        gas_data.columns = RangeIndex(gas_data.columns.size)
        species = gas_data[0][0]
        species = species.lower()

        column_labels = [
            species,
            f"{species}_variability",
            f"{species}_number_of_observations",
        ]

        # Name columns
        gas_data = gas_data.set_axis(column_labels, axis="columns", inplace=False)

        header_rows = 2
        # Drop the first two rows now we have the name
        gas_data = gas_data.drop(index=gas_data.head(header_rows).index, inplace=False)
        # Cast data to float64 / double
        gas_data = gas_data.astype("float64")

        # Here we can convert the Dataframe to a Dataset and then write the attributes
        gas_data = gas_data.to_xarray()

        site_attributes = _get_site_attributes(site=site, inlet=inlet, crds_metadata=crds_metadata)

        scale = crds_metadata["default_scales"].get(species.upper(), "NA")

        # Create a copy of the metadata dict
        species_metadata = metadata.copy()
        species_metadata["species"] = clean_string(species)
        species_metadata["inlet"] = inlet
        species_metadata["scale"] = scale
        species_metadata["long_name"] = site_attributes["long_name"]

        combined_data[species] = {
            "metadata": species_metadata,
            "data": gas_data,
            "attributes": site_attributes,
        }

    return combined_data
def parse_gcwerks(
    data_filepath: Union[str, Path],
    precision_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads a GCWERKS data file and returns a dictionary of gas data ready
    to be assigned to Datasources.

    Args:
        data_filepath: Path of data file
        precision_filepath: Path of precision file
        site: Three letter code or name for site
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Sampling period
        measurement_type: Measurement type e.g. insitu, flask
    Returns:
        dict: Dictionary of measurement data keyed by species and inlet
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    data_filepath = Path(data_filepath)
    precision_filepath = Path(precision_filepath)

    # Do some setup for processing
    # Load site data
    gcwerks_data = load_json(filename="process_gcwerks_parameters.json")
    gc_params = gcwerks_data["GCWERKS"]

    network = clean_string(network)
    # We don't currently do anything with inlet here as it's always read from data
    # or taken from process_gcwerks_parameters.json
    if inlet is not None:
        inlet = clean_string(inlet)
    if instrument is not None:
        instrument = clean_string(instrument)

    # Check if the site code passed matches that read from the filename
    site = _check_site(
        filepath=data_filepath,
        site_code=site,
        gc_params=gc_params,
    )

    # If we're not passed the instrument name and we can't find it, raise an error
    if instrument is None:
        instrument = _check_instrument(filepath=data_filepath, gc_params=gc_params, should_raise=True)
    else:
        fname_instrument = _check_instrument(filepath=data_filepath, gc_params=gc_params, should_raise=False)

        if fname_instrument is not None and instrument != fname_instrument:
            raise ValueError(
                f"Mismatch between instrument passed as argument {instrument} and instrument read from filename {fname_instrument}"
            )

    instrument = str(instrument)

    gas_data = _read_data(
        data_filepath=data_filepath,
        precision_filepath=precision_filepath,
        site=site,
        instrument=instrument,
        network=network,
        sampling_period=sampling_period,
        gc_params=gc_params,
    )

    # Assign attributes to the data for CF compliant NetCDFs
    gas_data = assign_attributes(data=gas_data, site=site)

    return gas_data
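# Hedged usage sketch for parse_gcwerks; the filenames below are illustrative, not
# files shipped with the repository:
#
#     gas_data = parse_gcwerks(
#         data_filepath="capegrim-medusa.18.C",
#         precision_filepath="capegrim-medusa.18.precisions.C",
#         site="CGO",
#         network="AGAGE",
#     )
#     for key, entry in gas_data.items():
#         print(key, entry["metadata"]["species"], entry["metadata"]["inlet"])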
def get_attributes(
    ds: Dataset,
    species: str,
    site: str,
    network: Optional[str] = None,
    global_attributes: Optional[Dict[str, str]] = None,
    units: Optional[str] = None,
    scale: Optional[str] = None,
    sampling_period: Optional[str] = None,
    date_range: Optional[List[str]] = None,
) -> Dataset:
    """This function writes attributes to an xarray.Dataset so that they conform with
    the CF Convention v1.6.

    Attributes of the xarray Dataset are modified, and variable names are changed.

    If the species is a standard mole fraction then either:
        - the species name will be used in lower case in the file and variable names,
          but with any hyphens taken out
        - the name will be changed according to the species_translator dictionary

    If the species is isotopic data or a non-standard variable (e.g. APO):
        - isotope species names should begin with a "D" (Annoyingly, the code currently
          picks up "Desflurane" too. I've fixed this for now, but if we get a lot of
          other "D" species, we should make this better)
        - I suggest naming for isotopologues should be d<species><isotope>, e.g.
          dCH4C13, or dCO2C14
        - any non-standard variables should be listed in the species_translator dictionary

    Args:
        ds: Should contain variables such as "ch4", "ch4 repeatability". Must have a "time" dimension.
        species: Species name. e.g. "CH4", "HFC-134a", "dCH4C13"
        site: Three-letter site code
        network: Network site is associated with
        global_attributes: Dictionary containing any info you want to add to the file
            header (e.g. {"Contact": "Contact_Name"})
        units: This routine will try to guess the units unless this is specified.
            Options are in unit_interpret
        scale: Calibration scale for species.
        sampling_period: Number of seconds for which air sample is taken. Only for time variable attribute
        date_range: Start and end date for output. If you only want an end date, just
            put a very early start date (e.g. ["1900-01-01", "2010-01-01"])
    """
    from pandas import Timestamp as pd_Timestamp
    from openghg.util import clean_string, load_json, timestamp_now

    # from numpy import unique as np_unique

    if not isinstance(ds, Dataset):
        raise TypeError("This function only accepts xarray Datasets")

    # Current CF Conventions (v1.7) demand that valid variable names
    # begin with a letter and be composed of letters, digits and underscores.
    # Here variable names are also made lowercase to enable easier matching below.

    # TODO - could I just cast ds.variables as a type for mypy instead of doing this?
    # variable_names = [str(v) for v in ds.variables]
    # Is this better?
    variable_names = cast(Dict[str, Any], ds.variables)
    to_underscores = {var: var.lower().replace(" ", "_") for var in variable_names}
    ds = ds.rename(to_underscores)  # type: ignore

    species_attrs = load_json(filename="species_attributes.json")
    attributes_data = load_json("attributes.json")

    species_translator = attributes_data["species_translation"]
    unit_species = attributes_data["unit_species"]
    unit_species_long = attributes_data["unit_species_long"]
    unit_interpret = attributes_data["unit_interpret"]

    species_upper = species.upper()
    species_lower = species.lower()

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var]

    # If we don't have any variables to rename, raise an error
    if not matched_keys:
        raise NameError(f"Cannot find species {species} in Dataset variables")

    species_rename = {}
    for var in matched_keys:
        try:
            species_label = species_translator[species_upper]["chem"]
        except KeyError:
            species_label = clean_string(species_lower)

        species_rename[var] = var.replace(species_lower, species_label)

    ds = ds.rename(species_rename)  # type: ignore

    # Global attributes
    global_attributes_default = {
        "conditions_of_use": "Ensure that you contact the data owner at the outset of your project.",
        "source": "In situ measurements of air",
        "Conventions": "CF-1.6",
    }

    if global_attributes is not None:
        # TODO - for some reason mypy doesn't see a Dict[str,str] as a valid Mapping[Hashable, Any] type
        global_attributes.update(global_attributes_default)  # type: ignore
    else:
        global_attributes = global_attributes_default

    global_attributes["file_created"] = str(timestamp_now())
    global_attributes["processed_by"] = "OpenGHG_Cloud"
    global_attributes["species"] = species_label

    if scale is None:
        global_attributes["calibration_scale"] = "unknown"
    else:
        global_attributes["calibration_scale"] = scale

    # Update the Dataset attributes
    ds.attrs.update(global_attributes)  # type: ignore

    # Add some site attributes
    site_attributes = _site_info_attributes(site.upper(), network)
    ds.attrs.update(site_attributes)

    # Species-specific attributes
    # Long name
    if (species_upper.startswith("D") and species_upper != "DESFLURANE") or species_upper == "APO":
        sp_long = species_translator[species_upper]["name"]
    elif species_upper == "RN":
        sp_long = "radioactivity_concentration_of_222Rn_in_air"
    elif species_upper in species_translator:
        name = species_translator[species_upper]["name"]
        sp_long = f"mole_fraction_of_{name}_in_air"
    else:
        sp_long = f"mole_fraction_of_{species_label}_in_air"

    ancillary_variables = []

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var.lower()]

    # Write units as attributes to variables containing any of these
    match_words = ["variability", "repeatability", "stdev", "count"]

    for key in variable_names:
        key = key.lower()

        if species_label.lower() in key:
            # Standard name attribute
            # ds[key].attrs["standard_name"]=key.replace(species_label, sp_long)
            ds[key].attrs["long_name"] = key.replace(species_label, sp_long)

            # If units are required for variable, add attribute
            if key == species_label or any(word in key for word in match_words):
                if units is not None:
                    if units in unit_interpret:
                        ds[key].attrs["units"] = unit_interpret[units]
                    else:
                        ds[key].attrs["units"] = unit_interpret["else"]
                else:
                    # TODO - merge these species attributes into a single simpler JSON
                    try:
                        ds[key].attrs["units"] = unit_species[species_upper]
                    except KeyError:
                        try:
                            ds[key].attrs["units"] = species_attrs[species_label.upper()]["units"]
                        except KeyError:
                            ds[key].attrs["units"] = "NA"

                # If units are non-standard, add explanation
                if species_upper in unit_species_long:
                    ds[key].attrs["units_description"] = unit_species_long[species_upper]

            # Add to list of ancillary variables
            if key != species_label:
                ancillary_variables.append(key)

    # TODO - for the moment skip this step - check status of ancillary variables in standard
    # Write ancillary variable list
    # ds[species_label].attrs["ancillary_variables"] = ", ".join(ancillary_variables)

    # Add quality flag attributes
    # NOTE - I've removed the whitespace before status_flag and integration_flag here
    variable_names = cast(Dict[str, Any], ds.variables)
    quality_flags = [key for key in variable_names if "status_flag" in key]

    # Not getting long_name for c2f6
    for key in quality_flags:
        ds[key] = ds[key].astype(int)
        try:
            long_name = ds[species_label].attrs["long_name"]
        except KeyError:
            raise KeyError(key, quality_flags)

        ds[key].attrs = {
            "flag_meaning": "0 = unflagged, 1 = flagged",
            "long_name": f"{long_name} status_flag",
        }

    variable_names = cast(Dict[str, Any], ds.variables)
    # Add integration flag attributes
    integration_flags = [key for key in variable_names if "integration_flag" in key]

    for key in integration_flags:
        ds[key] = ds[key].astype(int)
        long_name = ds[species_label].attrs["long_name"]

        ds[key].attrs = {
            "flag_meaning": "0 = area, 1 = height",
            "standard_name": f"{long_name} integration_flag",
            "comment": "GC peak integration method (by height or by area). Does not indicate data quality",
        }

    # Set time encoding
    # Check if there are duplicate time stamps

    # I feel there should be a more pandas way of doing this
    # but xarray doesn't currently have a duplicates method
    # See this https://github.com/pydata/xarray/issues/2108
    # if len(set(ds.time.values)) < len(ds.time.values):
    # if len(np_unique(ds.time.values)) < len(ds.time.values):
    #     print("WARNING. Duplicate time stamps")

    first_year = pd_Timestamp(str(ds.time[0].values)).year

    ds.time.encoding = {"units": f"seconds since {str(first_year)}-01-01 00:00:00"}

    time_attributes: Dict[str, str] = {}
    time_attributes["label"] = "left"
    time_attributes["standard_name"] = "time"
    time_attributes["comment"] = (
        "Time stamp corresponds to beginning of sampling period. "
        + "Time since midnight UTC of reference date. "
        + "Note that sampling periods are approximate."
    )

    if sampling_period is not None:
        time_attributes["sampling_period_seconds"] = sampling_period

    ds.time.attrs.update(time_attributes)

    # If a date range is specified, slice dataset
    if date_range:
        ds = ds.loc[dict(time=slice(*date_range))]

    return ds
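# Hedged usage sketch for get_attributes; assumes openghg is installed so the bundled
# JSON metadata files (attributes.json, acrg_site_info.json, ...) can be loaded, and
# uses a tiny made-up CH4 Dataset:
#
#     import pandas as pd
#     import xarray as xr
#
#     times = pd.date_range("2019-01-01", periods=3, freq="1H")
#     ds = xr.Dataset({"ch4": ("time", [1900.1, 1900.5, 1901.0])}, coords={"time": times})
#     ds = get_attributes(ds, species="ch4", site="BSD", network="DECC", sampling_period="60")
#     print(ds.attrs["species"], ds.ch4.attrs["long_name"])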
def _read_raw_data(
    data_filepath: Path,
    species: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str = "flask",
) -> Dict:
    """Separates the gases stored in the dataframe into
    separate dataframes and returns a dictionary of gases
    with the processed data and associated metadata.

    Args:
        data_filepath: Path of datafile
        species: Species string such as CH4, CO
        inlet: Inlet height
        sampling_period: Sampling period
        measurement_type: Type of measurements e.g. flask
    Returns:
        dict: Dictionary containing attributes, data and metadata keys
    """
    from openghg.util import clean_string, read_header, load_json
    from pandas import read_csv, Timestamp

    header = read_header(filepath=data_filepath)

    column_names = header[-1][14:].split()

    def date_parser(year: str, month: str, day: str, hour: str, minute: str, second: str) -> Timestamp:
        return Timestamp(year, month, day, hour, minute, second)

    date_parsing = {
        "time": [
            "sample_year",
            "sample_month",
            "sample_day",
            "sample_hour",
            "sample_minute",
            "sample_seconds",
        ]
    }

    data_types = {
        "sample_year": int,
        "sample_month": int,
        "sample_day": int,
        "sample_hour": int,
        "sample_minute": int,
        "sample_seconds": int,
    }

    # Number of header lines to skip
    n_skip = len(header)

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        names=column_names,
        sep=r"\s+",
        dtype=data_types,
        parse_dates=date_parsing,
        date_parser=date_parser,
        index_col="time",
        skipinitialspace=True,
    )

    # Drop duplicates
    data = data.loc[~data.index.duplicated(keep="first")]

    # Check if the index is sorted
    if not data.index.is_monotonic_increasing:
        data = data.sort_index()

    # Read the site code from the Dataframe
    site = str(data["sample_site_code"][0]).upper()

    site_data = load_json("acrg_site_info.json")
    # If this isn't a site we recognize try and read it from the filename
    if site not in site_data:
        site = str(data_filepath.name).split("_")[1].upper()

        if site not in site_data:
            raise ValueError(f"The site {site} is not recognized.")

    if species is not None:
        # If we're passed a species ensure that it is in fact the correct species
        data_species = str(data["parameter_formula"].values[0]).lower()

        passed_species = species.lower()
        if data_species != passed_species:
            raise ValueError(
                f"Mismatch between passed species ({passed_species}) and species read from data ({data_species})"
            )

    species = species.upper()

    flag = []
    selection_flag = []
    for flag_str in data.analysis_flag:
        flag.append(flag_str[0] == ".")
        selection_flag.append(int(flag_str[1] != "."))

    combined_data = {}

    data[species + "_status_flag"] = flag
    data[species + "_selection_flag"] = selection_flag

    data = data[data[species + "_status_flag"]]

    data = data[
        [
            "sample_latitude",
            "sample_longitude",
            "sample_altitude",
            "analysis_value",
            "analysis_uncertainty",
            species + "_selection_flag",
        ]
    ]

    rename_dict = {
        "analysis_value": species,
        "analysis_uncertainty": species + "_repeatability",
        "sample_longitude": "longitude",
        "sample_latitude": "latitude",
        "sample_altitude": "altitude",
    }

    data = data.rename(columns=rename_dict, inplace=False)
    data = data.to_xarray()

    # TODO - this could do with a better name
    noaa_params = load_json("attributes.json")["NOAA"]

    site_attributes = noaa_params["global_attributes"]
    site_attributes["inlet_height_magl"] = "NA"
    site_attributes["instrument"] = noaa_params["instrument"][species.upper()]
    site_attributes["sampling_period"] = sampling_period

    metadata = {}
    metadata["species"] = clean_string(species)
    metadata["site"] = site
    metadata["measurement_type"] = measurement_type
    metadata["network"] = "NOAA"
    metadata["inlet"] = inlet
    metadata["sampling_period"] = sampling_period
    metadata["instrument"] = noaa_params["instrument"][species.upper()]

    combined_data[species.lower()] = {
        "metadata": metadata,
        "data": data,
        "attributes": site_attributes,
    }

    return combined_data
def parse_npl(
    data_filepath: pathType,
    site: str = "NPL",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns a dictionary of gas data ready to be
    assigned to Datasources.

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        dict: Dictionary of gas data
    """
    from datetime import datetime
    from pathlib import Path
    from pandas import read_csv, NaT
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)
    site = "NPL"

    attributes_data = load_json(filename="attributes.json")
    npl_params = attributes_data["NPL"]

    # mypy doesn't like NaT or NaNs - look into this
    def parser(date: str):  # type: ignore
        try:
            return datetime.strptime(str(date), "%d/%m/%Y %H:%M")
        except ValueError:
            return NaT

    data = read_csv(data_filepath, index_col=0, date_parser=parser)

    # Drop the NaT/NaNs
    data = data.loc[data.index.dropna()]

    # Rename columns
    rename_dict = {"Cal_CO2_dry": "CO2", "Cal_CH4_dry": "CH4"}

    data = data.rename(columns=rename_dict)
    data.index.name = "time"

    if inlet is None:
        inlet = "NA"

    gas_data = {}
    for species in data.columns:
        processed_data = data.loc[:, [species]].sort_index().to_xarray()

        # Convert methane to ppb
        if species == "CH4":
            processed_data[species] *= 1000

        # No averaging applied to raw obs, set variability to 0 to allow get_obs
        # to calculate when averaging
        processed_data["{} variability".format(species)] = processed_data[species] * 0.0

        site_attributes = npl_params["global_attributes"]
        site_attributes["inlet_height_magl"] = npl_params["inlet"]
        site_attributes["instrument"] = npl_params["instrument"]

        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "NPL",
            "network": "LGHG",
            "inlet": inlet,
        }

        # TODO - add in better metadata reading
        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to
    the function and these keywords will be used to search the metadata associated
    with each Datasource. Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
            If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
            If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation
    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and instrument then just return the data
    # TODO - should instrument be added here
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )

    # Now we have the highest ranked data and the dateranges there are ranks for,
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme
    # Can use a key of date - inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
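# Hedged usage sketch for search; assumes a populated object store, and the keyword
# values below are illustrative only:
#
#     results = search(site="bsd", species="co2", inlet="248m",
#                      start_date="2015-01-01", end_date="2016-01-01")
#     # A SearchResults object is returned for timeseries data; data can then be
#     # retrieved as in get_obs_surface above:
#     obs = results.retrieve(site="bsd", species="co2", inlet="248m")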
def parse_beaco2n(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: str,
    instrument: Optional[str] = "shinyei",
    sampling_period: Optional[str] = None,
) -> Dict:
    """Read BEACO2N data files

    Args:
        data_filepath: Data filepath
        site: Site name
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
        sampling_period: Measurement sampling period
    Returns:
        dict: Dictionary of data
    """
    import pandas as pd
    from collections import defaultdict
    from openghg.util import clean_string, load_json

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)
    datetime_columns = {"time": ["datetime"]}
    use_cols = [1, 5, 6, 7, 8, 9, 10]
    na_values = [-999.0]

    site = clean_string(site)

    try:
        data = pd.read_csv(
            data_filepath,
            index_col="time",
            usecols=use_cols,
            parse_dates=datetime_columns,
            na_values=na_values,
        )
    except ValueError as e:
        raise ValueError(
            f"Unable to read data file, please make sure it is in the standard BEACO2N format.\nError: {e}"
        )

    beaco2n_site_data = load_json("beaco2n_site_data.json")

    try:
        site_metadata = beaco2n_site_data[site.upper()]
    except KeyError:
        raise ValueError(f"Site {site} not recognized.")

    site_metadata["comment"] = "Retrieved from http://beacon.berkeley.edu/"

    # Check which columns we have in the data and build the rename dict
    possible_rename_cols = {
        "PM_ug/m3": "pm",
        "PM_ug/m3_QC_level": "pm_qc",
        "co2_ppm": "co2",
        "co2_ppm_QC_level": "co2_qc",
        "co_ppm": "co",
        "co_ppm_QC_level": "co_qc",
    }
    # Not all columns are in data from different sites, i.e. Glasgow has a CO column
    rename_cols = {k: v for k, v in possible_rename_cols.items() if k in data}

    data = data.rename(columns=rename_cols)

    # Read the columns available and make sure we have them to iterate over
    possible_measurement_types = ["pm", "co", "co2"]
    measurement_types = [c for c in possible_measurement_types if c in data]

    units = {"pm": "ug/m3", "co2": "ppm", "co": "ppm"}

    gas_data: DefaultDict[str, Dict[str, Union[DataFrame, Dict]]] = defaultdict(dict)
    for mt in measurement_types:
        m_data = data[[mt, f"{mt}_qc"]]
        m_data = m_data.dropna(axis="rows", subset=[mt])

        # Some sites don't have data for each type, skip that type if all NaNs
        if m_data.index.empty:
            continue

        m_data = m_data.to_xarray()

        species_metadata = {
            "units": units[mt],
            "site": site,
            "species": clean_string(mt),
            "inlet": clean_string(inlet),
            "network": "beaco2n",
            "sampling_period": str(sampling_period),
            "instrument": instrument,
        }

        gas_data[mt]["data"] = m_data
        gas_data[mt]["metadata"] = species_metadata
        gas_data[mt]["attributes"] = site_metadata

    # TODO - add CF Compliant attributes?

    return gas_data
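# Hedged usage sketch for parse_beaco2n; the filename and site name below are
# illustrative, and their presence in beaco2n_site_data.json is an assumption:
#
#     gas_data = parse_beaco2n(
#         data_filepath="175_EXAMPLESITE.csv",
#         site="EXAMPLESITE",
#         network="BEACO2N",
#         inlet="5m",
#     )
#     print(gas_data["co2"]["metadata"])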
def parse_eurocom(
    data_filepath: Union[str, Path],
    site: str,
    sampling_period: str,
    network: Optional[str] = None,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Parses EUROCOM data files into a format expected by OpenGHG

    Args:
        data_filepath: Path of file to read
        site: Site code
        sampling_period: Sampling period in seconds
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
    Returns:
        dict: Dictionary of measurement data
    """
    from openghg.standardise.meta import assign_attributes, get_attributes
    from pandas import read_csv, Timestamp
    from openghg.util import read_header, load_json

    data_filepath = Path(data_filepath)

    if site is None:
        site = data_filepath.stem.split("_")[0]

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filename = data_filepath.name
    inlet_height = filename.split("_")[1]

    if "m" not in inlet_height:
        inlet_height = "NA"

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    # Read the header as lines starting with #
    header = read_header(data_filepath, comment_char="#")
    n_skip = len(header) - 1
    species = "co2"

    def date_parser(year: str, month: str, day: str, hour: str, minute: str) -> Timestamp:
        return Timestamp(year=year, month=month, day=day, hour=hour, minute=minute)

    datetime_columns = {"time": ["Year", "Month", "Day", "Hour", "Minute"]}

    use_cols = [
        "Day",
        "Month",
        "Year",
        "Hour",
        "Minute",
        str(species.lower()),
        "SamplingHeight",
        "Stdev",
        "NbPoints",
    ]

    dtypes = {
        "Day": int,
        "Month": int,
        "Year": int,
        "Hour": int,
        "Minute": int,
        species.lower(): float,
        "Stdev": float,
        "SamplingHeight": float,
        "NbPoints": int,
    }

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        parse_dates=datetime_columns,
        date_parser=date_parser,
        index_col="time",
        sep=";",
        usecols=use_cols,
        dtype=dtypes,
        na_values="-999.99",
    )

    data = data[data[species.lower()] >= 0.0]
    data = data.dropna(axis="rows", how="any")
    # Drop duplicate indices
    data = data.loc[~data.index.duplicated(keep="first")]

    # Convert to xarray Dataset
    data = data.to_xarray()

    attributes_data = load_json(filename="attributes.json")
    eurocom_attributes = attributes_data["EUROCOM"]
    global_attributes = eurocom_attributes["global_attributes"]

    if inlet_height == "NA":
        try:
            inlet = eurocom_attributes["intake_height"][site]
            global_attributes["inlet_height_m"] = inlet
            calibration_scale = eurocom_attributes["calibration"][site]
        except KeyError:
            calibration_scale = {}
            raise ValueError(f"Unable to find inlet from filename or attributes file for {site}")
    else:
        # Use the inlet height read from the filename and look up the calibration scale
        global_attributes["inlet_height_m"] = inlet_height
        calibration_scale = eurocom_attributes["calibration"].get(site, {})

    gas_data = get_attributes(
        ds=data,
        species=species,
        site=site,
        global_attributes=global_attributes,
        units="ppm",
    )

    # Create the metadata dict
    metadata = {}
    metadata["site"] = site
    metadata["species"] = species
    metadata["inlet_height"] = global_attributes["inlet_height_m"]
    metadata["calibration_scale"] = calibration_scale
    metadata["network"] = "EUROCOM"
    metadata["sampling_period"] = str(sampling_period)

    combined_data[species] = {
        "metadata": metadata,
        "data": gas_data,
        "attributes": global_attributes,
    }

    combined_data = assign_attributes(data=combined_data, site=site, sampling_period=sampling_period)

    return combined_data
def _split_species(
    data: DataFrame,
    site: str,
    instrument: str,
    species: List,
    metadata: Dict,
    units: Dict,
    scale: Dict,
    gc_params: Dict,
) -> Dict:
    """Splits the data into sections by species so that each can be stored
    within an individual Datasource.

    Args:
        data: DataFrame of raw data
        site: Name of site from which this data originates
        instrument: Name of instrument
        species: List of species contained in data
        metadata: Dictionary of metadata
        units: Dictionary of units for each species
        scale: Dictionary of scales for each species
        gc_params: GCWERKS parameter dictionary
    Returns:
        dict: Dictionary of gas data and metadata
    """
    from addict import Dict as aDict
    from fnmatch import fnmatch
    from openghg.util import load_json, clean_string

    # Load species translator so we can keep species names consistent
    attributes_data = load_json("attributes.json")
    species_translator = attributes_data["species_translation"]

    # Read inlets from the parameters
    expected_inlets = _get_inlets(site_code=site, gc_params=gc_params)

    try:
        data_inlets = data["Inlet"].unique().tolist()
    except KeyError:
        raise KeyError(
            "Unable to read inlets from data, please ensure this data is of the GC type expected by this retrieve module"
        )

    combined_data = aDict()

    for spec in species:
        # Skip this species if the data is all NaNs
        if data[spec].isnull().all():
            continue

        # Here inlet is the inlet in the data and inlet_label is the label we want to use as metadata
        for inlet, inlet_label in expected_inlets.items():
            # Create a copy of metadata for local modification
            spec_metadata = metadata.copy()

            spec_metadata["units"] = units[spec]
            spec_metadata["scale"] = scale[spec]

            # If we've only got a single inlet
            if inlet == "any" or inlet == "air":
                spec_data = data[
                    [
                        spec,
                        spec + " repeatability",
                        spec + " status_flag",
                        spec + " integration_flag",
                        "Inlet",
                    ]
                ]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            elif "date" in inlet:
                dates = inlet.split("_")[1:]
                data_sliced = data.loc[dates[0]: dates[1]]

                spec_data = data_sliced[
                    [
                        spec,
                        spec + " repeatability",
                        spec + " status_flag",
                        spec + " integration_flag",
                        "Inlet",
                    ]
                ]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            else:
                # Find the inlet
                matching_inlets = [i for i in data_inlets if fnmatch(i, inlet)]

                if not matching_inlets:
                    continue

                # Only set the label in metadata when we have the correct label
                spec_metadata["inlet"] = inlet_label
                # There should only be one matching label
                select_inlet = matching_inlets[0]
                # Take only data for this inlet from the dataframe
                inlet_data = data.loc[data["Inlet"] == select_inlet]

                spec_data = inlet_data[
                    [
                        spec,
                        spec + " repeatability",
                        spec + " status_flag",
                        spec + " integration_flag",
                        "Inlet",
                    ]
                ]
                spec_data = spec_data.dropna(axis="index", how="any")

            # Now we drop the inlet column
            spec_data = spec_data.drop("Inlet", axis="columns")

            # Check that the Dataframe has something in it
            if spec_data.empty:
                continue

            attributes = _get_site_attributes(site=site, inlet=inlet_label, instrument=instrument, gc_params=gc_params)
            attributes = attributes.copy()

            # We want an xarray Dataset
            spec_data = spec_data.to_xarray()

            # Create a standardised / cleaned species label
            try:
                comp_species = species_translator[spec.upper()]["chem"]
            except KeyError:
                comp_species = clean_string(spec.lower())

            # Add the cleaned species name to the metadata and alternative name if present
            spec_metadata["species"] = comp_species
            if comp_species != spec.lower() and comp_species != spec.upper():
                spec_metadata["species_alt"] = spec

            # Rename variables so they have lowercase and alphanumeric names
            to_rename = {}
            for var in spec_data.variables:
                if spec in var:
                    new_name = var.replace(spec, comp_species)
                    to_rename[var] = new_name

            spec_data = spec_data.rename(to_rename)

            # As a single species may have measurements from multiple inlets we
            # use the species and inlet as a key
            data_key = f"{comp_species}_{inlet_label}"

            combined_data[data_key]["metadata"] = spec_metadata
            combined_data[data_key]["data"] = spec_data
            combined_data[data_key]["attributes"] = attributes

    to_return: Dict = combined_data.to_dict()

    return to_return