def parse_crds(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Parses CRDS data files and returns a dictionary of gas data
    ready to be stored within Datasources

    Args:
        data_filepath: Path to file
        site: Three letter site code
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Sampling period e.g. 2 hour: 2H, 2 minute: 2m
        measurement_type: Measurement type e.g. insitu, flask
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes

    if not isinstance(data_filepath, Path):
        data_filepath = Path(data_filepath)

    # This may seem like an almost pointless function as this is all we do
    # but it makes it a lot easier to test assign_attributes
    gas_data = _read_data(
        data_filepath=data_filepath,
        site=site,
        network=network,
        inlet=inlet,
        instrument=instrument,
        sampling_period=sampling_period,
        measurement_type=measurement_type,
    )

    # Ensure the data is CF compliant
    gas_data = assign_attributes(data=gas_data, site=site, sampling_period=sampling_period)

    return gas_data
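
# A minimal usage sketch for parse_crds. The filename and the site/network
# values below are hypothetical and assume a CRDS minute-mean data file is
# available locally:
#
#   gas_data = parse_crds(
#       data_filepath="bsd.picarro.1minute.248m.dat",
#       site="BSD",
#       network="DECC",
#       inlet="248m",
#   )
#   for species, entry in gas_data.items():
#       print(species, entry["metadata"])
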
def _read_raw_file(
    data_filepath: Union[str, Path],
    site: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads NOAA raw data files and returns a dictionary of processed
    data and metadata. The species is read from the filename.

    Args:
        data_filepath: Path of file to load
        site: Three letter site code
        inlet: Inlet height
        sampling_period: Sampling period
        measurement_type: Measurement type e.g. flask, insitu
        instrument: Instrument name
    Returns:
        dict: Dictionary of processed data and metadata
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes

    data_filepath = Path(data_filepath)

    filename = data_filepath.name
    species = filename.split("_")[0].lower()

    source_name = data_filepath.stem
    source_name = source_name.split("-")[0]

    gas_data = _read_raw_data(
        data_filepath=data_filepath,
        inlet=inlet,
        species=species,
        measurement_type=measurement_type,
        sampling_period=sampling_period,
    )

    gas_data = assign_attributes(data=gas_data, site=site, network="NOAA")

    return gas_data
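
# A minimal usage sketch for _read_raw_file; the filename below is
# hypothetical but follows the NOAA naming scheme the parser relies on,
# with the species as the first underscore-separated token (here "ch4"):
#
#   gas_data = _read_raw_file(
#       data_filepath="ch4_brw_surface-insitu_1_ccgg_event.txt",
#       site="BRW",
#       inlet="16m",
#       sampling_period="NOT_SET",
#       measurement_type="insitu",
#   )
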
def parse_eurocom(
    data_filepath: Union[str, Path],
    site: str,
    sampling_period: str,
    network: Optional[str] = None,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Parses EUROCOM data files into a format expected by OpenGHG

    Args:
        data_filepath: Path of file to read
        site: Site code
        sampling_period: Sampling period in seconds
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
    Returns:
        dict: Dictionary of measurement data
    """
    from pathlib import Path
    from pandas import read_csv, Timestamp
    from openghg.standardise.meta import assign_attributes, get_attributes
    from openghg.util import read_header, load_json

    data_filepath = Path(data_filepath)

    if site is None:
        site = data_filepath.stem.split("_")[0]

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filename = data_filepath.name
    inlet_height = filename.split("_")[1]

    if "m" not in inlet_height:
        inlet_height = "NA"

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    # Read the header as lines starting with #
    header = read_header(data_filepath, comment_char="#")
    n_skip = len(header) - 1
    species = "co2"

    def date_parser(year: str, month: str, day: str, hour: str, minute: str) -> Timestamp:
        return Timestamp(year=int(year), month=int(month), day=int(day), hour=int(hour), minute=int(minute))

    datetime_columns = {"time": ["Year", "Month", "Day", "Hour", "Minute"]}

    use_cols = [
        "Day",
        "Month",
        "Year",
        "Hour",
        "Minute",
        str(species.lower()),
        "SamplingHeight",
        "Stdev",
        "NbPoints",
    ]

    dtypes = {
        "Day": int,
        "Month": int,
        "Year": int,
        "Hour": int,
        "Minute": int,
        species.lower(): float,
        "Stdev": float,
        "SamplingHeight": float,
        "NbPoints": int,
    }

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        parse_dates=datetime_columns,
        date_parser=date_parser,
        index_col="time",
        sep=";",
        usecols=use_cols,
        dtype=dtypes,
        na_values="-999.99",
    )

    data = data[data[species.lower()] >= 0.0]
    data = data.dropna(axis="rows", how="any")
    # Drop duplicate indices
    data = data.loc[~data.index.duplicated(keep="first")]

    # Convert to xarray Dataset
    data = data.to_xarray()

    attributes_data = load_json(filename="attributes.json")
    eurocom_attributes = attributes_data["EUROCOM"]
    global_attributes = eurocom_attributes["global_attributes"]

    # Take the inlet from the filename if we can, otherwise fall back to the
    # attributes file
    if inlet_height == "NA":
        try:
            inlet = eurocom_attributes["intake_height"][site]
        except KeyError:
            raise ValueError(f"Unable to find inlet from filename or attributes file for {site}")
    else:
        inlet = inlet_height

    global_attributes["inlet_height_m"] = inlet
    # The calibration scale may not be listed for every site
    calibration_scale = eurocom_attributes["calibration"].get(site, {})

    gas_data = get_attributes(
        ds=data,
        species=species,
        site=site,
        global_attributes=global_attributes,
        units="ppm",
    )

    # Build the metadata dict for this species
    metadata = {}
    metadata["site"] = site
    metadata["species"] = species
    metadata["inlet_height"] = global_attributes["inlet_height_m"]
    metadata["calibration_scale"] = calibration_scale
    metadata["network"] = "EUROCOM"
    metadata["sampling_period"] = str(sampling_period)

    combined_data[species] = {
        "metadata": metadata,
        "data": gas_data,
        "attributes": global_attributes,
    }

    combined_data = assign_attributes(data=combined_data, site=site, sampling_period=sampling_period)

    return combined_data
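
# A minimal usage sketch for parse_eurocom; the filename is hypothetical but
# follows the site_inlet_... underscore-separated pattern the parser reads
# the inlet height from:
#
#   data = parse_eurocom(
#       data_filepath="HEI_30m_co2.csv",
#       site="HEI",
#       sampling_period="3600",
#   )
#   print(data["co2"]["metadata"]["inlet_height"])
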
def parse_btt(
    data_filepath: Union[str, Path],
    site: Optional[str] = "BTT",
    network: Optional[str] = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads BTT data files and returns a dictionary of processed
    gas data ready for assignment to Datasources

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from pandas import read_csv, Timestamp, to_timedelta, isnull
    from numpy import nan as np_nan
    from openghg.util import clean_string, load_json

    # TODO: Decide what to do about inputs which aren't used anywhere
    # at present - inlet, instrument, sampling_period, measurement_type

    data_filepath = Path(data_filepath)

    site = "BTT"

    # Rename these columns
    rename_dict = {"co2.cal": "CO2", "ch4.cal.ppb": "CH4"}
    # We only want these species
    species_extract = ["CO2", "CH4"]
    # Take std-dev measurements from these columns for these species
    species_sd = {"CO2": "co2.sd.ppm", "CH4": "ch4.sd.ppb"}

    param_data = load_json(filename="attributes.json")
    network_params = param_data["BTT"]

    sampling_period = int(network_params["sampling_period"])
    sampling_period_seconds = str(sampling_period) + "s"

    data = read_csv(data_filepath)
    data["time"] = Timestamp("2019-01-01 00:00") + to_timedelta(data["DOY"] - 1, unit="D")
    data["time"] = data["time"].dt.round(sampling_period_seconds)
    data = data[~isnull(data.time)]
    data = data.rename(columns=rename_dict)
    data = data.set_index("time")

    gas_data = {}
    for species in species_extract:
        processed_data = data.loc[:, [species]].sort_index()
        # Create a variability column
        species_stddev_label = species_sd[species]
        processed_data[f"{species} variability"] = data[species_stddev_label]

        # Replace any values below zero with NaNs
        processed_data[processed_data < 0] = np_nan
        # Drop NaNs
        processed_data = processed_data.dropna()
        # Convert to a Dataset
        processed_data = processed_data.to_xarray()

        site_attributes = network_params["global_attributes"]
        site_attributes["inlet_height_magl"] = network_params["inlet"]
        site_attributes["instrument"] = network_params["instrument"]
        site_attributes["sampling_period"] = sampling_period

        # TODO - add in better metadata reading
        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "BTT",
        }

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
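
# A minimal usage sketch for parse_btt; the CSV filename is hypothetical and
# the file is assumed to contain the DOY, co2.cal, ch4.cal.ppb and *.sd.*
# columns the parser expects:
#
#   gas_data = parse_btt(data_filepath="BTT_2019_obs.csv")
#   print(gas_data["CO2"]["data"])
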
def _read_obspack(
    data_filepath: Union[str, Path],
    site: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str,
    instrument: Optional[str] = None,
) -> Dict[str, Dict]:
    """Read NOAA ObsPack NetCDF files

    Args:
        data_filepath: Path to file
        site: Three letter site code
        inlet: Inlet height, if no height use measurement type e.g. flask
        sampling_period: Sampling period
        measurement_type: One of flask, insitu or pfp
        instrument: Instrument name
    Returns:
        dict: Dictionary of results
    """
    import xarray as xr
    from openghg.util import clean_string
    from openghg.standardise.meta import assign_attributes

    valid_types = ("flask", "insitu", "pfp")

    if measurement_type not in valid_types:
        raise ValueError(f"measurement_type must be one of {valid_types}")

    obspack_ds = xr.open_dataset(data_filepath)

    # Want to find and drop any duplicate time values from the original dataset
    # Using xarray directly we have to do this in a slightly convoluted way as it's
    # not well built into the xarray workflow yet - https://github.com/pydata/xarray/pull/5239
    # - can use da.drop_duplicates() but only on one variable at a time and not on the whole Dataset
    # This method keeps attributes for each of the variables including units
    # The dimension within the original dataset is called "obs" and has no associated coordinates

    # Extract time from the original Dataset (dimension is "obs")
    time = obspack_ds.time
    # To keep the associated "obs" dimension, assign coordinate values to it (just 0..len(obs))
    time = time.assign_coords(obs=obspack_ds.obs)
    # Make "time" the primary dimension (while retaining "obs") and add "time" values as coordinates
    time = time.swap_dims(dims_dict={"obs": "time"})
    time = time.assign_coords(time=time)

    # Drop any duplicate time values and extract the associated "obs" values
    # TODO: Work out what to do with duplicates - may be genuine multiple measurements
    time_unique = time.drop_duplicates(dim="time", keep="first")
    obs_unique = time_unique.obs

    # Estimate the sampling period using metadata and the midpoint time
    if sampling_period == "NOT_SET":
        sampling_period_estimate = _estimate_sampling_period(obspack_ds)
    else:
        sampling_period_estimate = -1.0

    species = clean_string(obspack_ds.attrs["dataset_parameter"])
    network = "NOAA"

    # Use these obs values to filter the original dataset to remove any repeated times
    processed_ds = obspack_ds.sel(obs=obs_unique)
    processed_ds = processed_ds.set_coords(["time"])

    # Rename variables to match our internal standard
    # "value_std_dev" --> f"{species}_variability"
    # "value_unc" --> ??
    # TODO: Clarify what "value_unc" should be renamed to
    variable_names = {
        "value": species,
        "value_std_dev": f"{species}_variability",
        "value_unc": f"{species}_variability",  # May need to be updated
        "nvalue": f"{species}_number_of_observations",
    }

    to_extract = [name for name in variable_names.keys() if name in processed_ds]
    name_dict = {name: key for name, key in variable_names.items() if name in to_extract}

    if not to_extract:
        wanted = variable_names.keys()
        raise ValueError(
            "No valid data columns found in converted DataFrame. "
            f"We expect the following data variables in the passed NetCDF: {wanted}"
        )

    processed_ds = processed_ds[to_extract]
    processed_ds = processed_ds.rename(name_dict)
    processed_ds = processed_ds.sortby("time")

    try:
        # Extract units attribute from the value data variable
        units = processed_ds[species].units
    except (KeyError, AttributeError):
        print("Unable to extract units from 'value' within input dataset")
        units = None
    else:
        unit_interpretation = {
            "mol mol-1": "1",
            "millimol mol-1": "1e-3",
            "micromol mol-1": "1e-6",
            "nmol mol-1": "1e-9",
            "pmol mol-1": "1e-12",
        }
        if units in unit_interpretation:
            units = unit_interpretation[units]
        else:
            print(f"Using unit {units} directly")
            # raise ValueError(f"Did not recognise input units from file: {units}")

    metadata = {}
    metadata["site"] = site
    metadata["inlet"] = inlet
    metadata["network"] = network
    metadata["measurement_type"] = measurement_type
    metadata["species"] = species
    metadata["sampling_period"] = sampling_period

    if units is not None:
        metadata["units"] = units

    if instrument is not None:
        metadata["instrument"] = instrument
    else:
        try:
            metadata["instrument"] = obspack_ds.attrs["instrument"]
        except KeyError:
            pass

    if sampling_period_estimate >= 0.0:
        # Convert to string to keep consistent with "sampling_period"
        metadata["sampling_period_estimate"] = str(sampling_period_estimate)

    # TODO: At the moment all attributes from the NOAA ObsPack are being copied
    # plus any variables we're adding.
    # - decide if we want to reduce this
    attributes = obspack_ds.attrs
    attributes["sampling_period"] = sampling_period
    if sampling_period_estimate >= 0.0:
        attributes["sampling_period_estimate"] = str(sampling_period_estimate)

    gas_data = {
        species: {
            "data": processed_ds,
            "metadata": metadata,
            "attributes": attributes,
        }
    }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
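
# A minimal usage sketch for _read_obspack; the filename is hypothetical but
# follows the NOAA ObsPack NetCDF naming convention:
#
#   gas_data = _read_obspack(
#       data_filepath="ch4_esp_surface-flask_2_representative.nc",
#       site="ESP",
#       inlet="flask",
#       sampling_period="NOT_SET",
#       measurement_type="flask",
#   )
#   species, entry = next(iter(gas_data.items()))
#   print(species, entry["metadata"].get("sampling_period_estimate"))
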
def parse_npl(
    data_filepath: pathType,
    site: str = "NPL",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns a dictionary of processed
    gas data ready for assignment to Datasources

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from datetime import datetime
    from pandas import read_csv, NaT
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)

    site = "NPL"

    attributes_data = load_json(filename="attributes.json")
    npl_params = attributes_data["NPL"]

    # mypy doesn't like NaT or NaNs - look into this
    def parser(date: str):  # type: ignore
        try:
            return datetime.strptime(str(date), "%d/%m/%Y %H:%M")
        except ValueError:
            return NaT

    data = read_csv(data_filepath, index_col=0, date_parser=parser)

    # Drop the NaT/NaNs
    data = data.loc[data.index.dropna()]

    # Rename columns
    rename_dict = {"Cal_CO2_dry": "CO2", "Cal_CH4_dry": "CH4"}
    data = data.rename(columns=rename_dict)
    data.index.name = "time"

    if inlet is None:
        inlet = "NA"

    gas_data = {}
    for species in data.columns:
        processed_data = data.loc[:, [species]].sort_index().to_xarray()

        # Convert methane to ppb
        if species == "CH4":
            processed_data[species] *= 1000

        # No averaging applied to raw obs, set variability to 0 to allow
        # get_obs to calculate it when averaging
        processed_data[f"{species} variability"] = processed_data[species] * 0.0

        site_attributes = npl_params["global_attributes"]
        site_attributes["inlet_height_magl"] = npl_params["inlet"]
        site_attributes["instrument"] = npl_params["instrument"]

        # TODO - add in better metadata reading
        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "NPL",
            "network": "LGHG",
            "inlet": inlet,
        }

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
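
# A minimal usage sketch for parse_npl; the filename is hypothetical and the
# CSV is assumed to contain Cal_CO2_dry and Cal_CH4_dry columns indexed by a
# dd/mm/YYYY HH:MM timestamp:
#
#   gas_data = parse_npl(data_filepath="NPL_2020_obs.csv", inlet="20m")
#   print(gas_data["CH4"]["metadata"])
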
def parse_gcwerks(
    data_filepath: Union[str, Path],
    precision_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads a GCWERKS data file and its accompanying precision file
    and returns a dictionary of processed gas data

    Args:
        data_filepath: Path of data file
        precision_filepath: Path of precision file
        site: Three letter code or name for site
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
    Returns:
        dict: Dictionary of gas data
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    data_filepath = Path(data_filepath)
    precision_filepath = Path(precision_filepath)

    # Do some setup for processing
    # Load site data
    gcwerks_data = load_json(filename="process_gcwerks_parameters.json")
    gc_params = gcwerks_data["GCWERKS"]

    network = clean_string(network)
    # We don't currently do anything with inlet here as it's always read from data
    # or taken from process_gcwerks_parameters.json
    if inlet is not None:
        inlet = clean_string(inlet)
    if instrument is not None:
        instrument = clean_string(instrument)

    # Check if the site code passed matches that read from the filename
    site = _check_site(
        filepath=data_filepath,
        site_code=site,
        gc_params=gc_params,
    )

    # If we're not passed the instrument name and we can't find it, raise an error
    if instrument is None:
        instrument = _check_instrument(filepath=data_filepath, gc_params=gc_params, should_raise=True)
    else:
        fname_instrument = _check_instrument(
            filepath=data_filepath, gc_params=gc_params, should_raise=False
        )

        if fname_instrument is not None and instrument != fname_instrument:
            raise ValueError(
                f"Mismatch between instrument passed as argument {instrument} "
                f"and instrument read from filename {fname_instrument}"
            )

    instrument = str(instrument)

    gas_data = _read_data(
        data_filepath=data_filepath,
        precision_filepath=precision_filepath,
        site=site,
        instrument=instrument,
        network=network,
        sampling_period=sampling_period,
        gc_params=gc_params,
    )

    # Assign attributes to the data for CF compliant NetCDFs
    gas_data = assign_attributes(data=gas_data, site=site)

    return gas_data
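
# A minimal usage sketch for parse_gcwerks; both filenames are hypothetical
# but follow the site-instrument naming that the _check_site and
# _check_instrument helpers parse:
#
#   gas_data = parse_gcwerks(
#       data_filepath="capegrim-medusa.18.C",
#       precision_filepath="capegrim-medusa.18.precisions.C",
#       site="CGO",
#       network="AGAGE",
#   )
#   for species, entry in gas_data.items():
#       print(species, entry["metadata"])
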