Example #1
    def read_file(
        filepath: Union[str, Path],
        species: str,
        source: str,
        domain: str,
        date: str,
        high_time_resolution: Optional[bool] = False,
        period: Optional[str] = None,
        overwrite: bool = False,
    ) -> Dict:
        """Read emissions file

        Args:
            filepath: Path of emissions file
            species: Species name
            source: Emissions source
            domain: Emissions domain
            date: Date associated with the emissions data, e.g. a year such as "2012"
            high_time_resolution: If this is a high time resolution file
            period: Period of measurements. If not passed this is inferred from the time coords
            overwrite: Should this data overwrite currently stored data.
        """
        from collections import defaultdict
        from xarray import open_dataset
        from openghg.store import assign_data
        from openghg.util import (
            clean_string,
            hash_file,
            timestamp_tzaware,
            timestamp_now,
        )

        species = clean_string(species)
        source = clean_string(source)
        domain = clean_string(domain)
        date = clean_string(date)

        filepath = Path(filepath)

        em_store = Emissions.load()

        file_hash = hash_file(filepath=filepath)
        if file_hash in em_store._file_hashes and not overwrite:
            raise ValueError(
                f"This file has been uploaded previously with the filename : {em_store._file_hashes[file_hash]}."
            )

        em_data = open_dataset(filepath)

        # Some attributes are numpy types we can't serialise to JSON so convert them
        # to their native types here
        attrs = {}
        for key, value in em_data.attrs.items():
            try:
                attrs[key] = value.item()
            except AttributeError:
                attrs[key] = value

        author_name = "OpenGHG Cloud"
        em_data.attrs["author"] = author_name

        metadata = {}
        metadata.update(attrs)

        metadata["species"] = species
        metadata["domain"] = domain
        metadata["source"] = source
        metadata["date"] = date
        metadata["author"] = author_name
        metadata["processed"] = str(timestamp_now())

        metadata["start_date"] = str(timestamp_tzaware(em_data.time[0].values))
        metadata["end_date"] = str(timestamp_tzaware(em_data.time[-1].values))

        metadata["max_longitude"] = round(float(em_data["lon"].max()), 5)
        metadata["min_longitude"] = round(float(em_data["lon"].min()), 5)
        metadata["max_latitude"] = round(float(em_data["lat"].max()), 5)
        metadata["min_latitude"] = round(float(em_data["lat"].min()), 5)

        metadata["time_resolution"] = "high" if high_time_resolution else "standard"

        if period is not None:
            metadata["time_period"] = period

        key = "_".join((species, source, domain, date))

        emissions_data: DefaultDict[str, Dict[str, Union[Dict, Dataset]]] = defaultdict(dict)
        emissions_data[key]["data"] = em_data
        emissions_data[key]["metadata"] = metadata

        keyed_metadata = {key: metadata}

        lookup_results = em_store.datasource_lookup(metadata=keyed_metadata)

        data_type = "emissions"
        datasource_uuids = assign_data(
            data_dict=emissions_data,
            lookup_results=lookup_results,
            overwrite=overwrite,
            data_type=data_type,
        )

        em_store.add_datasources(datasource_uuids=datasource_uuids, metadata=keyed_metadata)

        # Record the file hash in case we see this file again
        em_store._file_hashes[file_hash] = filepath.name

        em_store.save()

        return datasource_uuids
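
A minimal usage sketch for the method above. The filename and argument values are illustrative, and the `openghg.store.Emissions` import path is an assumption:

    from openghg.store import Emissions  # assumed import path

    # Standardise a (hypothetical) annual CH4 flux file into the object store
    uuids = Emissions.read_file(
        filepath="ch4_anthro_europe_2012.nc",  # hypothetical filename
        species="ch4",
        source="anthro",
        domain="europe",
        date="2012",
    )
    print(uuids)  # Datasource UUIDs returned by assign_data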
Example #2
    def read_file(
        filepath: Union[str, Path],
        model: str,
        species: str,
        start_date: Optional[str] = None,
        end_date: Optional[str] = None,
        setup: Optional[str] = None,
        overwrite: bool = False,
    ) -> Dict:
        """Read Eulerian model output

        Args:
            filepath: Path of Eulerian model species output
            model: Eulerian model name
            species: Species name
            start_date: Start date (inclusive) associated with model run
            end_date: End date (exclusive) associated with model run
            setup: Additional setup details for run
            overwrite: Should this data overwrite currently stored data.
        """
        # TODO: As written, this currently includes some light assumptions that we're dealing with GEOSChem SpeciesConc format.
        # May need to split out into multiple modules (like with ObsSurface) or into separate retrieve functions as needed.

        from collections import defaultdict
        from openghg.util import (
            clean_string,
            hash_file,
            timestamp_now,
            timestamp_tzaware,
        )
        from openghg.store import assign_data
        from xarray import open_dataset
        from pandas import Timestamp as pd_Timestamp

        model = clean_string(model)
        species = clean_string(species)
        start_date = clean_string(start_date)
        end_date = clean_string(end_date)
        setup = clean_string(setup)

        filepath = Path(filepath)

        em_store = EulerianModel.load()

        file_hash = hash_file(filepath=filepath)
        if file_hash in em_store._file_hashes and not overwrite:
            raise ValueError(
                f"This file has been uploaded previously with the filename : {em_store._file_hashes[file_hash]}."
            )

        em_data = open_dataset(filepath)

        # Check necessary 4D coordinates are present and rename if necessary (for consistency)
        check_coords = {
            "time": ["time"],
            "lat": ["lat", "latitude"],
            "lon": ["lon", "longitude"],
            "lev": ["lev", "level", "layer", "sigma_level"],
        }
        for name, coord_options in check_coords.items():
            for coord in coord_options:
                if coord in em_data.coords:
                    break
            else:
                raise ValueError(
                    f"Input data must contain one of '{coord_options}' co-ordinates"
                )
            if name != coord:
                print("Renaming co-ordinate '{coord}' to '{name}'")
                em_data = em_data.rename({coord: name})

        attrs = em_data.attrs

        # author_name = "OpenGHG Cloud"
        # em_data.attrs["author"] = author_name

        metadata = {}
        metadata.update(attrs)

        metadata["model"] = model
        metadata["species"] = species
        metadata["processed"] = str(timestamp_now())

        if start_date is None:
            if len(em_data["time"]) > 1:
                start_date = str(timestamp_tzaware(em_data.time[0].values))
            else:
                try:
                    start_date = attrs["simulation_start_date_and_time"]
                except KeyError:
                    raise Exception(
                        "Unable to derive start_date from data, please provide as an input."
                    )
                else:
                    start_date = timestamp_tzaware(start_date)
                    start_date = str(start_date)

        if end_date is None:
            if len(em_data["time"]) > 1:
                end_date = str(timestamp_tzaware(em_data.time[-1].values))
            else:
                try:
                    end_date = attrs["simulation_end_date_and_time"]
                except KeyError:
                    raise Exception(
                        "Unable to derive `end_date` from data, please provide as an input."
                    )
                else:
                    end_date = timestamp_tzaware(end_date)
                    end_date = str(end_date)

        date = str(pd_Timestamp(start_date).date())

        metadata["date"] = date
        metadata["start_date"] = start_date
        metadata["end_date"] = end_date

        metadata["max_longitude"] = round(float(em_data["lon"].max()), 5)
        metadata["min_longitude"] = round(float(em_data["lon"].min()), 5)
        metadata["max_latitude"] = round(float(em_data["lat"].max()), 5)
        metadata["min_latitude"] = round(float(em_data["lat"].min()), 5)

        history = metadata.get("history")
        if history is None:
            history = ""
        metadata["history"] = history + f" {str(timestamp_now())} Processed onto OpenGHG cloud"

        key = "_".join((model, species, date))

        model_data: DefaultDict[str, Dict[str, Union[Dict, Dataset]]] = defaultdict(dict)
        model_data[key]["data"] = em_data
        model_data[key]["metadata"] = metadata

        keyed_metadata = {key: metadata}

        lookup_results = em_store.datasource_lookup(metadata=keyed_metadata)

        data_type = "eulerian_model"
        datasource_uuids = assign_data(
            data_dict=model_data,
            lookup_results=lookup_results,
            overwrite=overwrite,
            data_type=data_type,
        )

        em_store.add_datasources(datasource_uuids=datasource_uuids, metadata=keyed_metadata)

        # Record the file hash in case we see this file again
        em_store._file_hashes[file_hash] = filepath.name

        em_store.save()

        return datasource_uuids
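
A minimal usage sketch; the GEOSChem-style filename and the `openghg.store.EulerianModel` import path are assumptions:

    from openghg.store import EulerianModel  # assumed import path

    uuids = EulerianModel.read_file(
        filepath="GEOSChem.SpeciesConc.20150101_0000z.nc4",  # hypothetical filename
        model="geoschem",
        species="ch4",
    )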
Example #3
    def read_file(
        filepath: Union[str, Path],
        site: str,
        height: str,
        domain: str,
        model: str,
        metmodel: Optional[str] = None,
        species: Optional[str] = None,
        network: Optional[str] = None,
        retrieve_met: bool = False,
        overwrite: bool = False,
        high_res: bool = False,
        # model_params: Optional[Dict] = None,
    ) -> Dict[str, str]:
        """Reads footprints data files and returns the UUIDS of the Datasources
        the processed data has been assigned to

        Args:
            filepath: Path of file to load
            site: Site name
            height: Height above ground level in metres
            domain: Domain of footprints
            model: Model used to create the footprints e.g. NAME
            metmodel: Underlying meteorological model used e.g. UKV
            species: Species name
            network: Network name
            retrieve_met: Whether to also download meteorological data for this footprints area
            overwrite: Overwrite any currently stored data
            high_res: If this is a high spatial resolution footprints file
        Returns:
            dict: UUIDs of Datasources data has been assigned to
        """
        from collections import defaultdict
        from xarray import open_dataset
        from openghg.util import (
            hash_file,
            timestamp_tzaware,
            timestamp_now,
            clean_string,
        )
        from openghg.store import assign_data

        filepath = Path(filepath)

        site = clean_string(site)
        network = clean_string(network)
        height = clean_string(height)
        domain = clean_string(domain)

        fp = Footprints.load()

        file_hash = hash_file(filepath=filepath)
        if file_hash in fp._file_hashes and not overwrite:
            raise ValueError(
                f"This file has been uploaded previously with the filename : {fp._file_hashes[file_hash]}."
            )

        fp_data = open_dataset(filepath)

        # Need to read the metadata from the footprints and then store it
        # Do we need to chunk the footprints / will a Datasource store it correctly?
        metadata: Dict[str, Union[str, float, List[float]]] = {}

        metadata["data_type"] = "footprints"
        metadata["site"] = site
        metadata["height"] = height
        metadata["domain"] = domain
        metadata["model"] = model

        if species is not None:
            metadata["species"] = clean_string(species)

        if network is not None:
            metadata["network"] = clean_string(network)

        if metmodel is not None:
            metadata["metmodel"] = clean_string(metmodel)

        metadata["start_date"] = str(timestamp_tzaware(fp_data.time[0].values))
        metadata["end_date"] = str(timestamp_tzaware(fp_data.time[-1].values))

        metadata["max_longitude"] = round(float(fp_data["lon"].max()), 5)
        metadata["min_longitude"] = round(float(fp_data["lon"].min()), 5)
        metadata["max_latitude"] = round(float(fp_data["lat"].max()), 5)
        metadata["min_latitude"] = round(float(fp_data["lat"].min()), 5)
        metadata["time_resolution"] = "standard_time_resolution"

        # If it's a high resolution footprints file we'll have two sets of lat/long values
        if high_res:
            try:
                metadata["max_longitude_high"] = round(
                    float(fp_data["lon_high"].max()), 5)
                metadata["min_longitude_high"] = round(
                    float(fp_data["lon_high"].min()), 5)
                metadata["max_latitude_high"] = round(
                    float(fp_data["lat_high"].max()), 5)
                metadata["min_latitude_high"] = round(
                    float(fp_data["lat_high"].min()), 5)
                metadata["time_resolution"] = "high_time_resolution"
            except KeyError:
                raise KeyError("Unable to find lat_high or lon_high data.")

        metadata["heights"] = [float(h) for h in fp_data.height.values]
        # Do we also need to save all the variables we have available in this footprints?
        metadata["variables"] = list(fp_data.keys())

        # if model_params is not None:
        #     metadata["model_parameters"] = model_params

        # Set the attributes of this Dataset
        fp_data.attrs = {"author": "OpenGHG Cloud", "processed": str(timestamp_now())}

        # This might seem longwinded now but will help when we want to read
        # more than one footprints at a time
        key = "_".join((site, domain, model, height))

        footprint_data: DefaultDict[str, Dict[str, Union[Dict, Dataset]]] = defaultdict(dict)
        footprint_data[key]["data"] = fp_data
        footprint_data[key]["metadata"] = metadata

        # This will be removed when we process multiple files
        keyed_metadata = {key: metadata}

        lookup_results = fp.datasource_lookup(metadata=keyed_metadata)

        data_type = "footprints"
        datasource_uuids: Dict[str, str] = assign_data(
            data_dict=footprint_data,
            lookup_results=lookup_results,
            overwrite=overwrite,
            data_type=data_type,
        )

        fp.add_datasources(datasource_uuids=datasource_uuids, metadata=keyed_metadata)

        # Record the file hash in case we see this file again
        fp._file_hashes[file_hash] = filepath.name

        fp.save()

        return datasource_uuids
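
A minimal usage sketch; the footprint filename and argument values are illustrative, and the `openghg.store.Footprints` import path is an assumption:

    from openghg.store import Footprints  # assumed import path

    uuids = Footprints.read_file(
        filepath="TAC-100magl_EUROPE_201601.nc",  # hypothetical NAME footprint file
        site="tac",
        height="100m",
        domain="europe",
        model="name",
    )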
Example #4
def get_obs_surface(
    site: str,
    species: str,
    inlet: Optional[str] = None,
    start_date: Optional[Union[str, Timestamp]] = None,
    end_date: Optional[Union[str, Timestamp]] = None,
    average: Optional[str] = None,
    network: Optional[str] = None,
    instrument: Optional[str] = None,
    calibration_scale: Optional[str] = None,
    keep_missing: Optional[bool] = False,
    skip_ranking: Optional[bool] = False,
) -> ObsData:
    """Get measurements from one site.

    Args:
        site: Site of interest e.g. MHD for the Mace Head site.
        species: Species identifier e.g. ch4 for methane.
        start_date: Output start date in a format that Pandas can interpret
        end_date: Output end date in a format that Pandas can interpret
        inlet: Inlet label
        average: Averaging period for each dataset. Each value should be a string of
            the form e.g. "2H", "30min" (should match pandas offset aliases format).
        keep_missing: Keep missing data points or drop them.
        network: Network for the site/instrument.
        instrument: Specific instrument for the site.
        calibration_scale: Convert to this calibration scale
        skip_ranking: Whether to skip ranked data when searching
    Returns:
        ObsData: ObsData object
    """
    from pandas import Timestamp, Timedelta
    import numpy as np
    from xarray import concat as xr_concat
    from openghg.retrieve import search
    from openghg.store import recombine_datasets
    from openghg.util import clean_string, load_json, timestamp_tzaware

    site_info = load_json(filename="acrg_site_info.json")
    site = site.upper()

    if site not in site_info:
        raise ValueError(
            f"No site called {site}, please enter a valid site name.")

    # Find the correct synonym for the passed species
    species = clean_string(_synonyms(species))

    # Get the observation data
    obs_results = search(
        site=site,
        species=species,
        inlet=inlet,
        start_date=start_date,
        end_date=end_date,
        instrument=instrument,
        find_all=True,
        skip_ranking=skip_ranking,
    )

    if not obs_results:
        raise ValueError(f"Unable to find results for {species} at {site}")

    # TODO - for some reason mypy doesn't pick up the ObsData being returned here, look into this
    # GJ - 2021-07-19
    retrieved_data: ObsData = obs_results.retrieve(site=site, species=species, inlet=inlet)  # type: ignore
    data = retrieved_data.data

    if data.attrs["inlet"] == "multiple":
        data.attrs["inlet_height_magl"] = "multiple"
        retrieved_data.metadata["inlet"] = "multiple"

    if start_date is not None and end_date is not None:
        start_date_tzaware = timestamp_tzaware(start_date)
        end_date_tzaware = timestamp_tzaware(end_date)
        end_date_tzaware_exclusive = end_date_tzaware - Timedelta(
            1, unit="nanosecond"
        )  # Deduct 1 ns to make the end day (date) exclusive.

        # Slice the data to only cover the dates we're interested in
        data = data.sel(time=slice(start_date_tzaware, end_date_tzaware_exclusive))

    try:
        start_date_data = timestamp_tzaware(data.time[0].values)
        end_date_data = timestamp_tzaware(data.time[-1].values)
    except AttributeError:
        raise AttributeError(
            "This dataset does not have a time attribute, unable to read date range"
        )

    if average is not None:
        # GJ - 2021-03-09
        # TODO - check by RT

        # # Average the Dataset over a given period
        # if keep_missing is True:
        #     # Create a dataset with one element and NaNs to prepend or append
        #     ds_single_element = data[{"time": 0}]

        #     for v in ds_single_element.variables:
        #         if v != "time":
        #             ds_single_element[v].values = np.nan

        #     ds_concat = []

        #     # Pad with an empty entry at the start date
        #     if timestamp_tzaware(data.time.min()) > start_date:
        #         ds_single_element_start = ds_single_element.copy()
        #         ds_single_element_start.time.values = Timestamp(start_date)
        #         ds_concat.append(ds_single_element_start)

        #     ds_concat.append(data)

        #     # Pad with an empty entry at the end date
        #     if data.time.max() < Timestamp(end_date):
        #         ds_single_element_end = ds_single_element.copy()
        #         ds_single_element_end.time.values = Timestamp(end_date) - Timedelta("1ns")
        #         ds_concat.append(ds_single_element_end)

        #     data = xr_concat(ds_concat, dim="time")

        #     # Now sort to get everything in the right order
        #     data = data.sortby("time")

        # First do a mean resample on all variables
        ds_resampled = data.resample(time=average).mean(skipna=False, keep_attrs=True)
        # keep_attrs doesn't seem to work for some reason, so manually copy
        ds_resampled.attrs = data.attrs.copy()

        average_in_seconds = Timedelta(average).total_seconds()
        ds_resampled.attrs["averaged_period"] = average_in_seconds
        ds_resampled.attrs["averaged_period_str"] = average

        # For some variables, need a different type of resampling
        data_variables: List[str] = [str(v) for v in data.variables]

        for var in data_variables:
            if "repeatability" in var:
                ds_resampled[var] = np.sqrt(
                    (data[var] ** 2).resample(time=average).sum()
                ) / data[var].resample(time=average).count()

            # Copy over some attributes
            if "long_name" in data[var].attrs:
                ds_resampled[var].attrs["long_name"] = data[var].attrs[
                    "long_name"]

            if "units" in data[var].attrs:
                ds_resampled[var].attrs["units"] = data[var].attrs["units"]

        # Create a new variability variable, containing the standard deviation within the resampling period
        ds_resampled[f"{species}_variability"] = (data[species].resample(
            time=average).std(skipna=False, keep_attrs=True))

        # If there are any periods where only one measurement was resampled, just use the median variability
        ds_resampled[f"{species}_variability"][
            ds_resampled[f"{species}_variability"] ==
            0.0] = ds_resampled[f"{species}_variability"].median()

        # Create attributes for variability variable
        ds_resampled[f"{species}_variability"].attrs[
            "long_name"] = f"{data.attrs['long_name']}_variability"

        ds_resampled[f"{species}_variability"].attrs["units"] = data[
            species].attrs["units"]

        # Resampling may introduce NaNs, so remove, if not keep_missing
        if keep_missing is False:
            ds_resampled = ds_resampled.dropna(dim="time")

        data = ds_resampled

    # Rename variables
    rename: Dict[str, str] = {}

    data_variables = [str(v) for v in data.variables]
    for var in data_variables:
        if var.lower() == species.lower():
            rename[var] = "mf"
        if "repeatability" in var:
            rename[var] = "mf_repeatability"
        if "variability" in var:
            rename[var] = "mf_variability"
        if "number_of_observations" in var:
            rename[var] = "mf_number_of_observations"
        if "status_flag" in var:
            rename[var] = "status_flag"
        if "integration_flag" in var:
            rename[var] = "integration_flag"

    data = data.rename_vars(rename)  # type: ignore

    data.attrs["species"] = species

    if "calibration_scale" in data.attrs:
        data.attrs["scale"] = data.attrs.pop("calibration_scale")

    if calibration_scale is not None:
        data = _scale_convert(data, species, calibration_scale)

    metadata = retrieved_data.metadata
    metadata.update(data.attrs)

    obs_data = ObsData(data=data, metadata=metadata)

    # It doesn't make sense to do this now as we've only got a single Dataset
    # # Now check if the units match for each of the observation Datasets
    # units = set((f.data.mf.attrs["units"] for f in obs_files))
    # scales = set((f.data.attrs["scale"] for f in obs_files))

    # if len(units) > 1:
    #     raise ValueError(
    #         f"Units do not match for these observation Datasets {[(f.mf.attrs['station_long_name'],f.attrs['units']) for f in obs_files]}"
    #     )

    # if len(scales) > 1:
    #     print(
    #         f"Scales do not match for these observation Datasets {[(f.mf.attrs['station_long_name'],f.attrs['units']) for f in obs_files]}"
    #     )
    #     print("Suggestion: set calibration_scale to convert scales")

    return obs_data
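
A minimal usage sketch, assuming an object store that already holds CH4 surface observations for the chosen site (site code, inlet and dates are illustrative; the import path is an assumption):

    from openghg.retrieve import get_obs_surface  # assumed import path

    obs = get_obs_surface(
        site="mhd",
        species="ch4",
        inlet="10m",
        start_date="2015-01-01",
        end_date="2015-12-31",
        average="1D",  # daily means, pandas offset alias
    )
    print(obs.data.mf)  # the species variable is renamed to "mf" above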
Example #5
    def retrieve(
        self, site: Optional[str] = None, species: Optional[str] = None, inlet: Optional[str] = None
    ) -> Union[Dict[str, ObsData], ObsData]:
        """Retrieve some or all of the data found in the object store.

        Args:
            site: Three letter site code
            species: Species name
            inlet: Inlet height
        Returns:
            ObsData or dict
        """
        site = clean_string(site)
        species = clean_string(species)
        inlet = clean_string(inlet)

        # If inlet is not specified, check if this is unambiguous
        # If so, set inlet to be the only value and continue.
        if inlet is None:
            try:
                potential_inlets = self.results[site][species].keys()
            except KeyError:
                pass
            else:
                if len(potential_inlets) == 1:
                    inlet = list(potential_inlets)[0]

        if self.ranked_data:
            if all((site, species, inlet)):
                # TODO - how to do this in a cleaner way?
                site = str(site)
                species = str(species)
                inlet = str(inlet)
                return self._create_obsdata(site=site, species=species, inlet=inlet)

            results = {}
            if site is not None and species is not None:
                try:
                    _ = self.results[site][species]["keys"]
                except KeyError:
                    raise KeyError(f"Unable to find data keys for {species} at {site}.")

                return self._create_obsdata(site=site, species=species)

            # Get the data for all the species at that site
            if site is not None and species is None:
                for sp in self.results[site]:
                    key = "_".join((site, sp))
                    results[key] = self._create_obsdata(site=site, species=sp)

                return results

            # Get the data for this species from all the sites
            if site is None and species is not None:
                for a_site in self.results:
                    key = "_".join((a_site, species))

                    try:
                        results[key] = self._create_obsdata(site=a_site, species=species)
                    except KeyError:
                        pass

                return results

            for a_site, species_list in self.results.items():
                for sp in species_list:
                    key = "_".join((a_site, sp))
                    results[key] = self._create_obsdata(site=a_site, species=sp)

            return results
        else:
            # if len(self.results) == 1 and not all((species, inlet)):
            #     raise ValueError("Please pass species and inlet")
            if not all((species, site, inlet)):
                raise ValueError("Please pass site, species and inlet")

            # TODO - how to do this in a cleaner way for mypy?
            site = str(site)
            species = str(species)
            inlet = str(inlet)
            return self._create_obsdata(site=site, species=species, inlet=inlet)
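
A minimal usage sketch showing how `retrieve` is called on the results returned by `search` (as in the previous example); the site, species and inlet values are illustrative:

    from openghg.retrieve import search  # assumed import path

    results = search(site="tac", species="ch4")
    # With site, species and inlet all given, a single ObsData object is returned
    obs = results.retrieve(site="tac", species="ch4", inlet="100m")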
Example #6
File: _crds.py  Project: openghg/openghg
def _read_data(
    data_filepath: Path,
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Read the datafile passed in and extract the data we require.

    Args:
        data_filepath: Path to file
        site: Three letter site code
        network: Network name
        inlet: Inlet height
        instrument: Instrument name
        sampling_period: Sampling period including the unit (using pandas frequency aliases like '1H' or '1min')
        measurement_type: Measurement type e.g. insitu, flask
    Returns:
        dict: Dictionary of gas data
    """
    from datetime import datetime
    from pandas import RangeIndex, Timedelta, read_csv, NaT
    import warnings
    from openghg.util import clean_string, load_json

    split_fname = data_filepath.stem.split(".")
    site = site.lower()

    try:
        site_fname = clean_string(split_fname[0])
        inlet_fname = clean_string(split_fname[3])
    except IndexError:
        raise ValueError(
            "Error reading metadata from filename, we expect a form hfd.picarro.1minute.100m.dat"
        )

    if site_fname != site:
        raise ValueError(
            "Site mismatch between passed site code and that read from filename."
        )

    if "m" not in inlet_fname:
        raise ValueError(
            "No inlet found, we expect filenames such as: bsd.picarro.1minute.108m.dat"
        )

    if inlet is not None and inlet != inlet_fname:
        raise ValueError(
            "Inlet mismatch between passed inlet and that read from filename.")
    else:
        inlet = inlet_fname

    # Function to parse the datetime format found in the datafile
    def parse_date(date: str):  # type: ignore
        try:
            return datetime.strptime(date, "%y%m%d %H%M%S")
        except ValueError:
            return NaT

    # Catch dtype warnings
    # TODO - look at setting dtypes - read header and data separately?
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        data = read_csv(
            data_filepath,
            header=None,
            skiprows=1,
            sep=r"\s+",
            index_col=["0_1"],
            parse_dates=[[0, 1]],
            date_parser=parse_date,
        )

    data.index.name = "time"

    # Drop any rows with NaNs
    # This is now done before creating metadata
    data = data.dropna(axis="rows", how="any")

    # Get the number of gases in dataframe and number of columns of data present for each gas
    n_gases, n_cols = _gas_info(data=data)

    header = data.head(2)
    skip_cols = sum([header[column][0] == "-" for column in header.columns])

    metadata = _read_metadata(filepath=data_filepath, data=data)

    if network is not None:
        metadata["network"] = network

    if sampling_period is not None:
        # Compare against value extracted from the file name
        file_sampling_period = Timedelta(seconds=metadata["sampling_period"])

        comparison_seconds = abs(Timedelta(sampling_period) - file_sampling_period).total_seconds()
        tolerance_seconds = 1

        if comparison_seconds > tolerance_seconds:
            raise ValueError(
                f"Input sampling period {sampling_period} does not match to value "
                f"extracted from the file name of {metadata['sampling_period']} seconds."
            )

    # Read the scale from JSON
    # I'll leave this here for the possible future movement from class to functions
    network_metadata = load_json(filename="process_gcwerks_parameters.json")
    crds_metadata = network_metadata["CRDS"]

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    for n in range(n_gases):
        # Slice the columns
        gas_data = data.iloc[:, skip_cols + n * n_cols : skip_cols + (n + 1) * n_cols]

        # Reset the column numbers
        gas_data.columns = RangeIndex(gas_data.columns.size)
        species = gas_data[0][0]
        species = species.lower()

        column_labels = [
            species,
            f"{species}_variability",
            f"{species}_number_of_observations",
        ]

        # Name columns
        gas_data = gas_data.set_axis(column_labels, axis="columns", inplace=False)

        header_rows = 2
        # Drop the first two rows now we have the name
        gas_data = gas_data.drop(index=gas_data.head(header_rows).index, inplace=False)
        # Cast data to float64 / double
        gas_data = gas_data.astype("float64")

        # Here we can convert the Dataframe to a Dataset and then write the attributes
        gas_data = gas_data.to_xarray()

        site_attributes = _get_site_attributes(site=site, inlet=inlet, crds_metadata=crds_metadata)

        scale = crds_metadata["default_scales"].get(species.upper(), "NA")

        # Create a copy of the metadata dict
        species_metadata = metadata.copy()
        species_metadata["species"] = clean_string(species)
        species_metadata["inlet"] = inlet
        species_metadata["scale"] = scale
        species_metadata["long_name"] = site_attributes["long_name"]

        combined_data[species] = {
            "metadata": species_metadata,
            "data": gas_data,
            "attributes": site_attributes,
        }

    return combined_data
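
A minimal usage sketch for the private parser above; the filename is hypothetical but follows the documented hfd.picarro.1minute.100m.dat pattern:

    from pathlib import Path

    combined = _read_data(
        data_filepath=Path("hfd.picarro.1minute.100m.dat"),  # hypothetical file
        site="hfd",
        network="decc",
    )
    # combined maps each species (e.g. "ch4") to its data, metadata and site attributes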
Example #7
def get_attributes(
    ds: Dataset,
    species: str,
    site: str,
    network: Optional[str] = None,
    global_attributes: Optional[Dict[str, str]] = None,
    units: Optional[str] = None,
    scale: Optional[str] = None,
    sampling_period: Optional[str] = None,
    date_range: Optional[List[str]] = None,
) -> Dataset:
    """
    This function writes attributes to an xarray.Dataset so that they conform with
    the CF Convention v1.6

    Attributes of the xarray DataSet are modified, and variable names are changed

    If the species is a standard mole fraction then either:
        - species name will used in lower case in the file and variable names
            but with any hyphens taken out
        - name will be changed according to the species_translator dictionary

    If the species is isotopic data or a non-standard variable (e.g. APO):
        - Isotopes species names should begin with a "D"
            (Annoyingly, the code currently picks up "Desflurane" too. I've
             fixed this for now, but if we get a lot of other "D" species, we
             should make this better)
        - I suggest naming for isotopologues should be d<species><isotope>, e.g.
            dCH4C13, or dCO2C14
        - Any non-standard variables should be listed in the species_translator
            dictionary

    Args:
        ds: Should contain variables such as "ch4", "ch4 repeatability".
            Must have a "time" dimension.
        species: Species name. e.g. "CH4", "HFC-134a", "dCH4C13"
        site: Three-letter site code
        network: Network site is associated with
        global_attributes: Dictionary containing any info you want to
            add to the file header (e.g. {"Contact": "Contact_Name"})
        units: This routine will try to guess the units
            unless this is specified. Options are in units_interpret
        scale: Calibration scale for species.
        sampling_period: Number of seconds for which air
            sample is taken. Only for time variable attribute
        date_range: Start and end date for output
            If you only want an end date, just put a very early start date
            (e.g. ["1900-01-01", "2010-01-01"])
    """
    from pandas import Timestamp as pd_Timestamp
    from openghg.util import clean_string, load_json, timestamp_now

    # from numpy import unique as np_unique

    if not isinstance(ds, Dataset):
        raise TypeError("This function only accepts xarray Datasets")

    # Current CF Conventions (v1.7) demand that valid variable names
    # begin with a letter and be composed of letters, digits and underscores
    # Here variable names are also made lowercase to enable easier matching below

    # TODO - could I just cast ds.variables as as type for mypy instead of doing this?
    # variable_names = [str(v) for v in ds.variables]
    # Is this better?
    variable_names = cast(Dict[str, Any], ds.variables)
    to_underscores = {var: var.lower().replace(" ", "_") for var in variable_names}
    ds = ds.rename(to_underscores)  # type: ignore

    species_attrs = load_json(filename="species_attributes.json")
    attributes_data = load_json("attributes.json")

    species_translator = attributes_data["species_translation"]
    unit_species = attributes_data["unit_species"]
    unit_species_long = attributes_data["unit_species_long"]
    unit_interpret = attributes_data["unit_interpret"]

    species_upper = species.upper()
    species_lower = species.lower()

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var]

    # If we don't have any variables to rename, raise an error
    if not matched_keys:
        raise NameError(f"Cannot find species {species} in Dataset variables")

    species_rename = {}
    for var in matched_keys:
        try:
            species_label = species_translator[species_upper]["chem"]
        except KeyError:
            species_label = clean_string(species_lower)

        species_rename[var] = var.replace(species_lower, species_label)

    ds = ds.rename(species_rename)  # type: ignore

    # Global attributes
    global_attributes_default = {
        "conditions_of_use": "Ensure that you contact the data owner at the outset of your project.",
        "source": "In situ measurements of air",
        "Conventions": "CF-1.6",
    }

    if global_attributes is not None:
        # TODO - for some reason mypy doesn't see a Dict[str,str] as a valid Mapping[Hashable, Any] type
        global_attributes.update(global_attributes_default)  # type: ignore
    else:
        global_attributes = global_attributes_default

    global_attributes["file_created"] = str(timestamp_now())
    global_attributes["processed_by"] = "OpenGHG_Cloud"
    global_attributes["species"] = species_label

    if scale is None:
        global_attributes["calibration_scale"] = "unknown"
    else:
        global_attributes["calibration_scale"] = scale

    # Update the Dataset attributes
    ds.attrs.update(global_attributes)  # type: ignore

    # Add some site attributes
    site_attributes = _site_info_attributes(site.upper(), network)
    ds.attrs.update(site_attributes)

    # Species-specific attributes
    # Long name
    if (species_upper.startswith("D") and species_upper != "DESFLURANE") or species_upper == "APD":
        sp_long = species_translator[species_upper]["name"]
    elif species_upper == "RN":
        sp_long = "radioactivity_concentration_of_222Rn_in_air"
    elif species_upper in species_translator:
        name = species_translator[species_upper]["name"]
        sp_long = f"mole_fraction_of_{name}_in_air"
    else:
        sp_long = f"mole_fraction_of_{species_label}_in_air"

    ancillary_variables = []

    variable_names = cast(Dict[str, Any], ds.variables)
    matched_keys = [var for var in variable_names if species_lower in var.lower()]

    # Write units as attributes to variables containing any of these
    match_words = ["variability", "repeatability", "stdev", "count"]

    for key in variable_names:
        key = key.lower()

        if species_label.lower() in key:
            # Standard name attribute
            # ds[key].attrs["standard_name"]=key.replace(species_label, sp_long)
            ds[key].attrs["long_name"] = key.replace(species_label, sp_long)

            # If units are required for variable, add attribute
            if key == species_label or any(word in key for word in match_words):
                if units is not None:
                    if units in unit_interpret:
                        ds[key].attrs["units"] = unit_interpret[units]
                    else:
                        ds[key].attrs["units"] = unit_interpret["else"]
                else:
                    # TODO - merge these species attributes into a single simpler JSON
                    try:
                        ds[key].attrs["units"] = unit_species[species_upper]
                    except KeyError:
                        try:
                            ds[key].attrs["units"] = species_attrs[species_label.upper()]["units"]
                        except KeyError:
                            ds[key].attrs["units"] = "NA"

                # If units are non-standard, add explanation
                if species_upper in unit_species_long:
                    ds[key].attrs["units_description"] = unit_species_long[species_upper]

            # Add to list of ancillary variables
            if key != species_label:
                ancillary_variables.append(key)

    # TODO - for the moment skip this step - check status of ancillary variables in standard
    # Write ancillary variable list
    # ds[species_label].attrs["ancillary_variables"] = ", ".join(ancillary_variables)

    # Add quality flag attributes
    # NOTE - I've removed the whitespace before status_flag and integration_flag here
    variable_names = cast(Dict[str, Any], ds.variables)
    quality_flags = [key for key in variable_names if "status_flag" in key]

    # Not getting long_name for c2f6

    for key in quality_flags:
        ds[key] = ds[key].astype(int)
        try:
            long_name = ds[species_label].attrs["long_name"]
        except KeyError:
            raise KeyError(key, quality_flags)

        ds[key].attrs = {
            "flag_meaning": "0 = unflagged, 1 = flagged",
            "long_name": f"{long_name} status_flag",
        }

    variable_names = cast(Dict[str, Any], ds.variables)
    # Add integration flag attributes
    integration_flags = [key for key in variable_names if "integration_flag" in key]

    for key in integration_flags:
        ds[key] = ds[key].astype(int)
        long_name = ds[species_label].attrs["long_name"]
        ds[key].attrs = {
            "flag_meaning": "0 = area, 1 = height",
            "standard_name": f"{long_name} integration_flag",
            "comment": "GC peak integration method (by height or by area). Does not indicate data quality",
        }

    # Set time encoding
    # Check if there are duplicate time stamps

    # I feel there should be a more pandas way of doing this
    # but xarray doesn't currently have a duplicates method
    # See this https://github.com/pydata/xarray/issues/2108

    # if len(set(ds.time.values)) < len(ds.time.values):
    # if len(np_unique(ds.time.values)) < len(ds.time.values):
    #     print("WARNING. Duplicate time stamps")
    first_year = pd_Timestamp(str(ds.time[0].values)).year

    ds.time.encoding = {"units": f"seconds since {str(first_year)}-01-01 00:00:00"}

    time_attributes: Dict[str, str] = {}
    time_attributes["label"] = "left"
    time_attributes["standard_name"] = "time"
    time_attributes["comment"] = (
        "Time stamp corresponds to beginning of sampling period. "
        + "Time since midnight UTC of reference date. "
        + "Note that sampling periods are approximate."
    )

    if sampling_period is not None:
        time_attributes["sampling_period_seconds"] = sampling_period

    ds.time.attrs.update(time_attributes)

    # If a date range is specified, slice dataset
    if date_range:
        ds = ds.loc[dict(time=slice(*date_range))]

    return ds
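
A minimal, self-contained sketch of calling get_attributes on a synthetic Dataset (the values, site and network codes are illustrative):

    import numpy as np
    import pandas as pd
    import xarray as xr

    ds = xr.Dataset(
        {
            "ch4": ("time", np.array([1900.0, 1905.0, 1910.0])),
            "ch4 repeatability": ("time", np.array([1.0, 1.2, 0.9])),
        },
        coords={"time": pd.date_range("2019-01-01", periods=3, freq="1H")},
    )

    ds = get_attributes(ds=ds, species="ch4", site="TAC", network="DECC", sampling_period="60")
    print(ds["ch4"].attrs)  # long_name and units added above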
Example #8
def parse_gcwerks(
    data_filepath: Union[str, Path],
    precision_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads a GC data file by creating a GC object and associated datasources

    Args:
        data_filepath: Path of data file
        precision_filepath: Path of precision file
        site: Three letter code or name for site
        instrument: Instrument name
        network: Network name
    Returns:
        dict: Dictionary of source_name : UUIDs
    """
    from pathlib import Path
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    data_filepath = Path(data_filepath)
    precision_filepath = Path(precision_filepath)

    # Do some setup for processing
    # Load site data
    gcwerks_data = load_json(filename="process_gcwerks_parameters.json")
    gc_params = gcwerks_data["GCWERKS"]

    network = clean_string(network)
    # We don't currently do anything with inlet here as it's always read from data
    # or taken from process_gcwerks_parameters.json
    if inlet is not None:
        inlet = clean_string(inlet)
    if instrument is not None:
        instrument = clean_string(instrument)

    # Check if the site code passed matches that read from the filename
    site = _check_site(
        filepath=data_filepath,
        site_code=site,
        gc_params=gc_params,
    )

    # If we're not passed the instrument name and we can't find it raise an error
    if instrument is None:
        instrument = _check_instrument(filepath=data_filepath, gc_params=gc_params, should_raise=True)
    else:
        fname_instrument = _check_instrument(filepath=data_filepath, gc_params=gc_params, should_raise=False)

        if fname_instrument is not None and instrument != fname_instrument:
            raise ValueError(
                f"Mismatch between instrument passed as argument {instrument} and instrument read from filename {fname_instrument}"
            )

    instrument = str(instrument)

    gas_data = _read_data(
        data_filepath=data_filepath,
        precision_filepath=precision_filepath,
        site=site,
        instrument=instrument,
        network=network,
        sampling_period=sampling_period,
        gc_params=gc_params,
    )

    # Assign attributes to the data for CF compliant NetCDFs
    gas_data = assign_attributes(data=gas_data, site=site)

    return gas_data
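
A minimal usage sketch; both filenames are hypothetical but follow the GCWerks naming style:

    gas_data = parse_gcwerks(
        data_filepath="capegrim-medusa.18.C",                  # hypothetical data file
        precision_filepath="capegrim-medusa.18.precisions.C",  # hypothetical precision file
        site="cgo",
        network="agage",
    )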
Example #9
def _read_raw_data(
    data_filepath: Path,
    species: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str = "flask",
) -> Dict:
    """Separates the gases stored in the dataframe in
    separate dataframes and returns a dictionary of gases
    with an assigned UUID as gas:UUID and a list of the processed
    dataframes

    Args:
        data_filepath: Path of datafile
        species: Species string such as CH4, CO
        measurement_type: Type of measurements e.g. flask
    Returns:
        dict: Dictionary containing attributes, data and metadata keys
    """
    from openghg.util import clean_string, read_header, load_json
    from pandas import read_csv, Timestamp

    header = read_header(filepath=data_filepath)

    column_names = header[-1][14:].split()

    def date_parser(year: str, month: str, day: str, hour: str, minute: str, second: str) -> Timestamp:
        return Timestamp(year, month, day, hour, minute, second)

    date_parsing = {
        "time": [
            "sample_year",
            "sample_month",
            "sample_day",
            "sample_hour",
            "sample_minute",
            "sample_seconds",
        ]
    }

    data_types = {
        "sample_year": int,
        "sample_month": int,
        "sample_day": int,
        "sample_hour": int,
        "sample_minute": int,
        "sample_seconds": int,
    }

    # Number of header lines to skip
    n_skip = len(header)

    data = read_csv(
        data_filepath,
        skiprows=n_skip,
        names=column_names,
        sep=r"\s+",
        dtype=data_types,
        parse_dates=date_parsing,
        date_parser=date_parser,
        index_col="time",
        skipinitialspace=True,
    )

    # Drop duplicates
    data = data.loc[~data.index.duplicated(keep="first")]

    # Check if the index is sorted
    if not data.index.is_monotonic_increasing:
        data = data.sort_index()

    # Read the site code from the Dataframe
    site = str(data["sample_site_code"][0]).upper()

    site_data = load_json("acrg_site_info.json")
    # If this isn't a site we recognize try and read it from the filename
    if site not in site_data:
        site = str(data_filepath.name).split("_")[1].upper()

        if site not in site_data:
            raise ValueError(f"The site {site} is not recognized.")

    if species is not None:
        # If we're passed a species ensure that it is in fact the correct species
        data_species = str(data["parameter_formula"].values[0]).lower()

        passed_species = species.lower()
        if data_species != passed_species:
            raise ValueError(
                f"Mismatch between passed species ({passed_species}) and species read from data ({data_species})"
            )

    species = species.upper()

    flag = []
    selection_flag = []
    for flag_str in data.analysis_flag:
        flag.append(flag_str[0] == ".")
        selection_flag.append(int(flag_str[1] != "."))

    combined_data = {}

    data[species + "_status_flag"] = flag
    data[species + "_selection_flag"] = selection_flag

    data = data[data[species + "_status_flag"]]

    data = data[[
        "sample_latitude",
        "sample_longitude",
        "sample_altitude",
        "analysis_value",
        "analysis_uncertainty",
        species + "_selection_flag",
    ]]

    rename_dict = {
        "analysis_value": species,
        "analysis_uncertainty": species + "_repeatability",
        "sample_longitude": "longitude",
        "sample_latitude": "latitude",
        "sample_altitude": "altitude",
    }

    data = data.rename(columns=rename_dict, inplace=False)
    data = data.to_xarray()

    # TODO  - this could do with a better name
    noaa_params = load_json("attributes.json")["NOAA"]

    site_attributes = noaa_params["global_attributes"]
    site_attributes["inlet_height_magl"] = "NA"
    site_attributes["instrument"] = noaa_params["instrument"][species.upper()]
    site_attributes["sampling_period"] = sampling_period

    metadata = {}
    metadata["species"] = clean_string(species)
    metadata["site"] = site
    metadata["measurement_type"] = measurement_type
    metadata["network"] = "NOAA"
    metadata["inlet"] = inlet
    metadata["sampling_period"] = sampling_period
    metadata["instrument"] = noaa_params["instrument"][species.upper()]

    combined_data[species.lower()] = {
        "metadata": metadata,
        "data": data,
        "attributes": site_attributes,
    }

    return combined_data
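
A minimal usage sketch for the NOAA flask parser above (the filename and inlet value are illustrative):

    from pathlib import Path

    combined = _read_raw_data(
        data_filepath=Path("ch4_esp_surface-flask_1_ccgg_event.txt"),  # hypothetical file
        species="ch4",
        inlet="flask",
        sampling_period="NOT_SET",
    )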
Example #10
def _read_obspack(
    data_filepath: Union[str, Path],
    site: str,
    inlet: str,
    sampling_period: str,
    measurement_type: str,
    instrument: Optional[str] = None,
) -> Dict[str, Dict]:
    """Read NOAA ObsPack NetCDF files

    Args:
        data_filepath: Path to file
        site: Three letter site code
        inlet: Inlet height, if no height use measurement type e.g. flask
        measurement_type: One of flask, insitu or pfp
        instrument: Instrument name
        sampling_period: Sampling period
    Returns:
        dict: Dictionary of results
    """
    import xarray as xr
    from openghg.util import clean_string
    from openghg.standardise.meta import assign_attributes

    valid_types = ("flask", "insitu", "pfp")

    if measurement_type not in valid_types:
        raise ValueError(f"measurement_type must be one of {valid_types}")

    obspack_ds = xr.open_dataset(data_filepath)
    # orig_attrs = obspack_ds.attrs

    # Want to find and drop any duplicate time values for the original dataset
    # Using xarray directly we have to do in a slightly convoluted way as this is not well built
    # into the xarray workflow yet - https://github.com/pydata/xarray/pull/5239
    # - can use da.drop_duplicates() but only on one variable at a time and not on the whole Dataset
    # This method keeps attributes for each of the variables including units

    # The dimension within the original dataset is called "obs" and has no associated coordinates
    # Extract time from original Dataset (dimension is "obs")
    time = obspack_ds.time

    # To keep associated "obs" dimension, need to assign coordinate values to this (just 0, len(obs))
    time = time.assign_coords(obs=obspack_ds.obs)

    # Make "time" the primary dimension (while retaining "obs") and add "time" values as coordinates
    time = time.swap_dims(dims_dict={"obs": "time"})
    time = time.assign_coords(time=time)

    # Drop any duplicate time values and extract the associated "obs" values
    # TODO: Work out what to do with duplicates - may be genuine multiple measurements
    time_unique = time.drop_duplicates(dim="time", keep="first")
    obs_unique = time_unique.obs

    # Estimate sampling period using metadata and midpoint time
    if sampling_period == "NOT_SET":
        sampling_period_estimate = _estimate_sampling_period(obspack_ds)
    else:
        sampling_period_estimate = -1.0

    species = clean_string(obspack_ds.attrs["dataset_parameter"])
    network = "NOAA"

    # Use these obs values to filter the original dataset to remove any repeated times
    processed_ds = obspack_ds.sel(obs=obs_unique)
    processed_ds = processed_ds.set_coords(["time"])

    # Rename variables to match our internal standard
    # "value_std_dev" --> f"{species}_variability"
    # "value_unc" --> ??
    # TODO: Clarify what "value_unc" should be renamed to

    variable_names = {
        "value": species,
        "value_std_dev": f"{species}_variability",
        "value_unc": f"{species}_variability",  # May need to be updated
        "nvalue": f"{species}_number_of_observations"
    }

    to_extract = [name for name in variable_names if name in processed_ds]
    name_dict = {name: key for name, key in variable_names.items() if name in to_extract}

    if not to_extract:
        wanted = variable_names.keys()
        raise ValueError(
            f"No valid data columns found in converted DataFrame. We expect the following data variables in the passed NetCDF: {wanted}"
        )

    processed_ds = processed_ds[to_extract]
    processed_ds = processed_ds.rename(name_dict)
    processed_ds = processed_ds.sortby("time")

    try:
        # Extract units attribute from value data variable
        units = processed_ds[species].units
    except (KeyError, AttributeError):
        print("Unable to extract units from 'value' within input dataset")
    else:
        if units == "mol mol-1":
            units = "1"
        elif units == "millimol mol-1":
            units = "1e-3"
        elif units == "micromol mol-1":
            units = "1e-6"
        elif units == "nmol mol-1":
            units = "1e-9"
        elif units == "pmol mol-1":
            units = "1e-12"
        else:
            print(f"Using unit {units} directly")
            # raise ValueError(f"Did not recognise input units from file: {units}")

    metadata = {}
    metadata["site"] = site
    metadata["inlet"] = inlet
    metadata["network"] = network
    metadata["measurement_type"] = measurement_type
    metadata["species"] = species
    metadata["units"] = units
    metadata["sampling_period"] = sampling_period

    if instrument is not None:
        metadata["instrument"] = instrument
    else:
        try:
            metadata["instrument"] = obspack_ds.attrs["instrument"]
        except KeyError:
            pass

    if sampling_period_estimate >= 0.0:
        metadata["sampling_period_estimate"] = str(
            sampling_period_estimate
        )  # convert to string to keep consistent with "sampling_period"

    # TODO: At the moment all attributes from the NOAA ObsPack are being copied
    # plus any variables we're adding.
    # - decide if we want to reduce this
    attributes = obspack_ds.attrs
    attributes["sampling_period"] = sampling_period
    if sampling_period_estimate >= 0.0:
        attributes["sampling_period_estimate"] = str(sampling_period_estimate)

    gas_data = {
        species: {
            "data": processed_ds,
            "metadata": metadata,
            "attributes": attributes
        }
    }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
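
A minimal usage sketch for the ObsPack reader above; the NetCDF filename is illustrative:

    gas_data = _read_obspack(
        data_filepath="ch4_esp_surface-flask_2_representative.nc",  # hypothetical ObsPack file
        site="esp",
        inlet="flask",
        sampling_period="NOT_SET",
        measurement_type="flask",
    )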
Example #11
def parse_cranfield(
    data_filepath: Union[str, Path],
    site: Optional[str] = None,
    network: Optional[str] = None,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Creates a CRDS object holding data stored within Datasources

    Args:
        filepath: Path of file to load
        data_filepath : Filepath of data to be read
        site: Name of site
        network: Name of network
    Returns:
        dict: Dictionary of gas data
    """
    from pandas import read_csv
    from openghg.util import clean_string

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)
    data = read_csv(data_filepath, parse_dates=["Date"], index_col="Date")

    data = data.rename(
        columns={
            "Methane/ppm": "ch4",
            "Methane stdev/ppm": "ch4 variability",
            "CO2/ppm": "co2",
            "CO2 stdev/ppm": "co2 variability",
            "CO/ppm": "co",
            "CO stdev/ppm": "co variability",
        }
    )
    data.index.name = "time"

    # Convert CH4 and CO to ppb
    data["ch4"] = data["ch4"] * 1e3
    data["ch4 variability"] = data["ch4 variability"] * 1e3
    data["co"] = data["co"] * 1e3
    data["co variability"] = data["co variability"] * 1e3

    metadata = {}
    metadata["site"] = "THB"
    metadata["instrument"] = "CRDS"
    metadata["sampling_period"] = str(sampling_period)
    metadata["height"] = "10magl"
    metadata["inlet"] = "10magl"
    metadata["network"] = "CRANFIELD"

    # TODO - this feels fragile
    species: List[str] = [col for col in data.columns if " " not in col]

    combined_data = {}
    # Number of columns of data for each species
    n_cols = 2

    for n, sp in enumerate(species):
        # for sp in species:
        # Create a copy of the metadata dict
        species_metadata = metadata.copy()
        species_metadata["species"] = str(clean_string(sp))

        # Here we don't want to match the co in co2
        # For now we'll just have 2 columns for each species
        # cols = [col for col in data.columns if sp in col]
        gas_data = data.iloc[:, n * n_cols : (n + 1) * n_cols]

        # Convert from a pandas DataFrame to an xarray Dataset
        gas_data = gas_data.to_xarray()

        combined_data[sp] = {"metadata": species_metadata, "data": gas_data}

    return combined_data
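
A hedged usage sketch for the parse_cranfield function defined above; the CSV filename is a placeholder, not a real file:

# Hypothetical call to the parser defined above
gas_data = parse_cranfield(data_filepath="thb_crds_data.csv")

for species, entry in gas_data.items():
    print(species, entry["metadata"]["species"], entry["data"])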
Example #12
0
File: _npl.py Project: openghg/openghg
def parse_npl(
    data_filepath: pathType,
    site: str = "NPL",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns the UUIDS of the Datasources
    the processed data has been assigned to

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        list: UUIDs of Datasources data has been assigned to
    """

    from datetime import datetime
    from pathlib import Path
    from pandas import NaT, read_csv
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_json

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)

    site = "NPL"

    attributes_data = load_json(filename="attributes.json")
    npl_params = attributes_data["NPL"]

    # mypy doesn't like NaT or NaNs - look into this
    def parser(date: str):  # type: ignore
        try:
            return datetime.strptime(str(date), "%d/%m/%Y %H:%M")
        except ValueError:
            return NaT

    data = read_csv(data_filepath, index_col=0, date_parser=parser)

    # Drop the NaT/NaNs
    data = data.loc[data.index.dropna()]

    # Rename columns
    rename_dict = {"Cal_CO2_dry": "CO2", "Cal_CH4_dry": "CH4"}

    data = data.rename(columns=rename_dict)
    data.index.name = "time"

    if inlet is None:
        inlet = "NA"

    gas_data = {}
    for species in data.columns:
        processed_data = data.loc[:, [species]].sort_index().to_xarray()

        # Convert methane to ppb
        if species == "CH4":
            processed_data[species] *= 1000

        # No averaging applied to raw obs, set variability to 0 to allow get_obs to calculate
        # when averaging
        processed_data["{} variability".format(
            species)] = processed_data[species] * 0.0

        site_attributes = npl_params["global_attributes"]
        site_attributes["inlet_height_magl"] = npl_params["inlet"]
        site_attributes["instrument"] = npl_params["instrument"]

        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "NPL",
            "network": "LGHG",
            "inlet": inlet,
        }

        # TODO - add in better metadata reading
        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
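
The custom `parser` above feeds pandas' `date_parser` argument, which is deprecated in recent pandas releases. A minimal sketch of an equivalent approach using `to_datetime`; this is an assumed refactor, not the project's code:

# Assumed alternative to the date_parser callback used above
from pandas import read_csv, to_datetime

data = read_csv(data_filepath, index_col=0)
# errors="coerce" converts unparseable entries to NaT, matching the NaT fallback above
data.index = to_datetime(data.index, format="%d/%m/%Y %H:%M", errors="coerce")
data = data.loc[data.index.dropna()]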
Example #13
0
    def read_file(
        filepath: multiPathType,
        data_type: str,
        network: str,
        site: str,
        inlet: Optional[str] = None,
        instrument: Optional[str] = None,
        sampling_period: Optional[str] = None,
        measurement_type: str = "insitu",
        overwrite: bool = False,
    ) -> Dict:
        """Process files and store in the object store. This function
            utilises the process functions of the other classes in this submodule
            to handle each data type.

        Args:
            filepath: Filepath(s)
            data_type: Data type, for example CRDS, GCWERKS
            site: Site code/name
            network: Network name
            inlet: Inlet height. If processing multiple files pass None and OpenGHG will attempt to
                read the inlets from the data.
            instrument: Instrument name
            sampling_period: Sampling period in pandas style (e.g. 2H for 2 hour period, 2m for 2 minute period).
            measurement_type: Type of measurement e.g. insitu, flask
            overwrite: Overwrite previously uploaded data
        Returns:
            dict: Dictionary of Datasource UUIDs
        """
        from collections import defaultdict
        from pathlib import Path
        from pandas import Timedelta
        import sys
        from tqdm import tqdm
        from openghg.util import load_surface_parser, hash_file, clean_string, verify_site
        from openghg.types import SurfaceTypes
        from openghg.store import assign_data

        if not isinstance(filepath, list):
            filepath = [filepath]

        try:
            data_type = SurfaceTypes[data_type.upper()].value
        except KeyError:
            raise ValueError(f"Unknown data type {data_type} selected.")

        # Test that the passed values are valid
        # Check validity of site, instrument, inlet etc in acrg_site_info.json
        # Clean the strings
        site = verify_site(site=site)

        network = clean_string(network)
        inlet = clean_string(inlet)
        instrument = clean_string(instrument)
        sampling_period = clean_string(sampling_period)

        sampling_period_seconds: Union[str, None] = None
        # If we have a sampling period passed we want the number of seconds
        if sampling_period is not None:
            sampling_period_seconds = str(Timedelta(sampling_period).total_seconds())

        # Load the data retrieve object
        parser_fn = load_surface_parser(data_type=data_type)

        obs = ObsSurface.load()

        results: resultsType = defaultdict(dict)

        # Create a progress bar object using the filepaths, iterate over this below
        with tqdm(total=len(filepath), file=sys.stdout) as progress_bar:
            for fp in filepath:
                if data_type == "GCWERKS":
                    try:
                        data_filepath = Path(fp[0])
                        precision_filepath = Path(fp[1])
                    except ValueError:
                        raise ValueError("For GCWERKS data both data and precision filepaths must be given.")
                else:
                    data_filepath = Path(fp)

                # try:
                file_hash = hash_file(filepath=data_filepath)
                if file_hash in obs._file_hashes and overwrite is False:
                    raise ValueError(
                        f"This file has been uploaded previously with the filename : {obs._file_hashes[file_hash]}."
                    )

                progress_bar.set_description(f"Processing: {data_filepath.name}")

                if data_type == "GCWERKS":
                    data = parser_fn(
                        data_filepath=data_filepath,
                        precision_filepath=precision_filepath,
                        site=site,
                        network=network,
                        inlet=inlet,
                        instrument=instrument,
                        sampling_period=sampling_period_seconds,
                        measurement_type=measurement_type,
                    )
                else:
                    data = parser_fn(
                        data_filepath=data_filepath,
                        site=site,
                        network=network,
                        inlet=inlet,
                        instrument=instrument,
                        sampling_period=sampling_period_seconds,
                        measurement_type=measurement_type,
                    )

                # Extract the metadata for each set of measurements to perform a Datasource lookup
                metadata = {key: data["metadata"] for key, data in data.items()}

                lookup_results = obs.datasource_lookup(metadata=metadata)

                # Create Datasources, save them to the object store and get their UUIDs
                datasource_uuids = assign_data(
                    data_dict=data, lookup_results=lookup_results, overwrite=overwrite
                )

                results["processed"][data_filepath.name] = datasource_uuids

                # Record the Datasources we've created / appended to
                obs.add_datasources(datasource_uuids, metadata)

                # Store the hash as the key for easy searching, store the filename as well for
                # ease of checking by user
                obs._file_hashes[file_hash] = data_filepath.name
                # except Exception:
                #     results["error"][data_filepath.name] = traceback.format_exc()

                progress_bar.update(1)

        # Save this object back to the object store
        obs.save()

        return results
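
A hedged usage sketch for ObsSurface.read_file above. For GCWERKS each entry in the filepath list is a (data, precision) pair, while other data types take a plain path; all filenames below are hypothetical:

# Hypothetical GCWERKS call: each filepath entry is a (data, precision) pair
results = ObsSurface.read_file(
    filepath=[("capegrim-medusa.18.C", "capegrim-medusa.18.precisions.C")],
    data_type="GCWERKS",
    site="CGO",
    network="AGAGE",
)

# Hypothetical CRDS call: a single data file path is enough
results = ObsSurface.read_file(
    filepath="bsd.picarro.1minute.248m.dat",
    data_type="CRDS",
    site="BSD",
    network="DECC",
)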
Example #14
0
File: _btt.py Project: openghg/openghg
def parse_btt(
    data_filepath: Union[str, Path],
    site: Optional[str] = "BTT",
    network: Optional[str] = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dict:
    """Reads NPL data files and returns the UUIDS of the Datasources
    the processed data has been assigned to

    Args:
        data_filepath: Path of file to load
        site: Site name
    Returns:
        dict: Dictionary of gas data
    """
    from openghg.standardise.meta import assign_attributes
    from pandas import read_csv, Timestamp, to_timedelta, isnull
    from numpy import nan as np_nan
    from openghg.util import clean_string, load_json

    # TODO: Decide what to do about inputs which aren't used anywhere
    # at present - inlet, instrument, sampling_period, measurement_type

    data_filepath = Path(data_filepath)

    site = "BTT"

    # Rename these columns
    rename_dict = {"co2.cal": "CO2", "ch4.cal.ppb": "CH4"}
    # We only want these species
    species_extract = ["CO2", "CH4"]
    # Take std-dev measurements from these columns for these species
    species_sd = {"CO2": "co2.sd.ppm", "CH4": "ch4.sd.ppb"}

    param_data = load_json(filename="attributes.json")
    network_params = param_data["BTT"]

    sampling_period = int(network_params["sampling_period"])
    sampling_period_seconds = str(sampling_period) + "s"

    data = read_csv(data_filepath)
    data["time"] = Timestamp("2019-01-01 00:00") + to_timedelta(
        data["DOY"] - 1, unit="D")
    data["time"] = data["time"].dt.round(sampling_period_seconds)
    data = data[~isnull(data.time)]

    data = data.rename(columns=rename_dict)
    data = data.set_index("time")

    gas_data = {}
    for species in species_extract:
        processed_data = data.loc[:, [species]].sort_index()
        # Create a variability column from the matching std-dev column
        species_stddev_label = species_sd[species]
        processed_data[f"{species} variability"] = data[species_stddev_label]

        # Replace any values below zero with NaNs
        processed_data[processed_data < 0] = np_nan
        # Drop NaNs
        processed_data = processed_data.dropna()
        # Convert to a Dataset
        processed_data = processed_data.to_xarray()

        site_attributes = network_params["global_attributes"]
        site_attributes["inlet_height_magl"] = network_params["inlet"]
        site_attributes["instrument"] = network_params["instrument"]
        site_attributes["sampling_period"] = sampling_period

        # TODO - add in better metadata reading
        metadata = {
            "species": clean_string(species),
            "sampling_period": str(sampling_period),
            "site": "BTT",
        }

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = assign_attributes(data=gas_data, site=site, network=network)

    return gas_data
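
The timestamps in parse_btt are built from a fractional day-of-year (DOY) column relative to 2019-01-01. A small worked check of that conversion with an illustrative value:

# Illustrative DOY conversion check
from pandas import Timestamp, to_timedelta

doy = 32.5  # hypothetical fractional day of year
time = Timestamp("2019-01-01 00:00") + to_timedelta(doy - 1, unit="D")
print(time)  # 2019-02-01 12:00:00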
Example #15
0
def parse_beaco2n(
    data_filepath: Union[str, Path],
    site: str,
    network: str,
    inlet: str,
    instrument: Optional[str] = "shinyei",
    sampling_period: Optional[str] = None,
) -> Dict:
    """Read BEACO2N data files

    Args:
        data_filepath: Data filepath
        site: Site name
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
        sampling_period: Measurement sampling period
    Returns:
        dict: Dictionary of data
    """
    import pandas as pd
    from collections import defaultdict
    from openghg.util import clean_string, load_json

    if sampling_period is None:
        sampling_period = "NOT_SET"

    data_filepath = Path(data_filepath)
    datetime_columns = {"time": ["datetime"]}
    use_cols = [1, 5, 6, 7, 8, 9, 10]
    na_values = [-999.0]

    site = clean_string(site)

    try:
        data = pd.read_csv(
            data_filepath,
            index_col="time",
            usecols=use_cols,
            parse_dates=datetime_columns,
            na_values=na_values,
        )
    except ValueError as e:
        raise ValueError(
            f"Unable to read data file, please make sure it is in the standard BEACO2N format.\nError: {e}"
        )

    beaco2n_site_data = load_json("beaco2n_site_data.json")

    try:
        site_metadata = beaco2n_site_data[site.upper()]
    except KeyError:
        raise ValueError(f"Site {site} not recognized.")

    site_metadata["comment"] = "Retrieved from http://beacon.berkeley.edu/"

    # Check which columns we have in the data and build the rename dict
    possible_rename_cols = {
        "PM_ug/m3": "pm",
        "PM_ug/m3_QC_level": "pm_qc",
        "co2_ppm": "co2",
        "co2_ppm_QC_level": "co2_qc",
        "co_ppm": "co",
        "co_ppm_QC_level": "co_qc",
    }
    # Not all columns are present at every site, e.g. only some sites such as Glasgow have a CO column
    rename_cols = {k: v for k, v in possible_rename_cols.items() if k in data}
    data = data.rename(columns=rename_cols)

    # Read the columns available and make sure we have them to iterate over
    possible_measurement_types = ["pm", "co", "co2"]
    measurement_types = [c for c in possible_measurement_types if c in data]

    units = {"pm": "ug/m3", "co2": "ppm", "co": "ppm"}

    gas_data: DefaultDict[str, Dict[str, Union[Dataset, Dict]]] = defaultdict(dict)
    for mt in measurement_types:
        m_data = data[[mt, f"{mt}_qc"]]
        m_data = m_data.dropna(axis="rows", subset=[mt])

        # Some sites don't have data for each type, skip that type if all NaNs
        if m_data.index.empty:
            continue

        m_data = m_data.to_xarray()

        species_metadata = {
            "units": units[mt],
            "site": site,
            "species": clean_string(mt),
            "inlet": clean_string(inlet),
            "network": "beaco2n",
            "sampling_period": str(sampling_period),
            "instrument": instrument,
        }

        gas_data[mt]["data"] = m_data
        gas_data[mt]["metadata"] = species_metadata
        gas_data[mt]["attributes"] = site_metadata

    # TODO - add CF Compliant attributes?

    return gas_data
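
A hedged usage sketch for parse_beaco2n above; the filename, site and inlet are placeholders, and the site must exist in beaco2n_site_data.json:

# Hypothetical BEACO2N call
gas_data = parse_beaco2n(
    data_filepath="glasgow_beaco2n_node.csv",
    site="glasgow",
    network="BEACO2N",
    inlet="25m",
)
print(list(gas_data.keys()))  # e.g. ["pm", "co2", "co"]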
Example #16
0
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    the function and these keywords will be used to search the metadata associated
    with each Datasource.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
            If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
            If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        SearchResults or dict: Keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation

    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResult object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and species then just return the data
    # TODO - should instrument be added here?
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )
    # Now that we have the highest ranked data and the dateranges those ranks cover,
    # we want to fill in the gaps with (currently) the highest inlet from that site

    # We just want some rank_metadata to go along with the final data scheme,
    # keyed by daterange - inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
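
A hedged usage sketch of the keyword-driven search above; the argument values are illustrative rather than known stored data:

# Illustrative timeseries search, returns a SearchResults object
results = search(species="co2", site="bsd", inlet="248m", start_date="2016-01-01", end_date="2016-06-01")

# Illustrative footprints search, returns a plain dict keyed by Datasource UUID
fp_results = search(data_type="footprints", site="TMB", domain="EUROPE")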
Example #17
0
def _split_species(
    data: DataFrame,
    site: str,
    instrument: str,
    species: List,
    metadata: Dict,
    units: Dict,
    scale: Dict,
    gc_params: Dict,
) -> Dict:
    """Splits the species into separate dataframe into sections to be stored within individual Datasources

    Args:
        data: DataFrame of raw data
        site: Name of site from which this data originates
        instrument: Name of instrument
        species: List of species contained in data
        metadata: Dictionary of metadata
        units: Dictionary of units for each species
        scale: Dictionary of scales for each species
        gc_params: GCWERKS parameter dictionary
    Returns:
        dict: Dictionary of gas data and metadata
    """
    from addict import Dict as aDict
    from fnmatch import fnmatch
    from openghg.util import load_json, clean_string

    # Load species translator so we can keep species names consistent
    attributes_data = load_json("attributes.json")
    species_translator = attributes_data["species_translation"]

    # Read inlets from the parameters
    expected_inlets = _get_inlets(site_code=site, gc_params=gc_params)

    try:
        data_inlets = data["Inlet"].unique().tolist()
    except KeyError:
        raise KeyError(
            "Unable to read inlets from data, please ensure this data is of the GC type expected by this retrieve module"
        )

    combined_data = aDict()

    for spec in species:
        # Skip this species if the data is all NaNs
        if data[spec].isnull().all():
            continue

        # Here inlet is the inlet in the data and inlet_label is the label we want to use as metadata
        for inlet, inlet_label in expected_inlets.items():
            # Create a copy of metadata for local modification
            spec_metadata = metadata.copy()
            spec_metadata["units"] = units[spec]
            spec_metadata["scale"] = scale[spec]

            # If we've only got a single inlet
            if inlet == "any" or inlet == "air":
                spec_data = data[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            elif "date" in inlet:
                dates = inlet.split("_")[1:]
                data_sliced = data.loc[dates[0]:dates[1]]

                spec_data = data_sliced[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]
                spec_data = spec_data.dropna(axis="index", how="any")
                spec_metadata["inlet"] = inlet_label
            else:
                # Find the inlet
                matching_inlets = [i for i in data_inlets if fnmatch(i, inlet)]

                if not matching_inlets:
                    continue

                # Only set the label in metadata when we have the correct label
                spec_metadata["inlet"] = inlet_label
                # There should only be one matching label
                select_inlet = matching_inlets[0]
                # Take only data for this inlet from the dataframe
                inlet_data = data.loc[data["Inlet"] == select_inlet]

                spec_data = inlet_data[[
                    spec,
                    spec + " repeatability",
                    spec + " status_flag",
                    spec + " integration_flag",
                    "Inlet",
                ]]

                spec_data = spec_data.dropna(axis="index", how="any")

            # Now we drop the inlet column
            spec_data = spec_data.drop("Inlet", axis="columns")

            # Check that the Dataframe has something in it
            if spec_data.empty:
                continue

            attributes = _get_site_attributes(site=site,
                                              inlet=inlet_label,
                                              instrument=instrument,
                                              gc_params=gc_params)
            attributes = attributes.copy()

            # We want an xarray Dataset
            spec_data = spec_data.to_xarray()

            # Create a standardised / cleaned species label
            try:
                comp_species = species_translator[spec.upper()]["chem"]
            except KeyError:
                comp_species = clean_string(spec.lower())

            # Add the cleaned species name to the metadata and alternative name if present
            spec_metadata["species"] = comp_species
            if comp_species != spec.lower() and comp_species != spec.upper():
                spec_metadata["species_alt"] = spec

            # Rename variables so they have lowercase and alphanumeric names
            to_rename = {}
            for var in spec_data.variables:
                if spec in var:
                    new_name = var.replace(spec, comp_species)
                    to_rename[var] = new_name

            spec_data = spec_data.rename(to_rename)

            # As a single species may have measurements from multiple inlets we
            # use the species and inlet as a key
            data_key = f"{comp_species}_{inlet_label}"

            combined_data[data_key]["metadata"] = spec_metadata
            combined_data[data_key]["data"] = spec_data
            combined_data[data_key]["attributes"] = attributes

    to_return: Dict = combined_data.to_dict()

    return to_return
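
Inlet selection in _split_species relies on fnmatch-style wildcard patterns from the GCWERKS parameters being matched against the inlet labels present in the data. A small illustrative example of that matching; the pattern and labels are assumptions:

# Illustrative fnmatch-based inlet matching
from fnmatch import fnmatch

data_inlets = ["10m", "70m", "248m"]   # hypothetical inlet labels read from the data
inlet_pattern = "70*"                  # hypothetical pattern from gc_params
matching_inlets = [i for i in data_inlets if fnmatch(i, inlet_pattern)]
print(matching_inlets)  # ["70m"]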