Example #1
    def __init__(self):
        from HUGS.Util import load_hugs_json

        self._sampling_period = 0
        # Load the GCWERKS processing parameters
        data = load_hugs_json(filename="process_gcwerks_parameters.json")
        self._gc_params = data["GCWERKS"]
        # Site codes for inlet readings
        self._site_codes = load_hugs_json(filename="site_codes.json")
Example #2
    def __init__(self):
        from HUGS.Util import load_hugs_json

        # Holds parameters used for writing attributes to Datasets
        self._crds_params = {}
        # Sampling period of CRDS data in seconds
        self._sampling_period = 60

        data = load_hugs_json(filename="process_gcwerks_parameters.json")
        self._crds_params = data["CRDS"]
Example #3
    def __init__(self):
        from HUGS.Util import load_hugs_json

        self._eurocom_params = {}
        # Sampling period of EUROCOM data in seconds
        self._sampling_period = 60

        data = load_hugs_json(filename="attributes.json")
        self._eurocom_params = data["EUROCOM"]
Example #4
    def __init__(self):
        from HUGS.Util import load_hugs_json

        # Holds parameters used for writing attributes to Datasets
        self._tb_params = {}
        # Sampling period of the TMB data is not available
        self._sampling_period = "NA"

        data = load_hugs_json(filename="attributes.json")
        self._tb_params = data["TMB"]
Example #5
def synonyms(species: str) -> str:
    """
    Check to see if there are other names that we should be using for
    a particular input. E.g. if CFC-11 or CFC11 was input, go on to use cfc-11,
    as this is the form used in acrg_species_info.json

    Args:
        species (str): Input string that you're trying to match
    Returns:
        str: Matched species string
    """
    from HUGS.Util import load_hugs_json

    # Load in the species data
    species_data = load_hugs_json(filename="acrg_species_info.json")

    # First test whether the species matches a key directly (case insensitive)
    matched_strings = [k for k in species_data if k.upper() == species.upper()]

    # Used to access the alternative names in species_data
    alt_label = "alt"

    # If not found, search synonyms
    if not matched_strings:
        for key in species_data:
            # Iterate over the alternative labels and check for a match
            matched_strings = [
                s for s in species_data[key][alt_label]
                if s.upper() == species.upper()
            ]

            if matched_strings:
                matched_strings = [key]
                break

    if matched_strings:
        updated_species = matched_strings[0]

        return updated_species
    else:
        raise ValueError(f"Unable to find synonym for species {species}")
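
A minimal usage sketch for synonyms; it assumes acrg_species_info.json contains a "cfc-11" entry with "CFC11" listed under its "alt" names, which is an assumption about the JSON contents.

# Hypothetical usage; the exact keys in acrg_species_info.json are assumed
print(synonyms("CFC11"))  # expected to return "cfc-11"

# An unrecognised species raises a ValueError
try:
    synonyms("not-a-species")
except ValueError as err:
    print(err)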
Example #6
    def get_site_attributes(self, site, inlet):
        """ Gets the site specific attributes for writing to Datsets

            Args:
                site (str): Site name
                inlet (str): Inlet (example: 108m)
            Returns:
                dict: Dictionary of attributes
        """
        from HUGS.Util import load_hugs_json

        if not self._crds_params:
            data = load_hugs_json(filename="process_gcwerks_parameters.json")
            self._crds_params = data["CRDS"]

        try:
            attributes = self._crds_params[site.upper()]["global_attributes"]
        except KeyError:
            raise ValueError(f"Unable to read attributes for site: {site}")

        attributes["inlet_height_magl"] = inlet.split("_")[0]
        attributes["comment"] = self._crds_params["comment"]

        return attributes
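
A hedged usage sketch; the enclosing class is not shown in the snippet, so `crds` below stands for an instance of it, and "bsd" / "108m" are example inputs.

# Hypothetical usage; 'crds' is an instance of the enclosing class (not shown above)
attributes = crds.get_site_attributes(site="bsd", inlet="108m")
print(attributes["inlet_height_magl"])  # expected to be "108m"
print(attributes["comment"])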
Example #7
    def read_data(self, data_filepath, site, network):
        """ Splits the gas species stored in the datafile into separate
            Datasets and returns a dictionary keyed by species, where each
            entry holds the metadata, data and site attributes for that gas

            Args:
                data_filepath (pathlib.Path): Path of datafile
                site (str): Site code
                network (str): Network name
            Returns:
                dict: Dictionary keyed by species, each entry containing metadata, data and attributes
        """
        from datetime import datetime
        from pandas import RangeIndex, read_csv, NaT
        from HUGS.Util import load_hugs_json

        # At the moment we're using the filename as the source name
        source_name = data_filepath.stem
        # The inlet is the fourth dot-separated field of the filename stem
        # As we're not processing a list of datafiles here we'll only have one inlet
        inlet = source_name.split(".")[3]

        if "m" not in inlet.lower():
            raise ValueError(
                "No inlet found, we expect filenames such as: bsd.picarro.1minute.108m.dat"
            )

        # Function to parse the datetime format found in the datafile
        def parse_date(date):
            try:
                return datetime.strptime(date, "%y%m%d %H%M%S")
            except ValueError:
                return NaT

        data = read_csv(
            data_filepath,
            header=None,
            skiprows=1,
            sep=r"\s+",
            index_col=["0_1"],
            parse_dates=[[0, 1]],
            date_parser=parse_date,
        )

        data.index.name = "time"

        # Drop any rows with NaNs
        # This is now done before creating metadata
        data = data.dropna(axis="rows", how="any")

        # Get the number of gases in dataframe and number of columns of data present for each gas
        n_gases, n_cols = self.gas_info(data=data)

        header = data.head(2)
        skip_cols = sum(
            [header[column][0] == "-" for column in header.columns])

        metadata = self.read_metadata(filepath=data_filepath, data=data)

        if network is not None:
            metadata["network"] = network

        # Load the CRDS parameters so the default calibration scales can be read
        crds_data = load_hugs_json(filename="process_gcwerks_parameters.json")

        # This dictionary is used to store the gas data and its associated metadata
        combined_data = {}

        for n in range(n_gases):
            # Slice the columns
            gas_data = data.iloc[:, skip_cols + n * n_cols:skip_cols +
                                 (n + 1) * n_cols]

            # Reset the column numbers
            gas_data.columns = RangeIndex(gas_data.columns.size)
            species = gas_data[0][0]
            species = species.lower()

            column_labels = [species, f"{species} stdev", f"{species} n_meas"]

            # Name columns
            gas_data = gas_data.set_axis(column_labels,
                                         axis="columns",
                                         inplace=False)

            header_rows = 2
            # Drop the first two rows now we have the name
            gas_data = gas_data.drop(index=gas_data.head(header_rows).index,
                                     inplace=False)
            # Cast data to float64 / double
            gas_data = gas_data.astype("float64")

            # Here we can convert the Dataframe to a Dataset and then write the attributes
            gas_data = gas_data.to_xarray()

            site_attributes = self.get_site_attributes(site=site, inlet=inlet)

            # Create a copy of the metadata dict
            scale = crds_data["CRDS"]["default_scales"].get(species.upper())

            species_metadata = metadata.copy()
            species_metadata["species"] = species
            species_metadata["inlet"] = inlet
            species_metadata["scale"] = scale

            combined_data[species] = {
                "metadata": species_metadata,
                "data": gas_data,
                "attributes": site_attributes,
            }

        return combined_data
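
A hedged usage sketch for read_data; 'crds' stands for an instance of the enclosing class, and the file path and network value are placeholders. The file must follow the expected naming pattern such as bsd.picarro.1minute.108m.dat.

from pathlib import Path

# Hypothetical usage; the filepath and network are example values
filepath = Path("bsd.picarro.1minute.108m.dat")
combined_data = crds.read_data(data_filepath=filepath, site="bsd", network="DECC")

for species, gas in combined_data.items():
    print(species, gas["metadata"]["inlet"], gas["attributes"].get("comment"))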
Example #8
def get_single_site(
    site: str,
    species: str,
    network: Optional[str] = None,
    start_date: Optional[Union[str, Timestamp]] = None,
    end_date: Optional[Union[str, Timestamp]] = None,
    inlet: Optional[str] = None,
    average: Optional[str] = None,
    instrument: Optional[str] = None,
    keep_missing: Optional[bool] = False,
    calibration_scale: Optional[str] = None,
) -> list:
    """ Get measurements from one site as a list of xarray datasets.
        If there are multiple instruments and inlets at a particular site, 
        note that the acrg_obs_defaults.csv file may be referenced to determine which instrument and inlet to use for each time period.
        If an inlet or instrument changes at some point during time period, multiple datasets will be returned,
        one for each inlet/instrument.

        Args:
            site:
                Site of interest e.g. MHD for the Mace Head site.
            species:
                Species identifier e.g. ch4 for methane.
            start_date: 
                Output start date in a format that Pandas can interpret
            end_date: 
                Output end date in a format that Pandas can interpret
            inlet: 
                Inlet label. If you want to merge all inlets, use "all"
            average:
                Averaging period for each dataset.
                Each value should be a string of the form e.g. "2H", "30min" (should match pandas offset aliases format).
            keep_missing:
                Whether to keep missing data points or drop them.
            network: 
                Network for the site/instrument (must match number of sites).
            instrument:
                Specific instrument for the site (must match number of sites). 
            calibration_scale:
                Convert to this calibration scale (original scale and new scale must both be in acrg_obs_scale_convert.csv)
        Returns:
            list: List of xarray.Datasets
    """
    from pandas import Timestamp, Timedelta
    import numpy as np
    from xarray import concat as xr_concat
    from HUGS.LocalClient import Search
    from HUGS.Util import load_hugs_json

    site_info = load_hugs_json(filename="acrg_site_info.json")
    site = site.upper()

    if site not in site_info:
        raise ValueError(
            f"No site called {site}, please enter a valid site name.")

    # Ensure we have the Timestamps we expect
    if start_date is not None and not isinstance(start_date, Timestamp):
        start_date = Timestamp(start_date)
    if end_date is not None and not isinstance(end_date, Timestamp):
        end_date = Timestamp(end_date)

    # Find the correct synonym for the passed species
    species = synonyms(species)

    search = Search()

    results = search.search(
        species=species,
        locations=site,
        inlet=inlet,
        instrument=instrument,
        start_datetime=start_date,
        end_datetime=end_date,
    )

    # Retrieve all the data found
    selected_keys = list(results)
    retrieved_data = search.retrieve(selected_keys=selected_keys)

    obs_files = []

    for key, dateranges in retrieved_data.items():
        for d in dateranges:
            split_dates = d.split("_")

            start_date = Timestamp(split_dates[0])
            end_date = Timestamp(split_dates[1])

            data = dateranges[d]

            if average is not None:
                if keep_missing is True:

                    # Create a dataset with one element and NaNs to prepend or append
                    ds_single_element = data[{"time": 0}]

                    for v in ds_single_element.variables:
                        if v != "time":
                            ds_single_element[v].values = np.nan

                    ds_concat = []
                    # Pad with an empty entry at the start date
                    if min(data.time) > Timestamp(start_date):
                        ds_single_element_start = ds_single_element.copy()
                        ds_single_element_start.time.values = Timestamp(
                            start_date)
                        ds_concat.append(ds_single_element_start)

                    ds_concat.append(data)

                    # Pad with an empty entry at the end date
                    if max(data.time) < Timestamp(end_date):
                        ds_single_element_end = ds_single_element.copy()
                        ds_single_element_end.time.values = Timestamp(
                            end_date) - Timedelta("1ns")
                        ds_concat.append(ds_single_element_end)

                    data = xr_concat(ds_concat, dim="time")

                    # Now sort to get everything in the right order
                    data = data.sortby("time")

                # First do a mean resample on all variables
                ds_resampled = data.resample(
                    time=average, keep_attrs=True).mean(skipna=False)
                # keep_attrs doesn't seem to work for some reason, so manually copy
                ds_resampled.attrs = data.attrs.copy()

                # For some variables, need a different type of resampling
                for var in data.variables:
                    if "repeatability" in var:
                        ds_resampled[var] = (
                            np.sqrt(
                                (data[var]**2).resample(time=average).sum()) /
                            data[var].resample(time=average).count())

                    elif "variability" in var:
                        # Calculate std of 1 min mf obs in av period as new vmf
                        ds_resampled[var] = (data[var].resample(
                            time=average, keep_attrs=True).std(skipna=False))

                    # Copy over some attributes
                    if "long_name" in data[var].attrs:
                        ds_resampled[var].attrs["long_name"] = data[var].attrs[
                            "long_name"]
                    if "units" in data[var].attrs:
                        ds_resampled[var].attrs["units"] = data[var].attrs[
                            "units"]

                data = ds_resampled.copy()

            # Rename variables
            rename = {}

            for var in data.variables:
                if var.lower() == species.lower():
                    rename[var] = "mf"
                if "repeatability" in var:
                    rename[var] = "mf_repeatability"
                if "variability" in var:
                    rename[var] = "mf_variability"
                if "number_of_observations" in var:
                    rename[var] = "mf_number_of_observations"
                if "status_flag" in var:
                    rename[var] = "status_flag"
                if "integration_flag" in var:
                    rename[var] = "integration_flag"

            data = data.rename_vars(rename)

            data.attrs["species"] = species
            if "Calibration_scale" in data.attrs:
                data.attrs["scale"] = data.attrs.pop("Calibration_scale")

            if calibration_scale is not None:
                data = scale_convert(data, species, calibration_scale)

            obs_files.append(data)

    # Now check if the units match for each of the observation Datasets
    units = set([f.mf.attrs["units"] for f in obs_files])
    if len(units) > 1:
        raise ValueError(
            f"Units do not match for these observation Datasets {[(f.mf.attrs['units'],f.attrs['filename']) for f in obs_files]}"
        )

    scales = set([f.attrs["scale"] for f in obs_files])
    if len(scales) > 1:
        print(
            f"Scales do not match for these observation Datasets {[(f.attrs['scale'],f.attrs['filename']) for f in obs_files]}"
        )
        print("Suggestion: set calibration_scale to convert scales")

    return obs_files
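
A hedged usage sketch for get_single_site; the site and species come from the docstring examples, while the dates and averaging period are assumed example values.

# Hypothetical usage; dates and averaging period are example values
datasets = get_single_site(
    site="MHD",
    species="ch4",
    start_date="2014-01-01",
    end_date="2015-01-01",
    average="2H",
)

for ds in datasets:
    print(ds.attrs.get("scale"), ds["mf"].attrs.get("units"))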
Example #9
    def __init__(self):
        from HUGS.Util import load_hugs_json

        # Holds parameters used for writing attributes to Datasets
        data = load_hugs_json("attributes.json")
        self._noaa_params = data["NOAA"]