Example #1
import pytest

# NOTE: import paths are assumed for this snippet; in older openghg versions
# Datasource may live under a different module
from openghg.store.base import Datasource
from openghg.util import timestamp_tzaware


def test_key_date_compare():
    d = Datasource()

    keys = {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00":
        "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00":
        "data/uuid/test-uid/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00":
        "data/uuid/test-uid/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00":
        "data/uuid/test-uid/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
        "2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00":
        "data/uuid/test-uid/v1/2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00",
        "2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00":
        "data/uuid/test-uid/v1/2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00",
        "2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00":
        "data/uuid/test-uid/v1/2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00",
    }

    start = timestamp_tzaware("2014-01-01")
    end = timestamp_tzaware("2018-01-01")

    in_date = d.key_date_compare(keys=keys, start_date=start, end_date=end)

    expected = [
        "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "data/uuid/test-uid/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "data/uuid/test-uid/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "data/uuid/test-uid/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
    ]

    assert in_date == expected

    start = timestamp_tzaware("2053-01-01")
    end = timestamp_tzaware("2092-01-01")

    in_date = d.key_date_compare(keys=keys, start_date=start, end_date=end)

    assert not in_date

    error_key = {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00_2014-11-30-11:23:30+00:00":
        "broken"
    }

    with pytest.raises(ValueError):
        in_date = d.key_date_compare(keys=error_key,
                                     start_date=start,
                                     end_date=end)
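
The daterange keys above follow a "<start>_<end>" pattern. A minimal sketch of the comparison the test exercises, assuming that key format (an illustrative reimplementation, not the openghg source):

from typing import Dict, List

from pandas import Timestamp


def keys_in_daterange_sketch(keys: Dict[str, str], start_date: Timestamp,
                             end_date: Timestamp) -> List[str]:
    # Hypothetical helper: parse "<start>_<end>" keys and keep the paths
    # whose daterange overlaps the query window
    in_date = []
    for key, path in keys.items():
        parts = key.split("_")
        if len(parts) != 2:
            raise ValueError(f"Invalid daterange key: {key}")
        # "2014-01-30-11:12:30+00:00" -> "2014-01-30 11:12:30+00:00"
        key_start, key_end = (Timestamp(f"{p[:10]} {p[11:]}") for p in parts)
        if key_start <= end_date and key_end >= start_date:
            in_date.append(path)
    return in_date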
Example #2
    def from_data(cls: Type[T], bucket: str, data: Dict, shallow: bool) -> T:
        """Construct a Datasource from JSON

        Args:
            bucket: Bucket containing data
            data: JSON data
            shallow: Load only the JSON data, do not retrieve data from the object store
        Returns:
            Datasource: Datasource created from JSON
        """
        from openghg.util import timestamp_tzaware

        d = cls()
        d._uuid = data["UUID"]
        d._creation_datetime = timestamp_tzaware(data["creation_datetime"])
        d._metadata = data["metadata"]
        d._stored = data["stored"]
        d._data_keys = data["data_keys"]
        d._data = {}
        d._data_type = data["data_type"]
        d._latest_version = data["latest_version"]

        if d._stored and not shallow:
            for date_key, data_key in d._data_keys["latest"]["keys"].items():
                d._data[date_key] = Datasource.load_dataset(bucket=bucket, key=data_key)

        d._stored = False

        d.update_daterange()

        return d
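
A hypothetical round trip pairing from_data with a serialiser (to_data and the bucket path are assumptions, not confirmed API):

# Sketch only: assumes Datasource.to_data() returns the JSON-ready dict
# that from_data() consumes
d = Datasource()
exported = d.to_data()
restored = Datasource.from_data(bucket="/tmp/openghg_store", data=exported, shallow=True)
assert restored._uuid == d._uuid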
Example #3
    def in_daterange(self, start_date: Union[str, Timestamp],
                     end_date: Union[str, Timestamp]) -> bool:
        """Check if the data contained within this Datasource overlaps with the
        dates given.

        Args:
            start_date: Start datetime
            end_date: End datetime
        Returns:
            bool: True if overlap
        """
        from openghg.util import timestamp_tzaware

        # if self._start_date is None or self._end_date is None:
        #     self.update_daterange()

        start_date = timestamp_tzaware(start_date)
        end_date = timestamp_tzaware(end_date)

        return bool((start_date <= self._end_date)
                    and (end_date >= self._start_date))
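
The return expression is the standard interval-overlap test: two ranges intersect iff each starts before the other ends. A standalone illustration with plain pandas Timestamps:

from pandas import Timestamp

a_start, a_end = Timestamp("2014-01-01", tz="UTC"), Timestamp("2015-01-01", tz="UTC")
b_start, b_end = Timestamp("2014-06-01", tz="UTC"), Timestamp("2016-01-01", tz="UTC")

# [a_start, a_end] and [b_start, b_end] overlap
assert a_start <= b_end and b_start <= a_end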
Example #4
    def get_dataframe_daterange(
            self, dataframe: DataFrame) -> Tuple[Timestamp, Timestamp]:
        """Returns the daterange for the passed DataFrame

        Args:
            dataframe: DataFrame to parse
        Returns:
            tuple (Timestamp, Timestamp): Start and end Timestamps for data
        """
        from pandas import DatetimeIndex
        from openghg.util import timestamp_tzaware

        if not isinstance(dataframe.index, DatetimeIndex):
            raise TypeError("The passed DataFrame must have a DatetimeIndex")

        # Here we want to make the pandas Timestamps timezone aware
        start = timestamp_tzaware(dataframe.first_valid_index())
        end = timestamp_tzaware(dataframe.last_valid_index())

        return start, end
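
For instance, with a toy DataFrame (the method call is shown commented out, since it needs a Datasource instance):

import pandas as pd

df = pd.DataFrame(
    {"mf": [1.0, 2.0, 3.0]},
    index=pd.date_range("2020-01-01", periods=3, freq="D"),
)
# start, end = datasource.get_dataframe_daterange(dataframe=df)
# start -> Timestamp('2020-01-01 00:00:00+0000', tz='UTC')
# end   -> Timestamp('2020-01-03 00:00:00+0000', tz='UTC')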
Example #5
# NOTE: top-level imports assumed for this snippet; the private helpers
# (_get_site_data, _get_ecmwf_area, ...) live in the same openghg module
from typing import List, Optional, Union

import cdsapi

from openghg.dataobjects import METData
from openghg.util import timestamp_tzaware


def retrieve_met(
    site: str,
    network: str,
    years: Union[str, List[str]],
    variables: Optional[List[str]] = None,
) -> METData:
    """Retrieve METData data. Note that this function will only download a
    full year of data which may take some time.

    This function currently on retrieves data from the "reanalysis-era5-pressure-levels"
    dataset but may be modified for other datasets in the future.
    Args:
        site: Three letter sitec code
        network: Network
        years: Year(s) of data required
    Returns:
        METData: METData object holding data and metadata
    """
    from openghg.dataobjects import METData

    if variables is None:
        variables = ["u_component_of_wind", "v_component_of_wind"]

    latitude, longitude, site_height, inlet_heights = _get_site_data(
        site=site, network=network)

    # Get the area to retrieve data for
    ecmwf_area = _get_ecmwf_area(site_lat=latitude, site_long=longitude)
    # Calculate the pressure at measurement height(s)
    measure_pressure = _get_site_pressure(inlet_heights=inlet_heights,
                                          site_height=site_height)
    # Calculate the ERA5 pressure levels required
    ecmwf_pressure_levels = _altitude_to_ecmwf_pressure(
        measure_pressure=measure_pressure)

    if not isinstance(years, list):
        years = [years]
    else:
        years = sorted(years)

    # TODO - we might need to customise this further in the future to
    # request other types of weather data
    request = {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": variables,
        "pressure_level": ecmwf_pressure_levels,
        "year": [str(x) for x in years],
        "month": [str(x).zfill(2) for x in range(1, 13)],
        "day": [str(x).zfill(2) for x in range(1, 32)],
        "time": [f"{str(x).zfill(2)}:00" for x in range(0, 24)],
        "area": ecmwf_area,
    }

    cds_client = cdsapi.Client()
    dataset_name = "reanalysis-era5-pressure-levels"

    # Retrieve metadata from Copernicus about the dataset, this includes
    # the location of the data netCDF file.
    result = cds_client.retrieve(name=dataset_name, request=request)

    # Download the data itself
    dataset = _download_data(url=result.location)

    # We replace the date data with a start and end date here
    start_date = str(timestamp_tzaware(f"{years[0]}-1-1"))
    end_date = str(timestamp_tzaware(f"{years[-1]}-12-31"))

    metadata = {
        "product_type": request["product_type"],
        "format": request["format"],
        "variable": request["variable"],
        "pressure_level": request["pressure_level"],
        "area": request["area"],
        "site": site,
        "network": network,
        "start_date": start_date,
        "end_date": end_date,
    }

    return METData(data=dataset, metadata=metadata)
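
A hypothetical call (the site and network values are illustrative; a configured ~/.cdsapirc with Copernicus CDS credentials is required):

met = retrieve_met(site="CGO", network="AGAGE", years=["2019", "2020"])
print(met.metadata["start_date"], met.metadata["end_date"])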
Example #6
# NOTE: top-level imports assumed for this snippet; align_datasets and
# combine_datasets live in the same openghg module
from typing import Optional, Union

from pandas import Timestamp
from xarray import Dataset


def single_site_footprint(
    site: str,
    height: str,
    network: str,
    domain: str,
    species: str,
    start_date: Union[str, Timestamp],
    end_date: Union[str, Timestamp],
    resample_to: str = "coarsest",
    site_modifier: Optional[str] = None,
    platform: Optional[str] = None,
    instrument: Optional[str] = None,
) -> Dataset:
    """Creates a Dataset for a single site's measurement data and footprints

    Args:
        site: Site name
        height: Height of inlet in metres
        network: Network name
        resample_to: Resample the data to a given time dataset.
        Valid options are ["obs", "footprints", "coarsen"].
            - "obs" resamples the footprints to the observation time series data
            - "footprints" resamples to to the footprints time series
            - "coarsest" resamples to the data with the coarsest time resolution
        site_modifier: The name of the site given in the footprints.
                       This is useful for example if the same site footprints are run with a different met and
                       they are named slightly differently from the obs file. E.g.
                       site="DJI", site_modifier = "DJI-SAM" - station called DJI, footprints site called DJI-SAM
        platform: Observation platform used to decide whether to resample
        instrument:
        species: Species type
    Returns:
        xarray.Dataset
    """
    from openghg.retrieve import get_obs_surface, get_footprint
    from openghg.util import timestamp_tzaware

    start_date = timestamp_tzaware(start_date)
    end_date = timestamp_tzaware(end_date)

    resample_to = resample_to.lower()
    resample_choices = ("obs", "footprints", "coarsest")
    if resample_to not in resample_choices:
        raise ValueError(
            f"Invalid resample choice {resample_to} passed, please select one of {resample_choices}"
        )

    # As we're not retrieving any satellite data yet, set the tolerance to None
    # and override any platform passed in
    tolerance = None
    platform = None

    # Retrieve the observation data
    obs_results = get_obs_surface(
        site=site,
        inlet=height,
        start_date=start_date,
        end_date=end_date,
        species=species,
        instrument=instrument,
    )

    obs_data = obs_results.data

    # Save the observation data units
    try:
        units: Union[float, None] = float(obs_data.mf.attrs["units"])
    except KeyError:
        units = None
    except AttributeError:
        raise AttributeError(
            "Unable to read mf attribute from observation data.")

    # If the site for the footprints has a different name, pass that in
    if site_modifier:
        footprint_site = site_modifier
    else:
        footprint_site = site

    # Try to find an appropriate footprints file, first with and then without the species name
    try:
        footprint = get_footprint(
            site=footprint_site,
            domain=domain,
            height=height,
            start_date=start_date,
            end_date=end_date,
            species=species,
        )
    except ValueError:
        footprint = get_footprint(
            site=footprint_site,
            domain=domain,
            height=height,
            start_date=start_date,
            end_date=end_date,
        )

    # TODO: Add checks for particular species e.g. co2 and short-lived species
    # which should have a specific footprints available rather than the generic one

    # Extract dataset
    footprint_data = footprint.data

    # Align the two Datasets
    aligned_obs, aligned_footprint = align_datasets(
        obs_data=obs_data,
        footprint_data=footprint_data,
        platform=platform,
        resample_to=resample_to,
    )

    combined_dataset = combine_datasets(dataset_A=aligned_obs,
                                        dataset_B=aligned_footprint,
                                        tolerance=tolerance)

    # Transpose to keep time in the last dimension position in case it has been moved in resample
    combined_dataset = combined_dataset.transpose(..., "time")

    if units is not None:
        combined_dataset["fp"].values = combined_dataset["fp"].values / units
        # if HiTRes:
        #     combined_dataset.update({"fp_HiTRes": (combined_dataset.fp_HiTRes.dims, (combined_dataset.fp_HiTRes / units))})

    return combined_dataset
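
A hypothetical call, assuming matching observation and footprints data have already been added to the object store (all argument values are illustrative):

combined = single_site_footprint(
    site="TAC",
    height="100m",
    network="DECC",
    domain="EUROPE",
    species="ch4",
    start_date="2016-01-01",
    end_date="2016-06-01",
    resample_to="coarsest",
)
print(combined)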