Example #1
def trim_daterange(to_trim: str, overlapping: str) -> str:
    """Removes overlapping dates from to_trim

    Args:
        to_trim: Daterange to trim down. Dates that overlap
        with overlapping will be removed from to_trim
        overlapping: Daterange containing the dates we want
        trimmed from to_trim
    Returns:
        str: Trimmed daterange
    """
    from pandas import Timedelta

    from openghg.util import create_daterange_str, daterange_overlap, split_daterange_str

    if not daterange_overlap(daterange_a=to_trim, daterange_b=overlapping):
        raise ValueError(
            f"Dateranges {to_trim} and {overlapping} do not overlap")

    # We need to work out which way round they overlap
    start_trim, end_trim = split_daterange_str(to_trim)
    start_overlap, end_overlap = split_daterange_str(overlapping)

    delta_gap = Timedelta("1s")

    # Work out if to_trim starts before or after the overlapping daterange
    if start_trim < start_overlap and end_overlap > end_trim:
        new_end_trim = start_overlap - delta_gap
        return create_daterange_str(start=start_trim, end=new_end_trim)
    else:
        new_start_trim = end_overlap + delta_gap
        return create_daterange_str(start=new_start_trim, end=end_trim)
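
# A minimal usage sketch (hypothetical dates). It assumes trim_daterange and
# the helpers above are importable from openghg.util, alongside the
# split_daterange_str imports seen elsewhere in this module.
from pandas import Timestamp

from openghg.util import create_daterange_str, trim_daterange

to_trim = create_daterange_str(start=Timestamp("2001-01-01", tz="UTC"),
                               end=Timestamp("2001-06-01", tz="UTC"))
overlapping = create_daterange_str(start=Timestamp("2001-03-01", tz="UTC"),
                                   end=Timestamp("2001-09-01", tz="UTC"))

# to_trim starts first, so its end is trimmed back to one second (delta_gap)
# before the overlap starts: 2001-02-28 23:59:59 UTC
trimmed = trim_daterange(to_trim=to_trim, overlapping=overlapping)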
Example #2
def daterange_contains(container: str, contained: str) -> bool:
    """Check if the daterange container contains the daterange contained

    Args:
        container: Daterange
        contained: Daterange
    Returns:
        bool: True if contained lies within container
    """
    from openghg.util import split_daterange_str

    start_a, end_a = split_daterange_str(container)
    start_b, end_b = split_daterange_str(contained)

    return bool(start_a <= start_b and end_b <= end_a)
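
# Usage sketch with hypothetical dateranges: the outer range fully contains
# the inner one, but not the other way around (import path assumed, matching
# the other openghg.util helpers).
from openghg.util import daterange_contains

outer = "2001-01-01-00:00:00_2001-12-01-00:00:00"
inner = "2001-03-01-00:00:00_2001-06-01-00:00:00"

assert daterange_contains(container=outer, contained=inner)
assert not daterange_contains(container=inner, contained=outer)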
Example #3
from typing import List, Union


def closest_daterange(to_compare: str, dateranges: Union[str, List[str]]) -> str:
    """Finds the closest daterange in a list of dateranges

    Args:
        to_compare: Daterange (as a string) to compare
        dateranges: List of dateranges
    Returns:
        str: Daterange from dateranges that's the closest in time to to_compare
    """
    from openghg.util import split_daterange_str
    from pandas import Timedelta

    min_start = Timedelta("3650days")
    min_end = Timedelta("3650days")

    if not isinstance(dateranges, list):
        dateranges = [dateranges]

    dateranges = sorted(dateranges)

    start_comp, end_comp = split_daterange_str(daterange_str=to_compare)
    # We want to iterate over the dateranges and first check if they overlap;
    # if they do, raise an error, otherwise check how far apart the start and
    # end of each daterange are from the daterange we're comparing against
    for daterange in dateranges:
        # If they're close to overlapping, the starts and ends will be close
        start, end = split_daterange_str(daterange_str=daterange)

        # Check for an overlap
        if start <= end_comp and end >= start_comp:
            raise ValueError("Overlapping daterange.")

        # Find the min between all the starts and all the ends
        diff_start_end = abs(start_comp - end)
        if diff_start_end < min_start:
            min_start = diff_start_end
            closest_daterange_start = daterange

        diff_end_start = abs(end_comp - start)
        if diff_end_start < min_end:
            min_end = diff_end_start
            closest_daterange_end = daterange

    if min_start < min_end:
        return closest_daterange_start
    else:
        return closest_daterange_end
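
# Usage sketch (hypothetical, non-overlapping dateranges): the 2016 range is
# nearer in time to the 2015 comparison range than the 2012 one, so it is
# returned (import path assumed from openghg.util).
from openghg.util import closest_daterange

dateranges = ["2012-01-01-00:00:00_2012-06-01-00:00:00",
              "2016-01-01-00:00:00_2016-06-01-00:00:00"]

closest = closest_daterange(to_compare="2015-01-01-00:00:00_2015-06-01-00:00:00",
                            dateranges=dateranges)
# closest == "2016-01-01-00:00:00_2016-06-01-00:00:00"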
Example #4
from pandas import Timestamp

from openghg.util import split_daterange_str


def test_split_daterange_str():
    start_true = Timestamp("2001-01-01-00:00:00", tz="UTC")
    end_true = Timestamp("2001-03-01-00:00:00", tz="UTC")

    daterange_1 = "2001-01-01-00:00:00_2001-03-01-00:00:00"

    start, end = split_daterange_str(daterange_str=daterange_1)

    assert start_true == start
    assert end_true == end
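
# A companion round-trip sketch: create_daterange_str is assumed to be the
# inverse of split_daterange_str, producing the "start_end" string format
# seen in the find_daterange_gaps docstring below.
from pandas import Timestamp

from openghg.util import create_daterange_str, split_daterange_str

daterange = create_daterange_str(start=Timestamp("2001-01-01", tz="UTC"),
                                 end=Timestamp("2001-03-01", tz="UTC"))
start, end = split_daterange_str(daterange_str=daterange)

assert start == Timestamp("2001-01-01", tz="UTC")
assert end == Timestamp("2001-03-01", tz="UTC")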
Example #5
    def update_daterange(self) -> None:
        """Update the dates stored by this Datasource

        Returns:
            None
        """
        from openghg.util import split_daterange_str

        # If this Datasource has only been shallow loaded (without the data)
        # we use the latest data keys
        if not self._data:
            date_keys = sorted(self._data_keys["latest"]["keys"])
        else:
            date_keys = sorted(self._data.keys())

        start, _ = split_daterange_str(daterange_str=date_keys[0])
        _, end = split_daterange_str(daterange_str=date_keys[-1])

        self._start_date = start
        self._end_date = end
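
# Usage sketch: refresh the cached date bounds on a shallow-loaded
# Datasource. Datasource.load(uuid=..., shallow=True) follows the pattern
# used by search() below; the uuid here is a hypothetical placeholder.
from openghg.store.base import Datasource

datasource = Datasource.load(uuid="some-datasource-uuid", shallow=True)
datasource.update_daterange()
# _start_date and _end_date now bound the first and last stored data keys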
Example #6
def sanitise_daterange(daterange: str) -> str:
    """Make sure the daterange is correct and return
    tzaware daterange.

    Args:
        daterange: Daterange str
    Returns:
        str: Timezone aware daterange str
    """
    from openghg.util import create_daterange_str, split_daterange_str

    start, end = split_daterange_str(daterange)

    if start >= end:
        raise ValueError("Invalid daterange, start must be before end")

    return create_daterange_str(start=start, end=end)
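
# Usage sketch (hypothetical dates): a naive daterange string comes back
# timezone aware, e.g. with "+00:00" offsets as in the find_daterange_gaps
# docstring below; a reversed range raises (import path assumed).
from openghg.util import sanitise_daterange

clean = sanitise_daterange("2001-01-01-00:00:00_2001-03-01-00:00:00")
# sanitise_daterange("2001-03-01-00:00:00_2001-01-01-00:00:00") raises ValueError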
Example #7
def valid_daterange(daterange: str) -> bool:
    """Check if the passed daterange is valid

    Args:
        daterange: Daterange string
    Returns:
        bool: True if valid
    """
    from openghg.util import split_daterange_str

    start, end = split_daterange_str(daterange)

    if start >= end:
        return False

    return True
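
# Usage sketch: validity simply means the start precedes the end
# (import path assumed from openghg.util).
from openghg.util import valid_daterange

assert valid_daterange("2001-01-01-00:00:00_2001-03-01-00:00:00")
assert not valid_daterange("2001-03-01-00:00:00_2001-01-01-00:00:00")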
Example #8
from typing import List, Tuple

from pandas import Timestamp


def combine_dateranges(dateranges: List[str]) -> List[str]:
    """Combine dateranges

    Args:
        dateranges: Daterange strings
    Returns:
        list: List of combined dateranges

    Modified from
    https://codereview.stackexchange.com/a/69249
    """
    from openghg.util import create_daterange_str, split_daterange_str

    if len(dateranges) == 1:
        return dateranges

    def sort_key(tup: Tuple) -> Timestamp:
        return tup[0]

    intervals = [split_daterange_str(x) for x in dateranges]
    sorted_by_lower_bound = sorted(intervals, key=sort_key)

    # Each entry is a (start, end) pair of Timestamps
    combined: List[Tuple[Timestamp, Timestamp]] = []

    for higher in sorted_by_lower_bound:
        if not combined:
            combined.append(higher)
        else:
            lower = combined[-1]
            # Test for intersection between lower and higher:
            # We know via sorting that lower[0] <= higher[0]
            if higher[0] <= lower[1]:
                upper_bound = max(lower[1], higher[1])
                # Replace by combined interval
                combined[-1] = (lower[0], upper_bound)
            else:
                combined.append(higher)

    combined_strings = [
        create_daterange_str(start=a, end=b) for a, b in combined
    ]

    return combined_strings
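
# Usage sketch (hypothetical ranges): the first two ranges overlap and are
# merged into one; the disjoint 2016 range is kept separate. The tz-aware
# output format is assumed to match the find_daterange_gaps docstring below.
from openghg.util import combine_dateranges

combined = combine_dateranges([
    "2001-01-01-00:00:00_2001-06-01-00:00:00",
    "2001-03-01-00:00:00_2001-09-01-00:00:00",
    "2016-01-01-00:00:00_2016-06-01-00:00:00",
])
# combined == ["2001-01-01-00:00:00+00:00_2001-09-01-00:00:00+00:00",
#              "2016-01-01-00:00:00+00:00_2016-06-01-00:00:00+00:00"]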
Example #9
def search(**kwargs):  # type: ignore
    """Search for observations data. Any keyword arguments may be passed to the
    the function and these keywords will be used to search the metadata associated
    with each Datasource.

    Example / commonly used arguments are given below.

    Args:
        species: Terms to search for in Datasources
        locations: Where to search for the terms in species
        inlet: Inlet height such as 100m
        instrument: Instrument name such as picarro
        find_all: Require all search terms to be satisfied
        start_date: Start datetime for search.
        If None a start datetime of UNIX epoch (1970-01-01) is set
        end_date: End datetime for search.
        If None an end datetime of the current datetime is set
        skip_ranking: If True skip ranking system, defaults to False
    Returns:
        dict: List of keys of Datasources matching the search parameters
    """
    from addict import Dict as aDict
    from copy import deepcopy
    from itertools import chain as iter_chain
    from typing import Dict, Union

    from openghg.store import ObsSurface, Footprints, Emissions, EulerianModel
    from openghg.store.base import Datasource

    from openghg.util import (
        timestamp_now,
        timestamp_epoch,
        timestamp_tzaware,
        clean_string,
        closest_daterange,
        find_daterange_gaps,
        split_daterange_str,
        load_json,
    )
    from openghg.dataobjects import SearchResults

    # Get a copy of kwargs as we make some modifications below
    kwargs_copy = deepcopy(kwargs)

    # Do this here otherwise we have to produce them for every datasource
    start_date = kwargs.get("start_date")
    end_date = kwargs.get("end_date")

    if start_date is None:
        start_date = timestamp_epoch()
    else:
        start_date = timestamp_tzaware(start_date)

    if end_date is None:
        end_date = timestamp_now()
    else:
        end_date = timestamp_tzaware(end_date)

    kwargs_copy["start_date"] = start_date
    kwargs_copy["end_date"] = end_date

    skip_ranking = kwargs_copy.get("skip_ranking", False)

    try:
        del kwargs_copy["skip_ranking"]
    except KeyError:
        pass

    # As we might have kwargs that are None we want to get rid of those
    search_kwargs = {k: clean_string(v) for k, v in kwargs_copy.items() if v is not None}

    # Species translation

    species = search_kwargs.get("species")

    if species is not None:
        if not isinstance(species, list):
            species = [species]

        translator = load_json("species_translator.json")

        updated_species = []

        for s in species:
            updated_species.append(s)

            try:
                translated = translator[s]
            except KeyError:
                pass
            else:
                updated_species.extend(translated)

        search_kwargs["species"] = updated_species

    data_type = search_kwargs.get("data_type", "timeseries")

    valid_data_types = ("timeseries", "footprints", "emissions", "eulerian_model")
    if data_type not in valid_data_types:
        raise ValueError(f"{data_type} is not a valid data type, please select one of {valid_data_types}")

    # Assume we want timeseries data
    obj: Union[ObsSurface, Footprints, Emissions, EulerianModel] = ObsSurface.load()

    if data_type == "footprints":
        obj = Footprints.load()
    elif data_type == "emissions":
        obj = Emissions.load()
    elif data_type == "eulerian_model":
        obj = EulerianModel.load()

    datasource_uuids = obj.datasources()

    # Shallow load the Datasources so we can search their metadata
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    # For the time being this will return a dict until we know how best to represent
    # the footprints and emissions results in a SearchResults object
    if data_type in {"emissions", "footprints", "eulerian_model"}:
        sources: Dict = aDict()
        for datasource in datasources:
            if datasource.search_metadata(**search_kwargs):
                uid = datasource.uuid()
                sources[uid]["keys"] = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)
                sources[uid]["metadata"] = datasource.metadata()

        return sources

    # Find the Datasources that contain matching metadata
    matching_sources = {d.uuid(): d for d in datasources if d.search_metadata(**search_kwargs)}

    # TODO - Update this as it only uses the ACRG repo JSON at the moment
    # Check if this site only has one inlet, if so skip ranking
    # if "site" in search_kwargs:
    #     site = search_kwargs["site"]
    #     if not isinstance(site, list) and not multiple_inlets(site=site):
    #         skip_ranking = True

    # If there isn't *any* ranking data at all, skip all the ranking functionality
    if not obj._rank_data:
        skip_ranking = True

    # If only one datasource has been returned, skip all the ranking functionality
    if len(matching_sources) == 1:
        skip_ranking = True

    # If we have the site, inlet and species then just return the data
    # TODO - should instrument be added here?
    if {"site", "inlet", "species"} <= search_kwargs.keys() or skip_ranking is True:
        specific_sources = aDict()
        for datasource in matching_sources.values():
            specific_keys = datasource.keys_in_daterange(start_date=start_date, end_date=end_date)

            if not specific_keys:
                continue

            metadata = datasource.metadata()

            site = metadata["site"]
            species = metadata["species"]
            inlet = metadata["inlet"]

            specific_sources[site][species][inlet]["keys"] = specific_keys
            specific_sources[site][species][inlet]["metadata"] = metadata

        return SearchResults(results=specific_sources.to_dict(), ranked_data=False)

    highest_ranked = aDict()

    for uid, datasource in matching_sources.items():
        # Find the site and then the ranking
        metadata = datasource.metadata()
        # Get the site inlet and species
        site = metadata["site"]
        species = metadata["species"]

        rank_data = obj.get_rank(uuid=uid, start_date=start_date, end_date=end_date)

        # If this Datasource doesn't have any ranking data skip it and move on
        if not rank_data:
            continue

        # There will only be a single rank key
        rank_value = next(iter(rank_data))
        # Get the daterange this rank covers
        rank_dateranges = rank_data[rank_value]

        # Each match we store gives us the information we need
        # to retrieve the data
        match = {"uuid": uid, "dateranges": rank_dateranges}

        # Need to ensure we get all the dates covered
        if species in highest_ranked[site]:
            species_rank_data = highest_ranked[site][species]

            # If we have a higher (lower number) rank save it
            if rank_value < species_rank_data["rank"]:
                species_rank_data["rank"] = rank_value
                species_rank_data["matching"] = [match]
            # If another Datasource has the same rank for another daterange
            # we want to save that as well
            elif rank_value == species_rank_data["rank"]:
                species_rank_data["matching"].append(match)
        else:
            highest_ranked[site][species]["rank"] = rank_value
            highest_ranked[site][species]["matching"] = [match]

    if not highest_ranked:
        raise ValueError(
            (
                "No ranking data set for the given search parameters."
                " Please refine your search to include a specific site, species and inlet."
            )
        )
    # Now that we have the highest ranked data and the dateranges those ranks
    # cover, we want to fill in the gaps with (currently) the highest ranked
    # inlet from that site

    # We just want some rank_metadata to go along with the final data scheme,
    # keyed by daterange - inlet
    data_keys: Dict = aDict()
    for site, species in highest_ranked.items():
        for sp, data in species.items():
            # data_keys[site][sp]["keys"] = []

            species_keys = []
            species_rank_data = {}
            species_metadata = {}

            for match_data in data["matching"]:
                uuid = match_data["uuid"]
                match_dateranges = match_data["dateranges"]
                # Get the datasource as it's already in the dictionary
                # we created earlier
                datasource = matching_sources[uuid]
                metadata = datasource.metadata()
                inlet = metadata["inlet"]

                keys = []
                for dr in match_dateranges:
                    date_keys = datasource.keys_in_daterange_str(daterange=dr)

                    if date_keys:
                        keys.extend(date_keys)
                        # We'll add this to the metadata in the search results we return at the end
                        species_rank_data[dr] = inlet

                species_keys.extend(keys)
                species_metadata[inlet] = metadata

            # Only create the dictionary keys if we have some data keys
            if species_keys:
                data_keys[site][sp]["keys"] = species_keys
                data_keys[site][sp]["rank_metadata"] = species_rank_data
                data_keys[site][sp]["metadata"] = species_metadata
            else:
                continue

            # We now need to retrieve data for the dateranges for which we don't have ranking data
            # To do this find the gaps in the daterange over which the user has requested data
            # and the dates for which we have ranking information

            # Get the dateranges that are covered by ranking information
            daterange_strs = list(iter_chain.from_iterable([m["dateranges"] for m in data["matching"]]))
            # Find the gaps in the ranking coverage
            gap_dateranges = find_daterange_gaps(
                start_search=start_date, end_search=end_date, dateranges=daterange_strs
            )

            # We want the dateranges and inlets for those dateranges
            inlet_dateranges = data_keys[site][sp]["rank_metadata"]
            # These are the dateranges for which we have ranking information for this site and species
            ranked_dateranges = list(data_keys[site][sp]["rank_metadata"].keys())

            for gap_daterange in gap_dateranges:
                # We want to select the inlet that's ranked for dates closest to the ones we have here
                closest_dr = closest_daterange(to_compare=gap_daterange, dateranges=ranked_dateranges)

                gap_start, gap_end = split_daterange_str(gap_daterange)
                # Find the closest ranked inlet by date
                chosen_inlet = inlet_dateranges[closest_dr]

                inlet_metadata = data_keys[site][sp]["metadata"][chosen_inlet]
                inlet_instrument = inlet_metadata["instrument"]
                inlet_sampling_period = inlet_metadata["sampling_period"]

                # Then we want to retrieve the correct metadata for those inlets
                results: SearchResults = search(
                    site=site,
                    species=sp,
                    inlet=chosen_inlet,
                    instrument=inlet_instrument,
                    sampling_period=inlet_sampling_period,
                    start_date=gap_start,
                    end_date=gap_end,
                )  # type: ignore

                if not results:
                    continue

                # Retrieve the data keys
                inlet_data_keys = results.keys(site=site, species=sp, inlet=chosen_inlet)

                data_keys[site][sp]["keys"].extend(inlet_data_keys)

            # Remove any duplicate keys
            data_keys[site][sp]["keys"] = list(set(data_keys[site][sp]["keys"]))

    # TODO - create a stub for addict
    dict_data_keys = data_keys.to_dict()  # type: ignore

    return SearchResults(results=dict_data_keys, ranked_data=True)
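
# Usage sketch with hypothetical search terms: when site, species and inlet
# are all supplied the ranking logic above is skipped and keys are returned
# directly via the SearchResults object.
results = search(site="bsd", species="ch4", inlet="100m",
                 start_date="2014-01-01", end_date="2015-01-01")

data_keys = results.keys(site="bsd", species="ch4", inlet="100m")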
Example #10
from typing import Dict


def split_encompassed_daterange(container: str, contained: str) -> Dict:
    """Checks if one of the passed dateranges contains the other, if so, then
    split the larger daterange into three sections.

          <---a--->
    <---------b----------->

    Here b is split into three and we end up with:

    <-dr1-><---a---><-dr2->

    Args:
        container: Daterange that should contain the other
        contained: Daterange contained within container
    Returns:
        dict: Dictionary of results
    """
    from pandas import Timedelta

    from openghg.util import create_daterange_str, split_daterange_str

    container_start, container_end = split_daterange_str(
        daterange_str=container)
    contained_start, contained_end = split_daterange_str(
        daterange_str=contained)

    # First check one contains the other
    if not (container_start <= contained_start
            and contained_end <= container_end):
        raise ValueError(f"Range {container} does not contain {contained}")

    # Gap to add between dateranges so they don't overlap
    delta_gap = Timedelta("1s")
    # If the difference is less than this we'll assume they're the same timestamp
    tolerance = Timedelta("2h")

    results = {}
    # If one of them starts at the same point we just want to split the range in two
    if abs(contained_start - container_start) < tolerance:
        new_contained = create_daterange_str(start=contained_start,
                                             end=contained_end)
        # Here the leftover piece of the container lies after contained
        dr1_start = contained_end + delta_gap
        dr1 = create_daterange_str(start=dr1_start, end=container_end)

        results["container_end"] = dr1
        results["contained"] = new_contained

        return results

    if abs(contained_end - container_end) < tolerance:
        new_contained = create_daterange_str(start=contained_start,
                                             end=contained_end)
        dr1_end = contained_start - delta_gap
        dr1 = create_daterange_str(start=container_start, end=dr1_end)

        results["container_start"] = dr1
        results["contained"] = new_contained

        return results

    dr1_start = container_start
    dr1_end = contained_start - delta_gap
    dr1 = create_daterange_str(start=dr1_start, end=dr1_end)

    dr3_start = contained_end + delta_gap
    dr3_end = container_end
    dr3 = create_daterange_str(start=dr3_start, end=dr3_end)

    # Trim a gap off the end of contained
    new_contained_end = contained_end - delta_gap
    new_contained = create_daterange_str(start=contained_start,
                                         end=new_contained_end)

    results["container_start"] = dr1
    results["contained"] = new_contained
    results["container_end"] = dr3

    return results
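
# Usage sketch matching the docstring diagram: contained splits container
# into a start section, the contained range (trimmed by delta_gap) and an
# end section (import path assumed from openghg.util).
from openghg.util import split_encompassed_daterange

results = split_encompassed_daterange(
    container="2001-01-01-00:00:00_2001-12-01-00:00:00",
    contained="2001-04-01-00:00:00_2001-07-01-00:00:00",
)
# results["container_start"] covers January up to just before April,
# results["contained"] covers April to just before July,
# results["container_end"] covers just after July to December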
Example #11
from typing import List

from pandas import Timestamp


def find_daterange_gaps(start_search: Timestamp, end_search: Timestamp,
                        dateranges: List) -> List[str]:
    """Given a start and end date and a list of dateranges find the gaps.

    For example given a list of dateranges

    example = ['2014-09-02_2014-11-01', '2016-09-02_2018-11-01']

    start = timestamp_tzaware("2012-01-01")
    end = timestamp_tzaware("2019-09-01")

    gaps = find_daterange_gaps(start, end, example)

    gaps == ['2012-01-01-00:00:00+00:00_2014-09-01-00:00:00+00:00',
            '2014-11-02-00:00:00+00:00_2016-09-01-00:00:00+00:00',
            '2018-11-02-00:00:00+00:00_2019-09-01-00:00:00+00:00']

    Args:
        start_search: Start timestamp
        end_search: End timestamp
        dateranges: List of daterange strings
    Returns:
        list: List of dateranges
    """
    from pandas import Timedelta

    from openghg.util import create_daterange_str, pairwise, split_daterange_str

    sorted_dateranges = sorted(dateranges)

    # The gap to leave between the end of one daterange and the start of the next
    range_gap = "1day"
    # First find any gap between the search start and the first daterange
    start_first, end_first = split_daterange_str(sorted_dateranges[0])

    gaps = []
    if start_search < start_first:
        gap_start = start_search
        gap_end = start_first - Timedelta(range_gap)
        gap = create_daterange_str(start=gap_start, end=gap_end)
        gaps.append(gap)

    # Then any gap between the last daterange and the search end
    start_last, end_last = split_daterange_str(sorted_dateranges[-1])

    if end_search > end_last:
        gap_end = end_search
        gap_start = end_last + Timedelta(range_gap)
        gap = create_daterange_str(start=gap_start, end=gap_end)
        gaps.append(gap)

    for a, b in pairwise(sorted_dateranges):
        start_a, end_a = split_daterange_str(a)
        start_b, end_b = split_daterange_str(b)

        # Ignore any that are outside our search window
        if end_a < start_search or start_a > end_search:
            continue

        diff = start_b - end_a
        if diff > Timedelta(range_gap) and diff.value > 0:
            gap_start = end_a + Timedelta(range_gap)
            gap_end = start_b - Timedelta(range_gap)

            gap = create_daterange_str(start=gap_start, end=gap_end)
            gaps.append(gap)

    gaps.sort()

    return gaps
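
# The docstring example above as a runnable sketch (import paths assumed to
# match the other openghg.util helpers used in this module):
from openghg.util import find_daterange_gaps, timestamp_tzaware

example = ["2014-09-02_2014-11-01", "2016-09-02_2018-11-01"]

gaps = find_daterange_gaps(start_search=timestamp_tzaware("2012-01-01"),
                           end_search=timestamp_tzaware("2019-09-01"),
                           dateranges=example)
# gaps == ["2012-01-01-00:00:00+00:00_2014-09-01-00:00:00+00:00",
#          "2014-11-02-00:00:00+00:00_2016-09-01-00:00:00+00:00",
#          "2018-11-02-00:00:00+00:00_2019-09-01-00:00:00+00:00"]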