Example #1
def test_combining_overlapping_dateranges():
    d = Datasource()

    daterange_1 = "2001-01-01-00:00:00_2001-03-01-00:00:00"
    daterange_2 = "2001-02-01-00:00:00_2001-06-01-00:00:00"

    dateranges = [daterange_1, daterange_2]

    combined = d.combine_dateranges(dateranges=dateranges)

    assert combined == ['2001-01-01-00:00:00+00:00_2001-06-01-00:00:00+00:00']

    daterange_1 = "2001-01-01-00:00:00_2001-03-01-00:00:00"
    daterange_2 = "2001-02-01-00:00:00_2001-06-01-00:00:00"
    daterange_3 = "2001-05-01-00:00:00_2001-08-01-00:00:00"
    daterange_4 = "2004-05-01-00:00:00_2004-08-01-00:00:00"
    daterange_5 = "2004-04-01-00:00:00_2004-09-01-00:00:00"
    daterange_6 = "2007-04-01-00:00:00_2007-09-01-00:00:00"

    dateranges = [daterange_1, daterange_2, daterange_3, daterange_4, daterange_5, daterange_6]

    combined = d.combine_dateranges(dateranges=dateranges)

    assert combined == ['2001-01-01-00:00:00+00:00_2001-08-01-00:00:00+00:00', 
                        '2004-04-01-00:00:00+00:00_2004-09-01-00:00:00+00:00', 
                        '2007-04-01-00:00:00+00:00_2007-09-01-00:00:00+00:00']
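
The expected strings show classic interval merging: sort the ranges, then sweep and extend the current interval on overlap. A minimal sketch of how combine_dateranges could work, assuming UTC timestamps and the "start_end" string format used above (merge_dateranges is illustrative, not the actual HUGS implementation):

import pandas as pd

def merge_dateranges(dateranges):
    # Parse each "start_end" string into a (start, end) pair of UTC Timestamps
    pairs = sorted(
        tuple(pd.Timestamp(part, tz="UTC") for part in dr.split("_"))
        for dr in dateranges
    )

    # Sweep the sorted intervals, extending the current one whenever the next overlaps it
    merged = [list(pairs[0])]
    for start, end in pairs[1:]:
        if start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])

    # Format back to daterange strings; the "+00:00" suffix assumes UTC
    fmt = "%Y-%m-%d-%H:%M:%S"
    return [f"{s.strftime(fmt)}+00:00_{e.strftime(fmt)}+00:00" for s, e in merged]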
Example #2
def test_set_incorrect_rank_raises():
    d = Datasource()    

    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"

    with pytest.raises(TypeError):
        d.set_rank(rank=42, daterange=daterange)
Example #3
def test_add_data(data):
    d = Datasource(name="test")

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    assert ch4_data["ch4"][0] == pytest.approx(1960.24)
    assert ch4_data["ch4 stdev"][0] == pytest.approx(0.236)
    assert ch4_data["ch4 n_meas"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data)

    date_key = "2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00"

    assert d._data[date_key]["ch4"].equals(ch4_data["ch4"])
    assert d._data[date_key]["ch4 stdev"].equals(ch4_data["ch4 stdev"])
    assert d._data[date_key]["ch4 n_meas"].equals(ch4_data["ch4 n_meas"])

    datasource_metadata = d.metadata()

    assert datasource_metadata["data_type"] == "timeseries"
    assert datasource_metadata["inlet"] == "248m"
    assert datasource_metadata["instrument"] == "picarro"
    assert datasource_metadata["port"] == "8"
    assert datasource_metadata["site"] == "bsd"
    assert datasource_metadata["species"] == "ch4"
Example #4
def test_incorrect_datatype_raises(data):
    d = Datasource(name="testing_123")

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    with pytest.raises(TypeError):
        d.add_data(metadata=metadata, data=ch4_data, data_type="CRDS")
Example #5
def test_set_rank():
    d = Datasource()    

    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"

    d.set_rank(rank=1, daterange=daterange)

    assert d._rank[1] == ['2027-08-01-00:00:00_2027-12-01-00:00:00']
Example #6
def test_combining_single_dateranges_returns():
    d = Datasource()

    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"

    combined = d.combine_dateranges(dateranges=[daterange])

    assert combined[0] == daterange
Example #7
def test_combining_no_overlap():
    d = Datasource()
    daterange_1 = "2001-01-01-00:00:00_2001-03-01-00:00:00"
    daterange_2 = "2011-02-01-00:00:00_2011-06-01-00:00:00"

    dateranges = [daterange_1, daterange_2]

    combined = d.combine_dateranges(dateranges=dateranges)

    assert combined == ['2001-01-01-00:00:00+00:00_2001-03-01-00:00:00+00:00', '2011-02-01-00:00:00+00:00_2011-06-01-00:00:00+00:00']
Example #8
def test_search_metadata_find_all():
    d = Datasource(name="test_search")

    d._metadata = {"inlet": "100m", "instrument": "violin", "car": "toyota"}

    result = d.search_metadata(search_terms=["100m", "violin", "toyota"], find_all=True)

    assert result is True

    result = d.search_metadata(search_terms=["100m", "violin", "toyota", "swallow"], find_all=True)

    assert result is False
Example #9
def test_split_daterange_str():
    d = Datasource()

    start_true = pd.Timestamp("2001-01-01-00:00:00", tz="UTC")
    end_true = pd.Timestamp("2001-03-01-00:00:00", tz="UTC")

    daterange_1 = "2001-01-01-00:00:00_2001-03-01-00:00:00"


    start, end = d.split_datrange_str(daterange_str=daterange_1)

    assert start_true == start
    assert end_true == end
Example #10
def assign_data(gas_data, lookup_results, overwrite):
    """ Create or get an existing Datasource for each gas in the file

        Args:
            gas_data (dict): Dictionary containing data and metadata for species
        Returns:
            dict: UUIDs of the Datasources the data has been assigned to, keyed by source name
    """
    from HUGS.Modules import Datasource

    uuids = {}
    # TODO - add in copying of attributes, or add attributes to the metadata at an earlier stage
    for species in gas_data:
        metadata = gas_data[species]["metadata"]
        data = gas_data[species]["data"]
        name = lookup_results[species]["name"]
        uuid = lookup_results[species]["uuid"]

        # If we have a UUID for this Datasource load the existing object
        # from the object store
        if uuid:
            datasource = Datasource.load(uuid=uuid)
        else:
            datasource = Datasource(name=name)

        # Add the dataframe to the datasource
        datasource.add_data(metadata=metadata, data=data, overwrite=overwrite)
        # Save Datasource to object store
        datasource.save()

        uuids[name] = datasource.uuid()

    return uuids
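
The shapes of gas_data and lookup_results are not shown above; a hypothetical call might look like this, where the names and values are purely illustrative:

gas_data = {
    "ch4": {
        "metadata": {"site": "bsd", "species": "ch4", "inlet": "248m"},
        "data": ch4_dataset,  # parsed measurement data for this species
    }
}
# A missing UUID makes assign_data create a new Datasource with the given name
lookup_results = {"ch4": {"name": "bsd.picarro.1minute.248m_ch4", "uuid": None}}

uuids = assign_data(gas_data=gas_data, lookup_results=lookup_results, overwrite=False)
# uuids == {"bsd.picarro.1minute.248m_ch4": "<UUID of the created Datasource>"}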
Example #11
def test_get_dataframe_daterange():
    n_days = 100
    epoch = datetime.datetime(1970, 1, 1, 1, 1)
    random_data = pd.DataFrame(
        data=np.random.randint(0, 100, size=(100, 4)),
        index=pd.date_range(epoch, epoch + datetime.timedelta(n_days - 1), freq="D"),
        columns=list("ABCD"),
    )

    d = Datasource(name="test")

    start, end = d.get_dataframe_daterange(random_data)

    assert start == pd.Timestamp("1970-01-01 01:01:00+0000")
    assert end == pd.Timestamp("1970-04-10 01:01:00+0000")
Example #12
def test_read_CRDS():
    get_local_bucket(empty=True)

    filepath = get_datapath(filename="bsd.picarro.1minute.248m.dat",
                            data_type="CRDS")

    results = ObsSurface.read_file(filepath=filepath, data_type="CRDS")

    keys = results["bsd.picarro.1minute.248m.dat"].keys()

    expected_keys = sorted([
        "bsd.picarro.1minute.248m_ch4",
        "bsd.picarro.1minute.248m_co",
        "bsd.picarro.1minute.248m_co2",
    ])
    assert sorted(keys) == expected_keys

    # Load up the assigned Datasources and check they contain the correct data
    data = results["bsd.picarro.1minute.248m.dat"]

    ch4_data = Datasource.load(
        uuid=data["bsd.picarro.1minute.248m_ch4"]).data()
    ch4_data = ch4_data["2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00"]

    assert ch4_data.time[0] == Timestamp("2014-01-30T10:52:30")
    assert ch4_data["ch4"][0] == 1960.24
    assert ch4_data["ch4"][-1] == 1952.24
    assert ch4_data["ch4_stdev"][-1] == 0.674
    assert ch4_data["ch4_n_meas"][-1] == 25.0

    obs = ObsSurface.load()

    assert sorted(obs._datasource_names.keys()) == expected_keys
Example #13
def test_read_noaa():
    get_local_bucket(empty=True)

    data_filepath = get_datapath(
        filename="co_pocn25_surface-flask_1_ccgg_event.txt", data_type="NOAA")

    results = ObsSurface.read_file(filepath=data_filepath, data_type="NOAA")

    uuid = results["co_pocn25_surface-flask_1_ccgg_event.txt"][
        "co_pocn25_surface-flask_1_ccgg_event_co"]

    co_data = Datasource.load(uuid=uuid, shallow=False).data()

    assert len(co_data.keys()) == 95

    old_data = co_data["1990-12-02-12:23:00+00:00_1990-12-02-12:23:00+00:00"]

    assert old_data.time[0] == Timestamp("1990-12-02T12:23:00")
    assert old_data.time[-1] == Timestamp("1990-12-02T12:23:00")

    assert old_data["co"][0] == 141.61
    assert old_data["co"][-1] == 141.61

    assert old_data["co_repeatability"][0] == -999.99
    assert old_data["co_repeatability"][-1] == -999.99

    assert old_data["co_selection_flag"][0] == 0
    assert old_data["co_selection_flag"][-1] == 0

    obs = ObsSurface.load()

    assert list(obs._datasource_names.keys()
                )[0] == "co_pocn25_surface-flask_1_ccgg_event_co"
Example #14
def test_read_thames_barrier():
    get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="thames_test_20190707.csv",
                                 data_type="THAMESBARRIER")

    results = ObsSurface.read_file(filepath=data_filepath,
                                   data_type="THAMESBARRIER")

    expected_keys = sorted([
        'thames_test_20190707_CH4', 'thames_test_20190707_CO2',
        'thames_test_20190707_CO'
    ])

    assert sorted(list(
        results["thames_test_20190707.csv"].keys())) == expected_keys

    uuid = results["thames_test_20190707.csv"]["thames_test_20190707_CO2"]

    data = Datasource.load(uuid=uuid, shallow=False).data()
    data = data["2019-07-01-00:39:55+00:00_2019-08-01-00:10:30+00:00"]

    assert data.time[0] == Timestamp("2019-07-01T00:39:55")
    assert data.time[-1] == Timestamp("2019-08-01T00:10:30")
    assert data["co2"][0] == pytest.approx(417.97344761)
    assert data["co2"][-1] == pytest.approx(417.80000653)
    assert data["co2_variability"][0] == 0
    assert data["co2_variability"][-1] == 0

    obs = ObsSurface.load()

    assert sorted(obs._datasource_names.keys()) == expected_keys
Example #15
def test_delete_Datasource():
    bucket = get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="tta.co2.1minute.222m.min.dat",
                                 data_type="ICOS")

    ObsSurface.read_file(filepath=data_filepath, data_type="ICOS")

    obs = ObsSurface.load()

    datasources = obs.datasources()

    uuid = datasources[0]

    datasource = Datasource.load(uuid=uuid)

    data = datasource.data(
    )["2011-12-07-01:38:00+00:00_2011-12-31-19:57:00+00:00"]

    assert data["co2"][0] == pytest.approx(397.334)
    assert data.time[0] == Timestamp("2011-12-07T01:38:00")

    data_keys = datasource.data_keys()

    key = data_keys[0]

    assert exists(bucket=bucket, key=key)

    obs.delete(uuid=uuid)

    assert uuid not in obs.datasources()

    assert not exists(bucket=bucket, key=key)
Example #16
def query_store():
    """ Create a dictionary that can be used to visualise the object store 

        Returns:
            dict: Dictionary of data ? 

    """
    from collections import defaultdict
    from HUGS.Modules import Datasource, ObsSurface

    obs = ObsSurface.load()

    datasource_uuids = obs.datasources()
    datasources = (Datasource.load(uuid=uuid, shallow=True)
                   for uuid in datasource_uuids)

    data = defaultdict(dict)

    for d in datasources:
        metadata = d.metadata()
        result = {
            "site": metadata["site"],
            "species": metadata["species"],
            "instrument": metadata.get("instrument", "Unknown"),
            "network": metadata.get("network")
        }
        data[d.uuid()] = result

    return data
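
Hypothetical usage, assuming observations have already been added to the object store via ObsSurface.read_file:

store_contents = query_store()

for uuid, info in store_contents.items():
    print(uuid, info["site"], info["species"], info["instrument"], info["network"])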


Example #17
def test_save_footprint():
    bucket = get_local_bucket(empty=True)

    metadata = {"test": "testing123"}

    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filename = "WAO-20magl_EUROPE_201306_downsampled.nc"
    filepath = os.path.join(dir_path, test_data, filename)

    data = xarray.open_dataset(filepath)

    datasource = Datasource(name="test_name")
    datasource.add_data(metadata=metadata, data=data, data_type="footprint")
    datasource.save()

    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)

    datasource_2 = Datasource.load(bucket=bucket, key=objs[0])

    date_key = "2013-06-02-00:00:00+00:00_2013-06-30-00:00:00+00:00"

    data = datasource_2._data[date_key]

    assert float(data.pressure[0].values) == pytest.approx(1023.971)
    assert float(data.pressure[2].values) == pytest.approx(1009.940)
    assert float(data.pressure[-1].values) == pytest.approx(1021.303)
Example #18
def test_load_dataset():
    filename = "WAO-20magl_EUROPE_201306_small.nc"
    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filepath = os.path.join(dir_path, test_data, filename)

    ds = xarray.load_dataset(filepath)

    metadata = {"some": "metadata"}

    d = Datasource("dataset_test")

    d.add_data(metadata=metadata, data=ds, data_type="footprint")

    d.save()

    keys = d._data_keys["latest"]["keys"]

    key = list(keys.values())[0]

    bucket = get_local_bucket()

    loaded_ds = Datasource.load_dataset(bucket=bucket, key=key)

    assert loaded_ds.equals(ds)
Example #19
def test_search_metadata():
    d = Datasource(name="test_search")

    d._metadata = {"unladen": "swallow", "spam": "eggs"}

    assert d.search_metadata("swallow") == True
    assert d.search_metadata("eggs") == True
    assert d.search_metadata("eggs") == True
    assert d.search_metadata("Swallow") == True

    assert d.search_metadata("beans") == False
    assert d.search_metadata("flamingo") == False
Example #20
def test_in_daterange(data):
    metadata = data["ch4"]["metadata"]
    data = data["ch4"]["data"]

    d = Datasource()
    d.add_data(metadata=metadata, data=data)
    d.save()

    start = pd.Timestamp("2014-1-1")
    end = pd.Timestamp("2014-2-1")

    daterange = create_daterange_str(start=start, end=end)

    d._data_keys["latest"]["2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00"] = ['data/uuid/ace2bb89-7618-4104-9404-a329c2bcd318/v1/2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00']
    d._data_keys["latest"]["2015-01-30-10:52:30+00:00_2016-01-30-14:20:30+00:00"] = ['data/uuid/ace2bb89-7618-4104-9404-a329c2bcd318/v1/2015-01-30-10:52:30+00:00_2016-01-30-14:20:30+00:00']
    d._data_keys["latest"]["2016-01-31-10:52:30+00:00_2017-01-30-14:20:30+00:00"] = ['data/uuid/ace2bb89-7618-4104-9404-a329c2bcd318/v1/2016-01-31-10:52:30+00:00_2017-01-30-14:20:30+00:00']

    keys = d.in_daterange(daterange=daterange)

    assert keys[0].split("/")[-1] == '2014-01-30-10:52:30+00:00_2014-01-30-14:20:30+00:00'
Example #21
def test_exists():
    d = Datasource(name="testing")
    d.save()

    exists = Datasource.exists(datasource_id=d.uuid())

    assert exists is True
Example #22
    def set_rank(self, uuid, rank, daterange):
        """ Set the rank of a Datasource associated with this object.

            This function performs checks to ensure multiple ranks aren't set for
            overlapping dateranges.

            Passing a daterange and rank to this function will overwrite any current 
            daterange stored for that rank.

            Args:
                uuid (str): UUID of Datasource
                rank (int): Rank of data
                daterange (str, list): Daterange(s)
            Returns:
                None
        """
        from HUGS.Modules import Datasource
        from HUGS.Util import daterange_from_str

        if not 0 <= int(rank) <= 10:
            raise TypeError("Rank can only take values 0 (for unranked) to 10. Where 1 is the highest rank.")

        if not isinstance(daterange, list):
            daterange = [daterange]

        try:
            rank_data = self._rank_data[uuid]
            # Check this source isn't ranked differently for the same dates
            for d in daterange:
                # Check we don't have any overlapping dateranges for other ranks
                daterange_obj = daterange_from_str(d)
                # Check the other dateranges for overlapping dates and raise error
                for existing_rank, existing_daterange in rank_data.items():
                    for e in existing_daterange:
                        e = daterange_from_str(e)

                        intersection = daterange_obj.intersection(e)
                        if len(intersection) > 0 and int(existing_rank) != int(rank):
                            raise ValueError(f"This datasource has already got the rank {existing_rank} for dates that overlap the ones given. \
                                                Overlapping dates are {intersection}")
        except KeyError:
            pass

        # Store the rank within the Datasource
        datasource = Datasource.load(uuid=uuid, shallow=True)
        datasource.set_rank(rank=rank, daterange=daterange)
        datasource.save()

        try:
            self._rank_data[uuid][rank].extend(daterange)
        except KeyError:
            self._rank_data[uuid][rank] = daterange
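
Hypothetical usage, assuming this method lives on a class such as ObsSurface (consistent with the other examples) and that the object store already holds a Datasource:

obs = ObsSurface.load()
uuid = obs.datasources()[0]

obs.set_rank(uuid=uuid, rank=1, daterange="2016-01-01-00:00:00_2017-01-01-00:00:00")

# Giving the same source a different rank over an overlapping period raises a ValueError
obs.set_rank(uuid=uuid, rank=2, daterange="2016-06-01-00:00:00_2018-01-01-00:00:00")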
Example #23
def get_data(key_list):
    """ Get data from the Datasources found by the search function

        Args:
            key_list (list): List of Datasource keys
        Returns:
            list: List of data from the Datasources
    """
    from HUGS.Modules import Datasource

    # TODO - we may only want the data in the correct daterange here, and could
    # bypass loading the full Datasource or do some preprocessing on the raw
    # data as it comes out of the object store
    return [Datasource.load(key=key)._data for key in key_list]
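
Hypothetical usage, assuming key_list holds object store keys returned by the search function (the key below is illustrative):

data_list = get_data(key_list=["datasource/uuid/ace2bb89-7618-4104-9404-a329c2bcd318"])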
Example #24
def test_to_data(data):
    d = Datasource(name="testing_123")

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    assert ch4_data["ch4"][0] == pytest.approx(1960.24)
    assert ch4_data["ch4 stdev"][0] == pytest.approx(0.236)
    assert ch4_data["ch4 n_meas"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")

    obj_data = d.to_data()

    metadata = obj_data["metadata"]
    assert obj_data["name"] == "testing_123"
    assert metadata["site"] == "bsd"
    assert metadata["instrument"] == "picarro"
    assert metadata["time_resolution"] == "1_minute"
    assert metadata["inlet"] == "248m"
    assert obj_data["data_type"] == "timeseries"
    assert len(obj_data["data_keys"]) == 0
Example #25
def test_save(mock_uuid2):
    bucket = get_local_bucket()

    datasource = Datasource(name="test_name")
    datasource.add_metadata(key="data_type", value="timeseries")
    datasource.save(bucket)

    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"

    objs = get_object_names(bucket, prefix)

    assert objs[0].split("/")[-1] == mocked_uuid2
Example #26
def get_sources(args):
    """ Get the Datasources associated with the specified species at a specified site

        Args:
            args (dict): Dictionary containing site and species keys
        Returns:
            dict: Dictionary of unranked Datasource information keyed by source name
    """
    try:
        site = args["site"]
    except KeyError:
        # TODO - create a SiteError type to raise here
        raise KeyError("Site must be specified")

    try:
        species = args["species"]
    except KeyError:
        raise KeyError("Species must be specified")

    obs = ObsSurface.load()

    datasource_uuids = obs.datasources()
    # Shallow load the Datasources (only get their JSON metadata)
    datasources = [
        Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids
    ]

    matching_sources = [
        d for d in datasources
        if d.search_metadata(search_terms=[site, species], find_all=True)
    ]

    def name_str(d):
        return "_".join([d.species(), d.site(), d.inlet(), d.instrument()])

    unranked = {
        name_str(d): {
            "rank": d.rank(),
            "data_range": d.daterange_str(),
            "uuid": d.uuid()
        }
        for d in matching_sources
    }

    return unranked
Example #27
    def get_sources(self, site, species, data_type):
        """ Get the datasources for this site and species to allow a ranking to be set

            Args:
                site (str): Three letter site code
                species (str): Species name
                data_type (str): Must be a valid data type, e.g. CRDS, GC.
                    See the DataTypes class for all valid data types
            Returns:
                dict: Dictionary of datasource metadata
        """
        if len(site) != 3 or not valid_site(site):
            # raise InvalidSiteError(f"{site} is not a valid site code")
            raise ValueError(f"{site} is not a valid site code")

        obs = ObsSurface.load()
        datasource_uuids = obs.datasources()

        # Shallow load the Datasources (only get their JSON metadata)
        datasources = [
            Datasource.load(uuid=uuid, shallow=True)
            for uuid in datasource_uuids
        ]

        matching_sources = [
            d for d in datasources
            if d.search_metadata(search_terms=[site, species], find_all=True)
        ]

        def name_str(d):
            return "_".join([d.species(), d.site(), d.inlet(), d.instrument()])

        rank_info = {
            name_str(d): {
                "rank": d.rank(),
                "data_range": d.daterange_str(),
                "uuid": d.uuid()
            }
            for d in matching_sources
        }

        self._before_ranking = copy.deepcopy(rank_info)
        self._key_uuids = {key: rank_info[key]["uuid"] for key in rank_info}

        return rank_info
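
A hypothetical call, assuming an instance of the surrounding class (ranker below is illustrative) and that matching data has been uploaded; the returned dictionary follows the rank_info structure built above:

rank_info = ranker.get_sources(site="bsd", species="ch4", data_type="CRDS")
# e.g. {"ch4_bsd_248m_picarro": {"rank": 0, "data_range": "...", "uuid": "..."}}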
Example #28
def test_setting_overlapping_dateranges():
    d = Datasource()    

    daterange = "2027-08-01-00:00:00_2027-12-01-00:00:00"

    d.set_rank(rank=1, daterange=daterange)
    
    assert d._rank[1] == ['2027-08-01-00:00:00_2027-12-01-00:00:00']

    daterange_two = "2027-11-01-00:00:00_2028-06-01-00:00:00"

    d.set_rank(rank=1, daterange=daterange_two)
    
    assert d._rank[1] == ['2027-08-01-00:00:00+00:00_2028-06-01-00:00:00+00:00']
Example #29
    def assign_data(self,
                    lookup_results,
                    source_name,
                    data,
                    metadata,
                    overwrite=False):
        """ Assign data to a new or existing Datasource

            Args:
                lookup_results (dict): Results of Datasource lookup
                source_name (str): Name of data source
                data (xarray.Dataset): Data
                metadata (dict): Dictionary of metadata
                overwrite (bool, default=False): Should existing data be overwritten
            Returns:
                dict: Dictionary of Datasource UUIDs keyed by name
        """
        from HUGS.Modules import Datasource

        uuids = {}
        for key in lookup_results:
            uuid = lookup_results[key]["uuid"]
            name = metadata["name"]

            if uuid:
                datasource = Datasource.load(uuid=uuid)
            else:
                datasource = Datasource(name=name)

            datasource.add_data(metadata=metadata,
                                data=data,
                                data_type="footprint")
            datasource.save()

            uuids[name] = datasource.uuid()

        return uuids
Example #30
def test_update_daterange_replacement(data):
    metadata = {"foo": "bar"}

    d = Datasource(name="foo")

    ch4_data = data["ch4"]["data"]

    d.add_data(metadata=metadata, data=ch4_data)

    assert d._start_datetime == pd.Timestamp("2014-01-30 10:52:30+00:00")
    assert d._end_datetime == pd.Timestamp("2014-01-30 14:20:30+00:00")

    ch4_short = ch4_data.head(40)

    d._data = None

    d.add_data(metadata=metadata, data=ch4_short, overwrite=True)

    assert d._start_datetime == pd.Timestamp("2014-01-30 10:52:30+00:00")
    assert d._end_datetime == pd.Timestamp("2014-01-30 13:22:30+00:00")