def test_incorrect_datatype_raises(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    with pytest.raises(TypeError):
        d.add_data(metadata=metadata, data=ch4_data, data_type="CRDS")

def test_in_daterange(data):
    metadata = data["ch4"]["metadata"]
    data = data["ch4"]["data"]

    d = Datasource()
    d._uuid = "test-id-123"
    d.add_data(metadata=metadata, data=data, data_type="timeseries")
    d.save()

    expected_keys = [
        "data/uuid/test-id-123/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "data/uuid/test-id-123/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "data/uuid/test-id-123/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "data/uuid/test-id-123/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
        "data/uuid/test-id-123/v1/2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00",
        "data/uuid/test-id-123/v1/2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00",
        "data/uuid/test-id-123/v1/2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00",
    ]

    assert d.data_keys() == expected_keys

    start = pd.Timestamp("2014-1-1")
    end = pd.Timestamp("2014-2-1")

    daterange = create_daterange_str(start=start, end=end)
    dated_keys = d.keys_in_daterange_str(daterange=daterange)

    assert dated_keys[0].split("/")[-1] == "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00"

def test_key_date_compare():
    d = Datasource()

    keys = {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00": "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00": "data/uuid/test-uid/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00": "data/uuid/test-uid/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00": "data/uuid/test-uid/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
        "2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00": "data/uuid/test-uid/v1/2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00",
        "2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00": "data/uuid/test-uid/v1/2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00",
        "2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00": "data/uuid/test-uid/v1/2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00",
    }

    start = timestamp_tzaware("2014-01-01")
    end = timestamp_tzaware("2018-01-01")

    in_date = d.key_date_compare(keys=keys, start_date=start, end_date=end)

    expected = [
        "data/uuid/test-uid/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "data/uuid/test-uid/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "data/uuid/test-uid/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "data/uuid/test-uid/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
    ]

    assert in_date == expected

    # No keys should fall within a daterange far in the future
    start = timestamp_tzaware("2053-01-01")
    end = timestamp_tzaware("2092-01-01")

    in_date = d.key_date_compare(keys=keys, start_date=start, end_date=end)

    assert not in_date

    # A malformed key containing three timestamps should raise
    error_key = {"2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00_2014-11-30-11:23:30+00:00": "broken"}

    with pytest.raises(ValueError):
        in_date = d.key_date_compare(keys=error_key, start_date=start, end_date=end)

def delete(self, uuid: str) -> None:
    """Delete a Datasource with the given UUID

    This deletes both the data keys stored by the Datasource and the record
    of the Datasource itself in the object store.

    Args:
        uuid (str): UUID of Datasource
    Returns:
        None
    """
    from openghg.objectstore import delete_object, get_bucket
    from openghg.store.base import Datasource

    bucket = get_bucket()

    # Load the Datasource and get all its keys,
    # then iterate over these keys and delete them
    datasource = Datasource.load(uuid=uuid)

    data_keys = datasource.raw_keys()

    for version in data_keys:
        key_data = data_keys[version]["keys"]

        for daterange in key_data:
            key = key_data[daterange]
            delete_object(bucket=bucket, key=key)

    # Then delete the Datasource itself
    key = f"{Datasource._datasource_root}/uuid/{uuid}"
    delete_object(bucket=bucket, key=key)

    del self._datasource_uuids[uuid]

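# For reference, a minimal sketch (illustrative values, not from the source) of
# the nested structure delete() assumes raw_keys() returns: a mapping of
# version -> "keys" -> daterange string -> object store key.
#
#     {
#         "v1": {
#             "keys": {
#                 "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00":
#                     "data/uuid/test-id-123/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
#             },
#         },
#     }
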
def _get_sources_local(self, site: str, species: str) -> Dict:
    site = verify_site(site=site)

    # Save these
    self.site = site
    self.species = species

    obs = ObsSurface.load()
    datasource_uuids = obs.datasources()
    rank_table = obs.rank_data()

    # Shallow load the Datasources (only get their JSON metadata)
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    matching_sources = [d for d in datasources if d.search_metadata(site=site, species=species)]

    if not matching_sources:
        return {}

    self._user_info = {
        d.inlet(): {
            "rank_data": rank_table.get(d.uuid(), "NA"),
            "data_range": d.daterange_str(),
        }
        for d in matching_sources
    }

    self._key_lookup = {d.inlet(): d.uuid() for d in matching_sources}
    self._needs_update = False

    return self._user_info

def get_sources(args: Dict) -> Dict:
    obs = ObsSurface.load()
    datasource_uuids = obs.datasources()
    rank_table = obs.rank_data()

    site = args["site"]
    species = args["species"]

    # Shallow load the Datasources (only get their JSON metadata)
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    matching_sources = [d for d in datasources if d.search_metadata(site=site, species=species)]

    if not matching_sources:
        return {}

    user_info = {
        d.inlet(): {
            "rank_data": rank_table.get(d.uuid(), "NA"),
            "data_range": d.daterange_str(),
        }
        for d in matching_sources
    }

    key_lookup = {d.inlet(): d.uuid() for d in matching_sources}

    return {"user_info": user_info, "key_lookup": key_lookup}

def test_read_noaa_obspack():
    data_filepath = get_datapath(filename="ch4_esp_surface-flask_2_representative.nc", data_type="NOAA")

    results = ObsSurface.read_file(
        filepath=data_filepath,
        inlet="flask",
        data_type="NOAA",
        site="esp",
        network="NOAA",
        overwrite=True,
    )

    uuid = results["processed"]["ch4_esp_surface-flask_2_representative.nc"]["ch4"]

    ch4_data = Datasource.load(uuid=uuid, shallow=False).data()

    assert sorted(list(ch4_data.keys())) == [
        "1993-06-17-00:12:30+00:00_1993-11-20-21:50:00+00:00",
        "1994-01-02-22:10:00+00:00_1994-12-24-22:15:00+00:00",
        "1995-02-06-12:00:00+00:00_1995-11-08-19:55:00+00:00",
        "1996-01-21-22:10:00+00:00_1996-12-01-20:00:00+00:00",
        "1997-02-12-19:00:00+00:00_1997-12-20-20:15:00+00:00",
        "1998-01-01-23:10:00+00:00_1998-12-31-19:50:00+00:00",
        "1999-01-14-22:15:00+00:00_1999-12-31-23:35:00+00:00",
        "2000-03-05-00:00:00+00:00_2000-11-04-22:30:00+00:00",
        "2001-01-05-21:45:00+00:00_2001-12-06-12:00:00+00:00",
        "2002-01-12-12:00:00+00:00_2002-01-12-12:00:00+00:00",
    ]

    data = ch4_data["1998-01-01-23:10:00+00:00_1998-12-31-19:50:00+00:00"]

    assert data.time[0] == Timestamp("1998-01-01T23:10:00")
    assert data["ch4"][0] == pytest.approx(1.83337e-06)
    assert data["ch4_number_of_observations"][0] == 2.0
    assert data["ch4_variability"][0] == pytest.approx(2.093036e-09)

def test_read_cranfield():
    get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="THB_hourly_means_test.csv", data_type="Cranfield_CRDS")

    results = ObsSurface.read_file(filepath=data_filepath, data_type="CRANFIELD", site="THB", network="CRANFIELD")

    expected_keys = ["ch4", "co", "co2"]

    assert sorted(results["processed"]["THB_hourly_means_test.csv"].keys()) == expected_keys

    uuid = results["processed"]["THB_hourly_means_test.csv"]["ch4"]

    ch4_data = Datasource.load(uuid=uuid, shallow=False).data()
    ch4_data = ch4_data["2018-05-05-00:00:00+00:00_2018-05-13-16:00:00+00:00"]

    assert ch4_data.time[0] == Timestamp("2018-05-05")
    assert ch4_data.time[-1] == Timestamp("2018-05-13T16:00:00")
    assert ch4_data["ch4"][0] == pytest.approx(2585.651)
    assert ch4_data["ch4"][-1] == pytest.approx(1999.018)
    assert ch4_data["ch4 variability"][0] == pytest.approx(75.50218)
    assert ch4_data["ch4 variability"][-1] == pytest.approx(6.48413)

def test_read_thames_barrier():
    get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="thames_test_20190707.csv", data_type="THAMESBARRIER")

    results = ObsSurface.read_file(
        filepath=data_filepath,
        data_type="THAMESBARRIER",
        site="TMB",
        network="LGHG",
        sampling_period=3600,
    )

    expected_keys = sorted(["CH4", "CO2", "CO"])

    assert sorted(list(results["processed"]["thames_test_20190707.csv"].keys())) == expected_keys

    uuid = results["processed"]["thames_test_20190707.csv"]["CO2"]

    data = Datasource.load(uuid=uuid, shallow=False).data()
    data = data["2019-07-01-00:39:55+00:00_2019-08-01-00:10:30+00:00"]

    assert data.time[0] == Timestamp("2019-07-01T00:39:55")
    assert data.time[-1] == Timestamp("2019-08-01T00:10:30")
    assert data["co2"][0] == pytest.approx(417.97344761)
    assert data["co2"][-1] == pytest.approx(417.80000653)
    assert data["co2_variability"][0] == 0
    assert data["co2_variability"][-1] == 0

    obs = ObsSurface.load()

    assert sorted(obs._datasource_uuids.values()) == expected_keys

def test_delete_Datasource():
    bucket = get_local_bucket(empty=True)

    data_filepath = get_datapath(filename="thames_test_20190707.csv", data_type="THAMESBARRIER")

    ObsSurface.read_file(
        filepath=data_filepath,
        data_type="THAMESBARRIER",
        site="tmb",
        network="LGHG",
        sampling_period=60,
    )

    obs = ObsSurface.load()

    datasources = obs.datasources()
    uuid = datasources[0]

    datasource = Datasource.load(uuid=uuid)

    data_keys = datasource.data_keys()
    key = data_keys[0]

    assert exists(bucket=bucket, key=key)

    obs.delete(uuid=uuid)

    assert uuid not in obs.datasources()
    assert not exists(bucket=bucket, key=key)

def search(
    self,
    site: str,
    network: str,
    start_date: Union[str, Timestamp],
    end_date: Union[str, Timestamp],
) -> Union[METData, None]:
    """Search the stored MET data

    Args:
        site: Site code
        network: Network name
        start_date: Start date
        end_date: End date
    Returns:
        METData or None: METData object if found, else None
    """
    from openghg.store.base import Datasource
    from openghg.dataobjects import METData

    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in self._datasource_uuids)

    # We should only get one datasource here currently
    for datasource in datasources:
        if datasource.search_metadata(site=site, network=network, find_all=True):
            if datasource.in_daterange(start_date=start_date, end_date=end_date):
                data = next(iter(datasource.data().values()))
                return METData(data=data, metadata=datasource.metadata())

    return None

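# A minimal usage sketch for search(), assuming a populated MET store object
# named met_store (the object name and the site/network values here are
# invented for illustration):
#
#     met = met_store.search(site="CGO", network="AGAGE",
#                            start_date="2021-01-01", end_date="2021-02-01")
#     if met is not None:
#         print(met.metadata)
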
def query_store() -> Dict:
    """Create a dictionary that can be used to visualise the object store

    Returns:
        dict: Dictionary of data to be shown in the force graph
    """
    from openghg.store.base import Datasource
    from openghg.store import ObsSurface

    obs = ObsSurface.load()

    datasource_uuids = obs.datasources()
    datasources = (Datasource.load(uuid=uuid, shallow=True) for uuid in datasource_uuids)

    data = {}

    for d in datasources:
        metadata = d.metadata()
        result = {
            "site": metadata["site"],
            "species": metadata["species"],
            "instrument": metadata.get("instrument", "Unknown"),
            "network": metadata.get("network", "Unknown"),
            "inlet": metadata.get("inlet", "Unknown"),
        }
        data[d.uuid()] = result

    return data

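# The mapping returned by query_store() is keyed by Datasource UUID; a sketch
# of its shape, with invented values:
#
#     {
#         "0560cbbe-...": {
#             "site": "bsd",
#             "species": "ch4",
#             "instrument": "picarro",
#             "network": "decc",
#             "inlet": "248m",
#         },
#     }
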
def test_to_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")

    obj_data = d.to_data()

    metadata = obj_data["metadata"]

    assert metadata["site"] == "bsd"
    assert metadata["instrument"] == "picarro"
    assert metadata["sampling_period"] == "60"
    assert metadata["inlet"] == "248m"
    assert metadata["data_type"] == "timeseries"
    assert len(obj_data["data_keys"]) == 0

def test_get_dataframe_daterange():
    n_days = 100
    epoch = datetime.datetime(1970, 1, 1, 1, 1)

    random_data = pd.DataFrame(
        data=np.random.randint(0, 100, size=(100, 4)),
        index=pd.date_range(epoch, epoch + datetime.timedelta(n_days - 1), freq="D"),
        columns=list("ABCD"),
    )

    d = Datasource()

    start, end = d.get_dataframe_daterange(random_data)

    assert start == pd.Timestamp("1970-01-01 01:01:00+0000")
    assert end == pd.Timestamp("1970-04-10 01:01:00+0000")

def test_load_dataset():
    filename = "WAO-20magl_EUROPE_201306_small.nc"
    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filepath = os.path.join(dir_path, test_data, filename)

    ds = xr.load_dataset(filepath)

    metadata = {"some": "metadata"}

    d = Datasource()
    d.add_data(metadata=metadata, data=ds, data_type="footprints")
    d.save()

    keys = d._data_keys["latest"]["keys"]
    key = list(keys.values())[0]

    bucket = get_local_bucket()

    loaded_ds = Datasource.load_dataset(bucket=bucket, key=key)

    assert loaded_ds.equals(ds)

def test_save_footprint():
    bucket = get_local_bucket(empty=True)

    metadata = {"test": "testing123"}

    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filename = "WAO-20magl_EUROPE_201306_downsampled.nc"
    filepath = os.path.join(dir_path, test_data, filename)

    data = xr.open_dataset(filepath)

    datasource = Datasource()
    datasource.add_data(data=data, metadata=metadata, data_type="footprints")
    datasource.save()

    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)

    datasource_2 = Datasource.load(bucket=bucket, key=objs[0])

    date_key = "2013-06-02-00:00:00+00:00_2013-06-30-00:00:00+00:00"
    data = datasource_2._data[date_key]

    assert float(data.pressure[0].values) == pytest.approx(1023.971)
    assert float(data.pressure[2].values) == pytest.approx(1009.940)
    assert float(data.pressure[-1].values) == pytest.approx(1021.303)
    assert datasource_2._data_type == "footprints"

def _store(self, met_data: METData) -> None:
    """Store MET data within a Datasource

    Here we do some processing of the request JSON to make the metadata
    more easily searchable and of a similar format to the Datasources used
    in other modules of OpenGHG.

    Args:
        met_data: Dataset
    Returns:
        None
    """
    from openghg.store.base import Datasource

    metadata = met_data.metadata

    datasource = Datasource()
    datasource.add_data(metadata=metadata, data=met_data.data, data_type="met")
    datasource.save()

    date_str = f"{metadata['start_date']}_{metadata['end_date']}"
    name = "_".join((metadata["site"], metadata["network"], date_str))

    self._datasource_uuids[datasource.uuid()] = name

    # Write this updated object back to the object store
    self.save()

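# The name recorded against each Datasource UUID follows the pattern
# "{site}_{network}_{start_date}_{end_date}", e.g. (values invented for
# illustration):
#
#     "heathrow_ecmwf_2021-01-01_2021-01-31"
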
def test_search_metadata_find_all():
    d = Datasource()

    d._metadata = {"inlet": "100m", "instrument": "violin", "car": "toyota"}

    result = d.search_metadata(inlet="100m", instrument="violin", car="toyota", find_all=True)

    assert result is True

    result = d.search_metadata(inlet="100m", instrument="violin", car="subaru", find_all=True)

    assert result is False

def test_add_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    assert ch4_data["ch4"][0] == pytest.approx(1959.55)
    assert ch4_data["ch4_variability"][0] == pytest.approx(0.79)
    assert ch4_data["ch4_number_of_observations"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")
    d.save()

    bucket = get_local_bucket()

    data_chunks = [Datasource.load_dataset(bucket=bucket, key=k) for k in d.data_keys()]

    # Now read it out and make sure it's what we expect
    combined = xr.concat(data_chunks, dim="time")

    assert combined.equals(ch4_data)

    expected_metadata = {
        "site": "bsd",
        "instrument": "picarro",
        "sampling_period": "60",
        "inlet": "248m",
        "port": "9",
        "type": "air",
        "network": "decc",
        "species": "ch4",
        "scale": "wmo-x2004a",
        "long_name": "bilsdale",
        "data_owner": "simon o'doherty",
        "data_owner_email": "*****@*****.**",
        "inlet_height_magl": "248m",
        "comment": "cavity ring-down measurements. output from gcwerks",
        "source": "in situ measurements of air",
        "conventions": "cf-1.6",
        "calibration_scale": "wmo-x2004a",
        "station_longitude": -1.15033,
        "station_latitude": 54.35858,
        "station_long_name": "bilsdale, uk",
        "station_height_masl": 380.0,
        "data_type": "timeseries",
    }

    assert d.metadata() == expected_metadata

def assign_data(
    data_dict: Dict,
    lookup_results: Dict,
    overwrite: bool,
    data_type: str = "timeseries",
) -> Dict[str, str]:
    """Assign data to a Datasource. This will either create a new Datasource
    or get an existing one for each gas in the file.

    Args:
        data_dict: Dictionary containing data and metadata for species
        lookup_results: Dictionary of lookup results
        overwrite: If True overwrite currently stored data
        data_type: Type of data being assigned
    Returns:
        dict: Dictionary of UUIDs of Datasources data has been assigned to, keyed by species name
    """
    from openghg.store.base import Datasource

    uuids = {}

    for key in data_dict:
        metadata = data_dict[key]["metadata"]
        data = data_dict[key]["data"]

        # Our lookup results and gas data have the same keys
        uuid = lookup_results[key]

        # TODO - Could this be done somewhere else? It doesn't feel quite right it being here
        # Add the read metadata to the Dataset attributes, being careful
        # not to overwrite any attributes that are already there
        to_add = {k: v for k, v in metadata.items() if k not in data.attrs}
        data.attrs.update(to_add)

        # If we have a UUID for this Datasource load the existing object
        # from the object store, otherwise create a new one
        if uuid is False:
            datasource = Datasource()
        else:
            datasource = Datasource.load(uuid=uuid)

        # Add the dataset to the Datasource
        datasource.add_data(metadata=metadata, data=data, overwrite=overwrite, data_type=data_type)

        # Save Datasource to object store
        datasource.save()

        uuids[key] = datasource.uuid()

    return uuids

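# A minimal usage sketch for assign_data(), based on the fixtures used in the
# tests above (a lookup value of False signals that no existing Datasource was
# found, so a new one is created):
#
#     uuids = assign_data(
#         data_dict={"ch4": data["ch4"]},
#         lookup_results={"ch4": False},
#         overwrite=False,
#     )
#     datasource = Datasource.load(uuid=uuids["ch4"])
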
def test_search_metadata_finds_recursively():
    d = Datasource()

    d._metadata = {"car": "toyota", "inlets": {"inlet_a": "45m", "inlet_b": "3580m"}}

    result = d.search_metadata(search_terms=["45m", "3580m", "toyota"], find_all=True)

    assert result is True

    result = d.search_metadata(search_terms=["100m", "violin", "toyota", "swallow"], find_all=True)

    assert result is False

    result = d.search_metadata(search_terms=["100m", "violin", "toyota", "swallow"], find_all=False)

    assert result is True

def test_exists():
    d = Datasource()
    d.save()

    exists = Datasource.exists(datasource_id=d.uuid())

    assert exists is True

def test_search_metadata():
    d = Datasource()

    d._metadata = {"unladen": "swallow", "spam": "eggs"}

    assert d.search_metadata(unladen="swallow")
    assert d.search_metadata(spam="eggs")
    # Matching should be case insensitive
    assert d.search_metadata(unladen="Swallow")

    assert not d.search_metadata(giraffe="beans")
    assert not d.search_metadata(bird="flamingo")

def test_from_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")
    d.save()

    obj_data = d.to_data()

    bucket = get_local_bucket()

    # Create a new object with the data from d
    d_2 = Datasource.from_data(bucket=bucket, data=obj_data, shallow=False)

    metadata = d_2.metadata()

    assert metadata["site"] == "bsd"
    assert metadata["instrument"] == "picarro"
    assert metadata["sampling_period"] == "60"
    assert metadata["inlet"] == "248m"

    assert sorted(d_2.data_keys()) == sorted(d.data_keys())
    assert d_2.metadata() == d.metadata()

def test_save(mock_uuid2):
    bucket = get_local_bucket()

    datasource = Datasource()
    datasource.add_metadata_key(key="data_type", value="timeseries")
    datasource.save(bucket)

    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)

    assert objs[0].split("/")[-1] == mocked_uuid2

def test_shallow_then_load_data(data):
    metadata = data["ch4"]["metadata"]
    data = data["ch4"]["data"]

    d = Datasource()
    d.add_data(metadata=metadata, data=data, data_type="timeseries")
    d.save()

    new_d = Datasource.load(uuid=d.uuid(), shallow=True)

    assert not new_d._data

    ds_data = new_d.data()

    assert ds_data

    ch4_data = ds_data["2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00"]

    assert ch4_data.time[0] == pd.Timestamp("2014-01-30-11:12:30")

def test_read_beaco2n():
    data_filepath = get_datapath(filename="Charlton_Community_Center.csv", data_type="BEACO2N")

    results = ObsSurface.read_file(
        filepath=data_filepath,
        data_type="BEACO2N",
        site="CCC",
        network="BEACO2N",
        overwrite=True,
    )

    uuid = results["processed"]["Charlton_Community_Center.csv"]["co2"]

    co2_data = Datasource.load(uuid=uuid, shallow=False).data()
    co2_data = co2_data["2015-04-18-04:00:00+00:00_2015-04-18-10:00:00+00:00"]

    assert co2_data.time[0] == Timestamp("2015-04-18T04:00:00")
    assert co2_data["co2"][0] == 410.4
    assert co2_data["co2_qc"][0] == 2

def test_read_multisite_aqmesh():
    datafile = get_datapath(filename="co2_data.csv", data_type="AQMESH")
    metafile = get_datapath(filename="co2_metadata.csv", data_type="AQMESH")

    result = ObsSurface.read_multisite_aqmesh(data_filepath=datafile, metadata_filepath=metafile, overwrite=True)

    # This crazy structure will be fixed when add_datasources is updated
    raith_uuid = result["raith"]["raith"]

    d = Datasource.load(uuid=raith_uuid, shallow=False)

    data = d.data()["2021-06-18-05:00:00+00:00_2021-06-21-13:00:00+00:00"]

    assert data.time[0] == Timestamp("2021-06-18T05:00:00")
    assert data.co2[0] == 442.64
    assert data.time[-1] == Timestamp("2021-06-21T13:00:00")
    assert data.co2[-1] == 404.84

    expected_attrs = {
        "site": "raith",
        "pod_id": 39245,
        "start_date": "2021-06-15 01:00:00",
        "end_date": "2021-10-04 00:59:00",
        "relocate_date": "NA",
        "long_name": "Raith",
        "borough": "Glasgow",
        "site_type": "Roadside",
        "in_ulez": "No",
        "latitude": 55.798813,
        "longitude": -4.058363,
        "inlet": 1,
        "network": "aqmesh_glasgow",
        "sampling_period": "NOT_SET",
        "species": "co2",
        "units": "ppm",
    }

    assert data.attrs == expected_attrs

def test_update_daterange_replacement(data):
    metadata = {"foo": "bar"}

    d = Datasource()

    ch4_data = data["ch4"]["data"]

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")

    assert d._start_date == pd.Timestamp("2014-01-30 11:12:30+00:00")
    assert d._end_date == pd.Timestamp("2020-12-01 22:31:30+00:00")

    ch4_short = ch4_data.head(40)

    d._data = None
    d.add_data(metadata=metadata, data=ch4_short, data_type="timeseries")

    assert d._start_date == pd.Timestamp("2014-01-30 11:12:30+00:00")
    assert d._end_date == pd.Timestamp("2016-04-02 06:55:30+00:00")

def test_dated_metadata_search():
    d = Datasource()

    start = pd.Timestamp("2001-01-01-00:00:00", tz="UTC")
    end = pd.Timestamp("2001-03-01-00:00:00", tz="UTC")

    d._start_date = start
    d._end_date = end

    d._metadata = {"inlet": "100m", "instrument": "violin", "site": "timbuktu"}

    assert d.search_metadata(inlet="100m", instrument="violin")

    # A daterange that doesn't overlap the Datasource's data shouldn't match
    assert not d.search_metadata(
        search_terms=["100m", "violin"],
        start_date=pd.Timestamp("2015-01-01"),
        end_date=pd.Timestamp("2021-01-01"),
    )

    assert d.search_metadata(
        inlet="100m",
        instrument="violin",
        start_date=pd.Timestamp("2001-01-01"),
        end_date=pd.Timestamp("2002-01-01"),
    )