def _store(self, met_data: METData) -> None:
    """Store MET data within a Datasource

    Here we do some processing of the request JSON to make the metadata
    more easily searchable and of a similar format to Datasources used
    in other modules of OpenGHG.

    Args:
        met_data: Dataset
    Returns:
        None
    """
    from openghg.store.base import Datasource

    metadata = met_data.metadata

    datasource = Datasource()
    datasource.add_data(metadata=metadata, data=met_data.data, data_type="met")
    datasource.save()

    # Key the Datasource UUID by site, network and date range
    date_str = f"{metadata['start_date']}_{metadata['end_date']}"
    name = "_".join((metadata["site"], metadata["network"], date_str))

    self._datasource_uuids[datasource.uuid()] = name

    # Write this updated object back to the object store
    self.save()
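# A minimal sketch of how the lookup name above is built, using hypothetical
# metadata values (the site, network and dates are assumptions for
# illustration, not taken from a real request):
example_metadata = {
    "site": "heathrow",
    "network": "ecmwf",
    "start_date": "2021-01-01",
    "end_date": "2021-01-31",
}
date_str = f"{example_metadata['start_date']}_{example_metadata['end_date']}"
name = "_".join((example_metadata["site"], example_metadata["network"], date_str))
assert name == "heathrow_ecmwf_2021-01-01_2021-01-31"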
def test_load_dataset():
    filename = "WAO-20magl_EUROPE_201306_small.nc"
    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filepath = os.path.join(dir_path, test_data, filename)

    ds = xr.load_dataset(filepath)

    metadata = {"some": "metadata"}

    d = Datasource()
    d.add_data(metadata=metadata, data=ds, data_type="footprints")
    d.save()

    keys = d._data_keys["latest"]["keys"]
    key = list(keys.values())[0]

    bucket = get_local_bucket()

    loaded_ds = Datasource.load_dataset(bucket=bucket, key=key)

    assert loaded_ds.equals(ds)
def test_in_daterange(data):
    metadata = data["ch4"]["metadata"]
    data = data["ch4"]["data"]

    d = Datasource()
    d._uuid = "test-id-123"
    d.add_data(metadata=metadata, data=data, data_type="timeseries")
    d.save()

    expected_keys = [
        "data/uuid/test-id-123/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "data/uuid/test-id-123/v1/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "data/uuid/test-id-123/v1/2016-04-02-06:52:30+00:00_2016-11-02-12:54:30+00:00",
        "data/uuid/test-id-123/v1/2017-02-18-06:36:30+00:00_2017-12-18-15:41:30+00:00",
        "data/uuid/test-id-123/v1/2018-02-18-15:42:30+00:00_2018-12-18-15:42:30+00:00",
        "data/uuid/test-id-123/v1/2019-02-03-17:38:30+00:00_2019-12-09-10:47:30+00:00",
        "data/uuid/test-id-123/v1/2020-02-01-18:08:30+00:00_2020-12-01-22:31:30+00:00",
    ]

    assert d.data_keys() == expected_keys

    start = pd.Timestamp("2014-1-1")
    end = pd.Timestamp("2014-2-1")

    daterange = create_daterange_str(start=start, end=end)

    dated_keys = d.keys_in_daterange_str(daterange=daterange)

    assert dated_keys[0].split("/")[-1] == "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00"
def test_exists():
    d = Datasource()
    d.save()

    exists = Datasource.exists(datasource_id=d.uuid())

    assert exists is True
def test_save_footprint():
    bucket = get_local_bucket(empty=True)

    metadata = {"test": "testing123"}

    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filename = "WAO-20magl_EUROPE_201306_downsampled.nc"
    filepath = os.path.join(dir_path, test_data, filename)

    data = xr.open_dataset(filepath)

    datasource = Datasource()
    datasource.add_data(data=data, metadata=metadata, data_type="footprints")
    datasource.save()

    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)

    datasource_2 = Datasource.load(bucket=bucket, key=objs[0])

    date_key = "2013-06-02-00:00:00+00:00_2013-06-30-00:00:00+00:00"

    data = datasource_2._data[date_key]

    assert float(data.pressure[0].values) == pytest.approx(1023.971)
    assert float(data.pressure[2].values) == pytest.approx(1009.940)
    assert float(data.pressure[-1].values) == pytest.approx(1021.303)

    assert datasource_2._data_type == "footprints"
def assign_data(
    data_dict: Dict,
    lookup_results: Dict,
    overwrite: bool,
    data_type: str = "timeseries",
) -> Dict[str, str]:
    """Assign data to a Datasource. This will either create a new
    Datasource or load an existing one for each gas in the file.

    Args:
        data_dict: Dictionary containing data and metadata for species
        lookup_results: Dictionary of lookup results
        overwrite: If True overwrite currently stored data
        data_type: Type of data, defaults to timeseries
    Returns:
        dict: Dictionary of UUIDs of Datasources data has been assigned to,
        keyed by species name
    """
    from openghg.store.base import Datasource

    uuids = {}
    for key in data_dict:
        metadata = data_dict[key]["metadata"]
        data = data_dict[key]["data"]

        # Our lookup results and gas data have the same keys
        uuid = lookup_results[key]

        # TODO - Could this be done somewhere else? It doesn't feel
        # quite right it being here
        # Add the read metadata to the Dataset attributes, being careful
        # not to overwrite any attributes that are already there
        to_add = {k: v for k, v in metadata.items() if k not in data.attrs}
        data.attrs.update(to_add)

        # If we have a UUID for this Datasource load the existing object
        # from the object store, otherwise create a new Datasource
        if uuid is False:
            datasource = Datasource()
        else:
            datasource = Datasource.load(uuid=uuid)

        # Add the data to the Datasource
        datasource.add_data(metadata=metadata, data=data, overwrite=overwrite, data_type=data_type)

        # Save Datasource to object store
        datasource.save()

        uuids[key] = datasource.uuid()

    return uuids
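# A minimal usage sketch for assign_data, with made-up inputs; the species
# key, metadata fields and the in-memory Dataset below are assumptions for
# illustration, and a configured object store is still required for the
# underlying Datasource.save() calls to succeed:
import pandas as pd
import xarray as xr

example_ds = xr.Dataset(
    {"ch4": ("time", [1959.55, 1959.60])},
    coords={"time": pd.to_datetime(["2014-01-30 11:12:30", "2014-01-30 11:13:30"])},
)
data_dict = {"ch4": {"metadata": {"site": "bsd", "species": "ch4"}, "data": example_ds}}
# False means no existing Datasource was found, so a new one is created
lookup_results = {"ch4": False}

uuids = assign_data(data_dict=data_dict, lookup_results=lookup_results, overwrite=False)
# e.g. uuids == {"ch4": "<UUID of the newly created Datasource>"}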
def test_save(mock_uuid2):
    bucket = get_local_bucket()

    datasource = Datasource()
    datasource.add_metadata_key(key="data_type", value="timeseries")
    datasource.save(bucket)

    prefix = f"{Datasource._datasource_root}/uuid/{datasource._uuid}"
    objs = get_object_names(bucket, prefix)

    # The mock_uuid2 fixture fixes UUID generation to the known
    # mocked_uuid2 value, so the saved object key ends with it
    assert objs[0].split("/")[-1] == mocked_uuid2
def test_add_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    assert ch4_data["ch4"][0] == pytest.approx(1959.55)
    assert ch4_data["ch4_variability"][0] == pytest.approx(0.79)
    assert ch4_data["ch4_number_of_observations"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")
    d.save()

    bucket = get_local_bucket()

    data_chunks = [Datasource.load_dataset(bucket=bucket, key=k) for k in d.data_keys()]

    # Now read it out and make sure it's what we expect
    combined = xr.concat(data_chunks, dim="time")

    assert combined.equals(ch4_data)

    expected_metadata = {
        "site": "bsd",
        "instrument": "picarro",
        "sampling_period": "60",
        "inlet": "248m",
        "port": "9",
        "type": "air",
        "network": "decc",
        "species": "ch4",
        "scale": "wmo-x2004a",
        "long_name": "bilsdale",
        "data_owner": "simon o'doherty",
        "data_owner_email": "*****@*****.**",
        "inlet_height_magl": "248m",
        "comment": "cavity ring-down measurements. output from gcwerks",
        "source": "in situ measurements of air",
        "conventions": "cf-1.6",
        "calibration_scale": "wmo-x2004a",
        "station_longitude": -1.15033,
        "station_latitude": 54.35858,
        "station_long_name": "bilsdale, uk",
        "station_height_masl": 380.0,
        "data_type": "timeseries",
    }

    assert d.metadata() == expected_metadata
def test_shallow_then_load_data(data):
    metadata = data["ch4"]["metadata"]
    data = data["ch4"]["data"]

    d = Datasource()
    d.add_data(metadata=metadata, data=data, data_type="timeseries")
    d.save()

    new_d = Datasource.load(uuid=d.uuid(), shallow=True)

    # A shallow load doesn't pull the data from the object store
    assert not new_d._data

    # Accessing the data forces it to be loaded
    ds_data = new_d.data()

    assert ds_data

    ch4_data = ds_data["2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00"]

    assert ch4_data.time[0] == pd.Timestamp("2014-01-30-11:12:30")
def test_versioning(data):
    # Add the head of the data, then add successively larger subsets,
    # and check versioning works correctly
    metadata = {"foo": "bar"}

    d = Datasource()
    # Fix the UUID for the tests
    d._uuid = "4b91f73e-3d57-47e4-aa13-cb28c35d3b3d"

    ch4_data = data["ch4"]["data"]

    v1 = ch4_data.head(20)
    v2 = ch4_data.head(30)
    v3 = ch4_data.head(40)

    d.add_data(metadata=metadata, data=v1, data_type="timeseries")
    d.save()

    d.add_data(metadata=metadata, data=v2, data_type="timeseries")
    d.save()

    d.add_data(metadata=metadata, data=v3, data_type="timeseries")
    d.save()

    keys = d.versions()

    assert keys["v1"]["keys"] == {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v1/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-01-30-11:19:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v1/2015-01-30-11:12:30+00:00_2015-01-30-11:19:30+00:00",
    }

    assert keys["v2"]["keys"] == {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v2/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-01-30-11:19:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v2/2015-01-30-11:12:30+00:00_2015-01-30-11:19:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:17:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v2/2015-01-30-11:12:30+00:00_2015-11-30-11:17:30+00:00",
    }

    assert keys["v3"]["keys"] == {
        "2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v3/2014-01-30-11:12:30+00:00_2014-11-30-11:23:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-01-30-11:19:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v3/2015-01-30-11:12:30+00:00_2015-01-30-11:19:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:17:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v3/2015-01-30-11:12:30+00:00_2015-11-30-11:17:30+00:00",
        "2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v3/2015-01-30-11:12:30+00:00_2015-11-30-11:23:30+00:00",
        "2016-04-02-06:52:30+00:00_2016-04-02-06:55:30+00:00": "data/uuid/4b91f73e-3d57-47e4-aa13-cb28c35d3b3d/v3/2016-04-02-06:52:30+00:00_2016-04-02-06:55:30+00:00",
    }

    assert keys["v3"]["keys"] == keys["latest"]["keys"]
def test_from_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")
    d.save()

    obj_data = d.to_data()

    bucket = get_local_bucket()

    # Create a new object with the data from d
    d_2 = Datasource.from_data(bucket=bucket, data=obj_data, shallow=False)

    metadata = d_2.metadata()
    assert metadata["site"] == "bsd"
    assert metadata["instrument"] == "picarro"
    assert metadata["sampling_period"] == "60"
    assert metadata["inlet"] == "248m"

    assert sorted(d_2.data_keys()) == sorted(d.data_keys())
    assert d_2.metadata() == d.metadata()