import os

import pytest
import xarray as xr

from openghg.objectstore import get_local_bucket
from openghg.store.base import Datasource


def test_load_dataset():
    filename = "WAO-20magl_EUROPE_201306_small.nc"
    dir_path = os.path.dirname(__file__)
    test_data = "../data/emissions"
    filepath = os.path.join(dir_path, test_data, filename)

    ds = xr.load_dataset(filepath)

    metadata = {"some": "metadata"}

    d = Datasource()
    d.add_data(metadata=metadata, data=ds, data_type="footprints")
    d.save()

    keys = d._data_keys["latest"]["keys"]
    key = list(keys.values())[0]

    bucket = get_local_bucket()

    loaded_ds = Datasource.load_dataset(bucket=bucket, key=key)

    assert loaded_ds.equals(ds)
def test_add_data(data):
    d = Datasource()

    metadata = data["ch4"]["metadata"]
    ch4_data = data["ch4"]["data"]

    assert ch4_data["ch4"][0] == pytest.approx(1959.55)
    assert ch4_data["ch4_variability"][0] == pytest.approx(0.79)
    assert ch4_data["ch4_number_of_observations"][0] == pytest.approx(26.0)

    d.add_data(metadata=metadata, data=ch4_data, data_type="timeseries")
    d.save()

    bucket = get_local_bucket()
    data_chunks = [Datasource.load_dataset(bucket=bucket, key=k) for k in d.data_keys()]

    # Now read it out and make sure it's what we expect
    combined = xr.concat(data_chunks, dim="time")

    assert combined.equals(ch4_data)

    expected_metadata = {
        "site": "bsd",
        "instrument": "picarro",
        "sampling_period": "60",
        "inlet": "248m",
        "port": "9",
        "type": "air",
        "network": "decc",
        "species": "ch4",
        "scale": "wmo-x2004a",
        "long_name": "bilsdale",
        "data_owner": "simon o'doherty",
        "data_owner_email": "*****@*****.**",
        "inlet_height_magl": "248m",
        "comment": "cavity ring-down measurements. output from gcwerks",
        "source": "in situ measurements of air",
        "conventions": "cf-1.6",
        "calibration_scale": "wmo-x2004a",
        "station_longitude": -1.15033,
        "station_latitude": 54.35858,
        "station_long_name": "bilsdale, uk",
        "station_height_masl": 380.0,
        "data_type": "timeseries",
    }

    assert d.metadata() == expected_metadata
from typing import Dict, List, Optional, Union

from xarray import Dataset


def recombine_datasets(
    keys: List[str],
    sort: Optional[bool] = True,
    attrs_to_check: Union[str, List[str], Dict[str, str], None] = None,
) -> Dataset:
    """Combines datasets stored separately in the object store into a single dataset

    Args:
        keys: List of object store keys
        sort: Sort the resulting Dataset by the time dimension. Default = True
        attrs_to_check: Attributes to check for duplicates. If duplicates are present
            a new data variable will be created containing the values from each dataset.
            If a dictionary is passed, the attribute(s) will be retained and the new value assigned.
            If a list/string is passed, the attribute(s) will be removed.
    Returns:
        xarray.Dataset: Combined Dataset
    """
    from xarray import concat as xr_concat

    from openghg.store.base import Datasource
    from openghg.objectstore import get_bucket

    if not keys:
        raise ValueError("No data keys passed.")

    bucket = get_bucket()

    data = [Datasource.load_dataset(bucket=bucket, key=k) for k in keys]

    if attrs_to_check is None:
        attrs_to_check = {"inlet": "multiple"}

    # For specified attributes (e.g. "inlet") elevate duplicates
    # to data variables within each Dataset
    if attrs_to_check:
        if isinstance(attrs_to_check, dict):
            attributes = list(attrs_to_check.keys())
            replace_values = list(attrs_to_check.values())
        elif isinstance(attrs_to_check, str):
            attributes = [attrs_to_check]
            replace_values = [""]
        else:
            attributes = attrs_to_check
            replace_values = [""] * len(attributes)

        data = elevate_duplicate_attrs(ds_list=data, attributes=attributes)

    # Concatenate datasets along the time dimension
    combined = xr_concat(data, dim="time")

    # Replace/remove incorrect attributes
    # - xr.concat will only take the value from the first dataset if duplicated
    if attrs_to_check:
        for attr, value in zip(attributes, replace_values):
            # Only update if attr was elevated to a data variable
            if attr in combined:
                if value:
                    combined.attrs[attr] = value
                else:
                    combined.attrs.pop(attr)

    if sort:
        combined = combined.sortby("time")

    # Check for duplicates?
    # This is taken from https://stackoverflow.com/questions/51058379/drop-duplicate-times-in-xarray
    # _, index = np.unique(combined['time'], return_index=True)
    # combined = combined.isel(time=index)

    return combined
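# A minimal usage sketch, not part of the library: one plausible way to rebuild a
# single timeseries from the chunked data held by a Datasource, mirroring the tests
# above. The helper name _example_recombine is hypothetical; d.data_keys() and the
# {"inlet": "multiple"} default both come from the code above.
def _example_recombine(d: "Datasource") -> Dataset:
    # Collect the object store keys for the Datasource's stored data chunks
    keys = d.data_keys()

    # Combine the chunks along the time dimension; if the "inlet" attribute differs
    # between chunks it is elevated to a data variable and the attribute set to "multiple"
    return recombine_datasets(keys=keys, sort=True, attrs_to_check={"inlet": "multiple"})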