def load_data_catalog():
    """Load the data catalog on either an NCAR or a Pangeo machine."""
    if is_ncar_host():
        col = intake.open_esm_datastore("../catalogs/glade-cmip6.json")
    else:
        col = intake.open_esm_datastore("../catalogs/pangeo-cmip6.json")
    return col
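# Minimal usage sketch for load_data_catalog() (illustrative only). It assumes
# `intake` is imported, is_ncar_host() is available, and the relative catalog
# paths above exist; the search values are ordinary CMIP6 identifiers.
col = load_data_catalog()
cat = col.search(experiment_id='historical', table_id='Amon',
                 variable_id='tas', grid_label='gn')
dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False})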
def createDataDict(this_experiment_id, this_variable_id, this_table_id, this_grid_label):
    if util.is_ncar_host():
        col = intake.open_esm_datastore("../catalogs/glade-cmip6.json")
    else:
        col = intake.open_esm_datastore("../catalogs/pangeo-cmip6.json")
    cat = col.search(experiment_id=this_experiment_id,
                     table_id=this_table_id,
                     variable_id=this_variable_id,
                     grid_label=this_grid_label)
    dataset_info = cat.df
    dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False},
                                    cdf_kwargs={'chunks': {}, 'decode_times': False})
    source_ids = cat.df['source_id']
    modelnames = list(set(source_ids))
    return dataset_info, dset_dict, modelnames
def test_serialize_to_json():
    with TemporaryDirectory() as local_store:
        col = intake.open_esm_datastore(catalog_dict_records)
        name = 'test_serialize_dict'
        col.serialize(name=name, directory=local_store, catalog_type='dict')
        output_catalog = os.path.join(local_store, name + '.json')
        col2 = intake.open_esm_datastore(output_catalog)
        pd.testing.assert_frame_equal(col.df, col2.df)
def test_invalid_derivedcat(query, regex):
    registry = intake_esm.DerivedVariableRegistry()

    @registry.register(variable='FOO', query=query)
    def func(ds):
        ds['FOO'] = ds.FLNS + ds.FLUT
        return ds

    with pytest.raises(ValueError, match=regex):
        intake.open_esm_datastore(catalog_dict_records, registry=registry)
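# For contrast with the invalid registrations exercised above, a minimal sketch
# of a working DerivedVariableRegistry. It assumes the catalogue's variable
# column is named 'variable' and that FLNS and FLUT are listed in it.
import intake
import intake_esm

registry = intake_esm.DerivedVariableRegistry()

@registry.register(variable='FOO', query={'variable': ['FLNS', 'FLUT']})
def compute_foo(ds):
    # Derived variable computed on the fly from the two dependency variables.
    ds['FOO'] = ds.FLNS + ds.FLUT
    return ds

cat = intake.open_esm_datastore(catalog_dict_records, registry=registry)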
def test_serialize_to_csv():
    col = intake.open_esm_datastore(cdf_col_sample_cmip6)
    with TemporaryDirectory() as local_store:
        col_subset = col.search(source_id='MRI-ESM2-0')
        name = 'CMIP6-MRI-ESM2-0'
        col_subset.serialize(name=name, directory=local_store, catalog_type='file')
        col = intake.open_esm_datastore(f'{local_store}/{name}.json')
        pd.testing.assert_frame_equal(col_subset.df, col.df)
        assert col.esmcol_data['id'] == name
def get_cmip6_catalogue():
    """Get the full catalogue of CMIP6 data on glade or in the cloud."""
    if is_ncar_host():
        cmip6_collection = intake.open_esm_datastore(
            "../../catalogs/glade-cmip6.json")
    else:
        cmip6_collection = intake.open_esm_datastore(
            "../../catalogs/pangeo-cmip6.json")
    return cmip6_collection
def test_catalog_serialize(tmp_path, catalog_type):
    cat = intake.open_esm_datastore(cdf_col_sample_cmip6)
    local_store = tmp_path
    cat_subset = cat.search(source_id='MRI-ESM2-0')
    name = 'CMIP6-MRI-ESM2-0'
    cat_subset.serialize(name=name, directory=local_store, catalog_type=catalog_type)
    cat = intake.open_esm_datastore(f'{local_store}/{name}.json')
    pd.testing.assert_frame_equal(cat_subset.df.reset_index(drop=True),
                                  cat.df.reset_index(drop=True))
    assert cat.esmcat.id == name
def test_to_dataset_dict_s3():
    pytest.importorskip('s3fs')
    col = intake.open_esm_datastore(zarr_col_aws_cesm)
    cat = col.search(variable='RAIN', experiment='20C')
    dsets = cat.to_dataset_dict(storage_options={'anon': True})
    _, ds = dsets.popitem()
    assert isinstance(ds, xr.Dataset)
def test_to_dataset_dict_aggfalse(esmcol_path, query):
    col = intake.open_esm_datastore(esmcol_path)
    cat = col.search(**query)
    nds = len(cat.df)
    dsets = cat.to_dataset_dict(zarr_kwargs={'consolidated': True}, aggregate=False)
    assert len(dsets.keys()) == nds
def test_df_property():
    col = intake.open_esm_datastore(catalog_dict_records)
    assert len(col.df) == 5
    col.df = col.df.iloc[0:2, :]
    assert isinstance(col.df, pd.DataFrame)
    assert len(col) == 1
    assert len(col.df) == 2
def test_getitem(key, decode_times):
    col = intake.open_esm_datastore(cdf_col_sample_cmip6)
    x = col[key]
    assert isinstance(x, intake_esm.source.ESMGroupDataSource)
    ds = x(cdf_kwargs={'chunks': {}, 'decode_times': decode_times}).to_dask()
    assert isinstance(ds, xr.Dataset)
    assert set(x.df['member_id']) == set(ds['member_id'].values)
def test_ipython_key_completions():
    col = intake.open_esm_datastore(cdf_col_sample_cmip6)
    rv = [
        'df',
        'to_dataset_dict',
        'from_df',
        'keys',
        'serialize',
        'search',
        'unique',
        'nunique',
        'update_aggregation',
        'key_template',
        'groupby_attrs',
        'variable_column_name',
        'aggregations',
        'agg_columns',
        'aggregation_dict',
        'path_column_name',
        'data_format',
        'format_column_name',
    ]
    keys = col._ipython_key_completions_()
    for key in rv:
        assert key in keys
def test_repr_html(url):
    col = intake.open_esm_datastore(url)
    text = col._repr_html_()
    assert 'unique' in text
    columns = col.df.columns.tolist()
    for column in columns:
        assert column in text
def plot_30_Jahre_Klimatologie_von(var, scen, time, mod='GFDL-ESM4', table_id='Amon'):
    col = intake.open_esm_datastore(
        "https://storage.googleapis.com/cmip6/pangeo-cmip6.json")
    query = dict(
        experiment_id=[scen],
        table_id=table_id,
        variable_id=[var],
        source_id=mod,  # 'MPI-ESM1-2-HR'
        member_id='r1i1p1f1',
    )
    map_data = col.search(require_all_on=["source_id"], **query)

    from collections import defaultdict
    dsets = defaultdict(dict)
    for group, df in map_data.df.groupby(by=['source_id', 'experiment_id']):
        dsets[group[0]][group[1]] = open_delayed(df)
    dsets_ = dask.compute(dict(dsets))[0]

    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(8, 4),
                             subplot_kw={'projection': ccrs.Robinson()})
    (dsets_[mod][scen]
     .sel(time=slice(str(time - 15), str(time + 15)))
     .mean('time'))[var].plot(ax=axes, transform=ccrs.PlateCarree(),
                              cbar_kwargs=dict(shrink=0.5))
    axes.coastlines()
    return ()
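# Hypothetical call of plot_30_Jahre_Klimatologie_von() ("plot a 30-year
# climatology of ..."): a 30-year mean of near-surface air temperature centred
# on 2070 under SSP5-8.5. Assumes the module-level imports used above (intake,
# dask, matplotlib.pyplot as plt, cartopy.crs as ccrs) and the helper
# open_delayed() are available.
plot_30_Jahre_Klimatologie_von(var='tas', scen='ssp585', time=2070,
                               mod='GFDL-ESM4', table_id='Amon')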
def _load_from_intake(experiment_id, table_id, grid_label, variable_id,
                      institution_id, activity_id, member_id):
    """Load data from the pangeo CMIP6 intake catalogue.

    The arguments relate to the CMIP6 parameters of a dataset. The CMIP6
    reference is the ESGF servers, which can be accessed here:
    https://esgf-index1.ceda.ac.uk/search/cmip6-ceda/

    Function is cached to reduce remote queries.
    """
    collection = intake.open_esm_datastore(URL)
    cat = collection.search(experiment_id=experiment_id,
                            table_id=table_id,
                            grid_label=grid_label,
                            institution_id=institution_id,
                            member_id=member_id,
                            variable_id=variable_id)
    dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True,
                                                 'decode_times': False},
                                    cdf_kwargs={'chunks': {}, 'decode_times': False})
    # The search should have produced a dictionary with only 1 item, so
    # get that item and get a cube from it.
    ds_label, ds = dset_dict.popitem()
    cube = ds[variable_id].to_iris()
    coord_names = [c1.name() for c1 in cube.coords()]
    if 'air_pressure' in coord_names:
        cube.coord('air_pressure').convert_units('hPa')
    return iris.util.squeeze(cube)  # drop member dimension
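# Illustrative call of _load_from_intake() (not from the source). The CMIP6
# identifiers below are plausible examples only, and whether the search yields
# exactly one dataset depends on the catalogue behind URL.
cube = _load_from_intake(experiment_id='historical', table_id='Amon',
                         grid_label='gn', variable_id='tas',
                         institution_id='CCCma', activity_id='CMIP',
                         member_id='r1i1p1f1')
print(cube.summary(shorten=True))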
def load(self, query, catfile=DEFAULT_INTAKE_ESM_CAT, **kwargs):
    """Loads datasets from given parameters.

    Parameters
    ----------
    query : dict
        Key, value pairs used to search the catalogue.
    catfile : str, optional
        Path to catalogue metadata file; can be a remote URL. The pangeo
        intake-esm CMIP6 catalogue is used by default.
    **kwargs : dict, optional
        Keyword arguments for `intake_esm.core.esm_datastore.to_dataset_dict()`.

    Returns
    -------
    datasets : list
        xarray DataArray objects.
    """
    import intake

    col = intake.open_esm_datastore(catfile)
    cat = col.search(**query)
    dset_dict = cat.to_dataset_dict(**kwargs)
    variable = query.get('variable_id')
    return self._prep_datasets(variable, dset_dict)
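# Hypothetical usage of load(); `loader` stands in for an instance of the
# surrounding class, and the extra keyword arguments are simply forwarded to
# to_dataset_dict() as described in the docstring.
query = {
    'experiment_id': 'historical',
    'table_id': 'Amon',
    'variable_id': 'tas',
    'source_id': 'CanESM5',
    'member_id': 'r1i1p1f1',
}
datasets = loader.load(query, zarr_kwargs={'consolidated': True})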
def test_to_xarray_cesm_netcdf(chunks, expected_chunks):
    c = intake.open_esm_datastore(cdf_col)
    query = {'variable': ['SHF'], 'member_id': [1, 3, 9], 'experiment': ['20C', 'RCP85']}
    cat = c.search(**query)
    dset = cat.to_dataset_dict(cdf_kwargs=dict(chunks=chunks))
    _, ds = dset.popitem()
    assert ds['SHF'].data.chunksize == expected_chunks
def test_to_xarray_cmip(chunks, expected_chunks):
    c = intake.open_esm_datastore(esmcol_path)
    cat = c.search(variable=['hfls'], frequency='mon', modeling_realm='atmos',
                   model=['CNRM-CM5'])
    dset = cat.to_dataset_dict(cdf_kwargs=dict(chunks=chunks))
    _, ds = dset.popitem()
    assert ds['hfls'].data.chunksize == expected_chunks
def test_to_dataset_dict_skip_error():
    cat = intake.open_esm_datastore(catalog_dict_records)
    # 'backend_kwargsd' is an intentionally misspelled kwarg, so every dataset
    # fails to open.
    with pytest.raises(intake_esm.source.ESMDataSourceError):
        dsets = cat.to_dataset_dict(
            xarray_open_kwargs={'backend_kwargsd': {'storage_options': {'anon': True}}},
            skip_on_error=False,
        )
    dsets = cat.to_dataset_dict(
        xarray_open_kwargs={'backend_kwargsd': {'storage_options': {'anon': True}}},
        skip_on_error=True,
    )
    assert len(dsets.keys()) == 0
def create_data_dict(this_experiment_id, this_variable_id, this_table_id, this_grid_label):
    """Creates a data dictionary.

    Creates a data dictionary for some variable, grid, and table id for the
    chosen experiment(s).

    Args:
        this_experiment_id: The string ID for the experiment. Can be a list of strings.
        this_variable_id: The string ID for this variable (e.g. 'tas').
        this_table_id: ID for the table (e.g. 'Amon').
        this_grid_label: String label of the reference grid (e.g. 'gn').

    Returns:
        dataset_info: The catalog dataframe describing the matching datasets.
        dset_dict: The data dictionary.
        modelnames: String list of source ids of the models in the dict.
    """
    col = intake.open_esm_datastore(DIR_CATALOG + "pangeo-cmip6.json")
    cat = col.search(experiment_id=this_experiment_id,
                     table_id=this_table_id,
                     variable_id=this_variable_id,
                     grid_label=this_grid_label)
    dataset_info = cat.df
    dset_dict = cat.to_dataset_dict(zarr_kwargs={'consolidated': True, 'decode_times': False},
                                    cdf_kwargs={'chunks': {}, 'decode_times': False})
    source_ids = cat.df['source_id']
    modelnames = list(set(source_ids))
    return dataset_info, dset_dict, modelnames
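# Example call of create_data_dict() (illustrative; assumes DIR_CATALOG points
# at a directory containing pangeo-cmip6.json).
dataset_info, dset_dict, modelnames = create_data_dict(
    this_experiment_id=['historical', 'ssp585'],
    this_variable_id='tas',
    this_table_id='Amon',
    this_grid_label='gn',
)
print(sorted(modelnames))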
def test_to_dask(path, query, xarray_open_kwargs):
    cat = intake.open_esm_datastore(path)
    cat_sub = cat.search(**query)
    ds = cat_sub.to_dask(xarray_open_kwargs=xarray_open_kwargs)
    assert 'member_id' in ds.dims
    assert len(ds.__dask_keys__()) > 0
    assert ds.time.encoding
def get_ERA5_zstore_list(year: str = None) -> list:
    """Return the list of ERA5 zarr store paths, optionally filtered by year."""
    col = intake.open_esm_datastore(
        "https://cmip6downscaling.blob.core.windows.net/cmip6/ERA5_catalog.json"
    )
    store_list = list(col.df.zstore)
    if year is not None:
        store_list = [s for s in store_list if year in s]
    return store_list
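# Usage sketch for get_ERA5_zstore_list(). Whether a plain year string such as
# '1995' appears in the zstore paths depends on the catalogue layout, so the
# filter value here is only an assumption.
all_stores = get_ERA5_zstore_list()
stores_1995 = get_ERA5_zstore_list(year='1995')
print(len(all_stores), len(stores_1995))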
def test_serialize():
    with TemporaryDirectory() as local_store:
        col = intake.open_esm_datastore(
            'https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json'
        )
        col_subset = col.search(source_id='BCC-ESM1', grid_label='gn',
                                table_id='Amon', experiment_id='historical')
        name = 'cmip6_bcc_esm1'
        col_subset.serialize(name=name, directory=local_store)
        col = intake.open_esm_datastore(f'{local_store}/cmip6_bcc_esm1.json')
        pd.testing.assert_frame_equal(col_subset.df, col.df)
        assert col._col_data['id'] == name
def test_init(capsys, url):
    col = intake.open_esm_datastore(url)
    assert isinstance(col.df, pd.DataFrame)
    print(repr(col))
    # Use pytest's capturing method:
    # https://docs.pytest.org/en/latest/capture.html#accessing-captured-output-from-a-test-function
    captured = capsys.readouterr()
    assert 'catalog with' in captured.out
def test_to_aggregations_off(esmcol_path, query):
    col = intake.open_esm_datastore(esmcol_path)
    cat = col.search(**query)
    nds = len(cat.df)
    cat.groupby_attrs = []
    assert len(cat.keys()) == nds
    assert isinstance(cat._grouped, pd.DataFrame)
    assert isinstance(col._grouped, pd.core.groupby.generic.DataFrameGroupBy)
def test_progressbar(progressbar):
    c = intake.open_esm_datastore(cdf_col_sample_cmip5)
    cat = c.search(variable=['hfls'], frequency='mon', modeling_realm='atmos',
                   model=['CNRM-CM5'])
    _ = cat.to_dataset_dict(cdf_kwargs=dict(chunks={}), progressbar=progressbar)
def test_catalog_with_registry_search():
    cat = intake.open_esm_datastore(catalog_dict_records, registry=registry)
    new_cat = cat.search(variable='FOO')
    assert len(cat) == 1
    assert len(new_cat) == 1
    assert len(cat.derivedcat) == 2
    assert len(new_cat.derivedcat) == 1
def test_to_dataset_dict_nocache(esmcol_path, query):
    col = intake.open_esm_datastore(esmcol_path)
    cat = col.search(**query)
    _, ds = cat.to_dataset_dict(zarr_kwargs={'consolidated': True}).popitem()
    id1 = id(ds)
    cat = col.search(**query)
    _, ds = cat.to_dataset_dict(zarr_kwargs={'consolidated': True}).popitem()
    assert id1 != id(ds)
def test_to_dataset_dict_aggfalse(path, query):
    col = intake.open_esm_datastore(path)
    cat = col.search(**query)
    nds = len(cat.df)
    dsets = cat.to_dataset_dict(xarray_open_kwargs={'chunks': {'time': 1}}, aggregate=False)
    assert len(dsets.keys()) == nds
def test_to_collection(path, query, xarray_open_kwargs):
    cat = intake.open_esm_datastore(path)
    cat_sub = cat.search(**query)
    coll = cat_sub.to_collection(xarray_open_kwargs=xarray_open_kwargs)
    _, ds = coll.popitem()
    assert 'member_id' in ds.dims
    assert len(ds.__dask_keys__()) > 0
    assert ds.time.encoding
    assert isinstance(coll, xc.Collection)