def test_get_project_base_dir():
    cmip5_base_dir = get_project_base_dir("cmip5")
    assert cmip5_base_dir == "/badc/cmip5/data/cmip5"

    c3s_cordex_base_dir = get_project_base_dir("c3s-cordex")
    assert c3s_cordex_base_dir == "/gws/nopw/j04/cp4cds1_vol1/data/c3s-cordex"

    with pytest.raises(Exception) as exc:
        get_project_base_dir("test")
    assert str(exc.value) == "The project supplied is not known."
def populate_dc_store():
    scan.get_dc_store = Mock(return_value=char_store)
    # Note: when `ds_ids` is given, `get_dataset_paths` ignores `paths`,
    # so the string passed here is never treated as a sequence of paths.
    ds_paths = get_dataset_paths(
        "cmip5", ds_ids=ds_ids, paths=get_project_base_dir("cmip5")
    )
    for ds_id, ds_path in ds_paths.items():
        scan_dataset("cmip5", ds_id, ds_path, "full", "ceda")
def _get_ds_paths_from_paths(paths, project):
    """
    Return an OrderedDict of {<ds_id>: <ds_path>} found under the paths
    provided as `paths` (a sequence of directory/file paths).

    :param paths: (sequence) directory/file paths
    :param project: top-level project, e.g. "cmip5", "cmip6" or "cordex" (case-insensitive)
    :return: OrderedDict of {<ds_id>: <ds_path>}
    """
    base_dir = get_project_base_dir(project)

    # Check paths first
    bad_paths = []
    for pth in paths:
        if not pth.startswith(base_dir):
            bad_paths.append(pth)

    if bad_paths:
        raise Exception(f"Invalid paths provided: {bad_paths}")

    ds_paths = collections.OrderedDict()

    for pth in paths:
        LOGGER.info(f"Searching for datasets under: {pth}")
        facet_order = CONFIG[f'project:{project}']['facet_rule']
        facets_in_path = pth.replace(base_dir, "").strip("/").split("/")

        facets = {}
        for i, facet_name in enumerate(facet_order):
            if len(facets_in_path) <= i:
                break
            facets[facet_name] = facets_in_path[i]

        # Fix facet version if not set
        if not facets.get("version"):
            facets["version"] = "latest"

        facets_as_path = "/".join([facets.get(_, "*") for _ in facet_order])

        # Skip anything matching "files"
        if "/files" in facets_as_path:
            continue

        # TODO: This repeats the pattern-matching code in `get_dataset_paths` below.
        # Suggest we create a module/class to manage all mapping of different args
        # to resolve to the ds_paths dictionary, later.
        pattern = os.path.join(base_dir, facets_as_path)
        LOGGER.info(f"Finding dataset paths for pattern: {pattern}")

        for ds_path in glob.glob(pattern):
            ds_id = switch_ds.switch_ds(project, ds_path)
            ds_paths[ds_id] = ds_path

    return ds_paths
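# A minimal usage sketch (the path below is hypothetical; every entry must sit
# under the project base dir, "/badc/cmip5/data/cmip5" for cmip5, or the whole
# call raises):
#
#     _get_ds_paths_from_paths(
#         ["/badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/historical"],
#         "cmip5",
#     )
#     # Facets missing from the path are globbed as "*" ("version" defaults to
#     # "latest"), so every matching dataset directory is returned keyed by DSID.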
def _load_ids(self):
    """Gets a list of possible ds_ids from the sample_id."""
    base_dir = get_project_base_dir(self.project)
    _sample_id = os.path.join(base_dir, "/".join(self.sample_id.split(".")))

    self._sample = []
    for path in glob.glob(_sample_id):
        self._sample.append(".".join(path.split("/")[-11:]))

    return self._sample
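# Illustrative (hypothetical sample_id; "*" acts as a filesystem glob wildcard):
#
#     self.sample_id = "output1.MOHC.*.historical.mon.land.Lmon.r1i1p1.latest.rh"
#     self._load_ids()
#     # -> one dotted ID per matching directory, rebuilt from the last 11
#     #    path components of each glob hit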
def _consolidate_dset(dset):
    if dset.startswith('https'):
        raise Exception('This format is not supported yet')
    elif os.path.isfile(dset) or dset.endswith('.nc'):
        return dset
    elif os.path.isdir(dset):
        return os.path.join(dset, '*.nc')
    elif dset.count('.') > 6:
        project = get_project_name(dset)
        base_dir = get_project_base_dir(project)
        return base_dir.rstrip("/") + "/" + dset.replace(".", "/") + "/*.nc"
    else:
        raise Exception(f'The format of {dset} is not known.')
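# Illustrative branch behaviour (all inputs hypothetical):
#
#     _consolidate_dset("/my/data/tas_file.nc")  # file / ".nc" suffix -> returned as-is
#     _consolidate_dset("/my/data/tas_dir")      # existing dir -> "/my/data/tas_dir/*.nc"
#     _consolidate_dset("output1.MOHC.HadGEM2-ES.historical.mon.land.Lmon.r1i1p1.latest.rh")
#     # dotted DSID (more than 6 dots) -> "<base_dir>/output1/.../rh/*.nc"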
def get_dataset_paths(project, ds_ids=None, paths=None, facets=None, exclude=None):
    """
    Converts the input arguments into an OrderedDict of {DSID: directory} items.

    :param project: top-level project, e.g. "cmip5", "cmip6" or "cordex" (case-insensitive)
    :param ds_ids: sequence of dataset identifiers (DSIDs), OR None.
    :param paths: sequence of file paths to scan for NetCDF files under, OR None.
    :param facets: dictionary of facet values to limit the search, OR None.
    :param exclude: list of regular expressions to exclude in file paths, OR None.
    :return: An OrderedDict of {dsid: directory}
    """
    base_dir = get_project_base_dir(project)
    ds_paths = collections.OrderedDict()

    # If ds_ids is defined then ignore all other arguments and use this list
    if ds_ids:
        for dsid in ds_ids:
            if not dsid:
                continue
            ds_path = switch_ds.switch_ds(project, dsid)
            ds_paths[dsid] = ds_path

    # Else use facets if they exist
    elif facets:
        facet_order = CONFIG[f'project:{project}']['facet_rule']
        facets_as_path = "/".join([facets.get(_, "*") for _ in facet_order])
        pattern = os.path.join(base_dir, facets_as_path)
        LOGGER.info(f"Finding dataset paths for pattern: {pattern}")

        for ds_path in glob.glob(pattern):
            ds_id = switch_ds.switch_ds(project, ds_path)
            ds_paths[ds_id] = ds_path

    elif paths:
        ds_paths = _get_ds_paths_from_paths(paths, project)

    else:
        raise NotImplementedError(
            'Code currently breaks if not using "ds_ids" argument.')

    return ds_paths
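# A minimal usage sketch (hypothetical DSID). When `ds_ids` is set, all other
# selection arguments are ignored and each DSID maps directly to its path:
#
#     ds_paths = get_dataset_paths(
#         "cmip5",
#         ds_ids=["output1.MOHC.HadGEM2-ES.historical.mon.land.Lmon.r1i1p1.latest.rh"],
#     )
#     # -> OrderedDict([(<ds_id>, <matching path under the cmip5 base dir>)])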
def switch_ds(project, ds):
    """
    Switches between ds_path and ds_id.

    :param project: top-level project
    :param ds: either dataset path or dataset ID (DSID)
    :return: either dataset path or dataset ID (DSID) - switched from the input.
    """
    base_dir = get_project_base_dir(project)

    if ds.startswith("/"):
        return ".".join(ds.replace(base_dir, "").strip("/").split("/"))
    else:
        return os.path.join(base_dir, "/".join(ds.split(".")))
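# Illustrative round trip (hypothetical DSID; follows the string handling above,
# where the DSID facets sit directly under the project base dir):
#
#     ds_id = "output1.MOHC.HadGEM2-ES.historical.mon.land.Lmon.r1i1p1.latest.rh"
#     ds_path = switch_ds("cmip5", ds_id)
#     # -> "/badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/historical/mon/land/Lmon/r1i1p1/latest/rh"
#     switch_ds("cmip5", ds_path) == ds_id
#     # -> True (a leading "/" marks the input as a path, so it is switched back)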
import glob
import json
import os
import subprocess
import sys

import numpy as np
import pytest
import xarray as xr

from roocs_utils.project_utils import get_project_base_dir

from dachar.scan import scan
from dachar.utils import character

base_dir = get_project_base_dir("cmip5")


# def test_parser():
#     sys.argv = "scan.py -m MOHC/HadGEM2-ES -exp historical -e r1i1p1 -v rh".split()
#     args = scan.arg_parse()
#     for model in args.model:
#         assert model == "MOHC/HadGEM2-ES"
#     for experiment in args.experiment:
#         assert experiment == "historical"
#     for ensemble in args.ensemble:
#         assert ensemble == "r1i1p1"
#     for variable in args.var_id:
#         assert variable == "rh"
#
#
# def test_get_files():