class GFDL_GCP_FileDataSourceBase(
    data_manager.OnTheFlyDirectoryHierarchyQueryMixin,
    GCPFetchMixin,
    data_manager.DataframeQueryDataSourceBase
):
    """Base class for DataSources that access data on GFDL's internal
    filesystems using GCP, and which may be invoked via frepp.
    """
    _DiagnosticClass = GfdlDiagnostic
    _PreprocessorClass = preprocessor.DefaultPreprocessor

    _FileRegexClass = util.abstract_attribute()
    _DirectoryRegex = util.abstract_attribute()
    _AttributesClass = util.abstract_attribute()
    _fetch_method = 'auto'  # symlink if not on /archive, else gcp

    def __init__(self, case_dict, parent):
        self.catalog = None
        super(GFDL_GCP_FileDataSourceBase, self).__init__(case_dict, parent)

        config = core.ConfigManager()
        self.frepp_mode = config.get('frepp', False)
        self.dry_run = config.get('dry_run', False)
        self.timeout = config.get('file_transfer_timeout', 0)

        if self.frepp_mode:
            paths = core.PathManager()
            self.overwrite = True
            # flag to not overwrite config and .tar: want overwrite for frepp
            self.file_overwrite = True
            # if overwrite=False, WK_DIR & OUT_DIR will have been set to a
            # unique name in parent's init. Set it back so it will be overwritten.
            d = paths.model_paths(self, overwrite=True)
            self.MODEL_WK_DIR = d.MODEL_WK_DIR
            self.MODEL_OUT_DIR = d.MODEL_OUT_DIR
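
# Illustrative sketch only (not part of this module): a concrete data source
# derived from GFDL_GCP_FileDataSourceBase must supply the abstract class
# attributes declared above. All Example* names below are hypothetical
# placeholders standing in for a real filename regex class, directory regex
# and attributes dataclass.
#
# class ExampleGFDLDataSource(GFDL_GCP_FileDataSourceBase):
#     _FileRegexClass = ExampleFileRegex          # parses metadata out of file names
#     _DirectoryRegex = example_directory_regex   # matches directories to search
#     _AttributesClass = ExampleDataSourceAttributes  # case attributes used by the query
#     _fetch_method = 'gcp'                       # force GCP instead of symlinking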

class DaskMultiFilePreprocessor(MDTFPreprocessorBase):
    """A :class:`MDTFPreprocessorBase` that uses xarray's dask support to
    preprocess model data provided as one or several netcdf files per variable.
    """
    _file_preproc_functions = util.abstract_attribute()

    def __init__(self, data_mgr, pod):
        super(DaskMultiFilePreprocessor, self).__init__(data_mgr, pod)
        # initialize PreprocessorFunctionBase objects
        self.file_preproc_functions = \
            [cls_(data_mgr, pod) for cls_ in self._file_preproc_functions]

    def edit_request(self, data_mgr, pod):
        """Edit the POD's data request, based on the child class's functionality.
        If the child class has a function that can transform data in format X to
        format Y and the POD requests X, this method should insert a
        backup/fallback request for Y.
        """
        for func in self.file_preproc_functions:
            func.edit_request(data_mgr, pod)
        super(DaskMultiFilePreprocessor, self).edit_request(data_mgr, pod)

    def read_dataset(self, var):
        """Open the multi-file Dataset specified by the ``local_data`` attribute
        of *var*, wrapping xarray's `open_mfdataset()
        <https://xarray.pydata.org/en/stable/generated/xarray.open_mfdataset.html>`__.
        """
        def _file_preproc(ds):
            for f in self.file_preproc_functions:
                ds = f.process(var, ds)
            return ds

        assert var.local_data
        if len(var.local_data) == 1:
            ds = self.read_one_file(var, var.local_data)
            return _file_preproc(ds)
        else:
            assert not var.is_static  # just to be safe
            var.log.debug("Loaded multi-file dataset of %d files:\n%s",
                len(var.local_data),
                '\n'.join(4 * ' ' + f"'{f}'" for f in var.local_data),
                tags=util.ObjectLogTag.IN_FILE)
            return xr.open_mfdataset(
                var.local_data,
                combine="by_coords",
                # only time-dependent variables and coords are concat'ed:
                data_vars="minimal", coords="minimal",
                # all non-concat'ed vars must be the same; global attrs can differ
                # from file to file; values in ds are taken from first file
                compat="equals",
                join="exact",      # raise ValueError if non-time dims conflict
                parallel=True,     # use dask
                preprocess=_file_preproc,
                **self.open_dataset_kwargs
            )
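
# Standalone illustration (an assumption for documentation purposes; not called
# anywhere by the framework) of the open_mfdataset() behavior read_dataset()
# relies on: the callable passed as ``preprocess`` is applied to each file's
# Dataset individually, before the per-file Datasets are combined along time.
def _example_open_mfdataset_preprocess(tmp_dir):
    import os
    import numpy as np
    import xarray as xr

    # write two single-timestep files for a fake variable 'tas'
    paths = []
    for i in range(2):
        ds = xr.Dataset(
            {"tas": (("time",), np.array([270.0 + i]))},
            coords={"time": np.array([float(i)])}
        )
        p = os.path.join(tmp_dir, f"tas_{i}.nc")
        ds.to_netcdf(p)
        paths.append(p)

    def _per_file(ds):
        # runs once per file, like _file_preproc() in read_dataset()
        ds["tas"].attrs["note"] = "touched by preprocess"
        return ds

    combined = xr.open_mfdataset(
        paths, combine="by_coords", data_vars="minimal", coords="minimal",
        compat="equals", join="exact", preprocess=_per_file
    )
    assert combined.sizes["time"] == 2  # the two files were concatenated
    return combined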

class MDTFPreprocessorBase(metaclass=util.MDTFABCMeta):
    """Base class for preprocessing data after it's been fetched, in order to
    put it into a format expected by PODs. The only functionality implemented
    here is parsing data axes and CF attributes; all other functionality is
    provided by :class:`PreprocessorFunctionBase` functions.
    """
    _functions = util.abstract_attribute()

    def __init__(self, data_mgr, pod):
        self.WK_DIR = data_mgr.MODEL_WK_DIR
        self.convention = data_mgr.convention
        self.pod_convention = pod.convention
        # HACK only used for _FillValue workaround in clean_output_encoding
        self.output_to_ncl = ('ncl' in pod.runtime_requirements)
        # initialize PreprocessorFunctionBase objects
        self.functions = [cls_(data_mgr, pod) for cls_ in self._functions]

    def edit_request(self, data_mgr, pod):
        """Edit POD's data request, based on the child class's functionality. If
        the child class has a function that can transform data in format X to
        format Y and the POD requests X, this method should insert a
        backup/fallback request for Y.
        """
        for func in self.functions:
            func.edit_request(data_mgr, pod)

    # arguments passed to xr.open_dataset and xr.open_mfdataset
    open_dataset_kwargs = {
        "engine": "netcdf4",
        "decode_cf": False,      # all decoding done by DatasetParser
        "decode_coords": False,  # so disable it here
        "decode_times": False,
        "use_cftime": False
    }
    # arguments passed to xr.to_netcdf
    save_dataset_kwargs = {
        "engine": "netcdf4",
        "format": "NETCDF4_CLASSIC"  # NETCDF3* not supported by this engine (?)
    }

    def read_one_file(self, var, path_list):
        if len(path_list) != 1:
            raise ValueError(f"{var.full_name}: Expected one file, got {path_list}.")
        _log.debug("xr.open_dataset on %s", path_list[0])
        return xr.open_dataset(
            path_list[0],
            **self.open_dataset_kwargs
        )

    @abc.abstractmethod
    def read_dataset(self, var):
        pass  # return ds

    def clean_output_encoding(self, var, ds):
        """Xarray .to_netcdf raises an error if attributes set on a variable
        have the same name as those used in its encoding, even if their values
        are the same. Delete these attributes from the attrs dict prior to
        writing, after checking equality of values.
        """
        def _clean_dict(obj):
            name = getattr(obj, 'name', 'dataset')
            encoding = getattr(obj, 'encoding', dict())
            attrs = getattr(obj, 'attrs', dict())
            for k, v in encoding.items():
                if k in attrs:
                    if isinstance(attrs[k], str) and isinstance(v, str):
                        compare_ = (attrs[k].lower() != v.lower())
                    else:
                        compare_ = (attrs[k] != v)
                    if compare_ and k.lower() != 'source':
                        _log.warning("Conflict in '%s' attribute of %s: %s != %s.",
                            k, name, v, attrs[k])
                    del attrs[k]

        for vv in ds.variables.values():
            _clean_dict(vv)
        _clean_dict(ds)

        if not getattr(var, 'is_static', True):
            t_coord = var.T
            ds_T = ds[t_coord.name]
            # ensure we set time units in as many places as possible
            if 'units' in ds_T.attrs and 'units' not in ds_T.encoding:
                ds_T.encoding['units'] = ds_T.attrs['units']
            if t_coord.has_bounds:
                ds[t_coord.bounds].encoding['units'] = ds_T.encoding['units']

        for k, v in ds.variables.items():
            # First condition: unset _FillValue attribute for all independent
            # variables (coordinates and their bounds) as per CF convention but
            # contrary to xarray default; see
            # https://github.com/pydata/xarray/issues/1598.
            # Second condition: 'NaN' not a valid _FillValue in NCL for any
            # variable; see
            # https://www.ncl.ucar.edu/Support/talk_archives/2012/1689.html
            old_fillvalue = v.encoding.get('_FillValue', np.nan)
            if k != var.translation.name \
                or (self.output_to_ncl and np.isnan(old_fillvalue)):
                v.encoding['_FillValue'] = None
                if '_FillValue' in v.attrs:
                    del v.attrs['_FillValue']
        return ds

    def write_dataset(self, var, ds):
        # TODO: remove any netcdf Variables that were present in file (and ds)
        # but not needed for request
        path_str = util.abbreviate_path(var.dest_path, self.WK_DIR, '$WK_DIR')
        _log.info("Writing to %s", path_str)
        os.makedirs(os.path.dirname(var.dest_path), exist_ok=True)
        _log.debug("xr.Dataset.to_netcdf on %s", var.dest_path)
        ds = self.clean_output_encoding(var, ds)
        if var.is_static:
            unlimited_dims = []
        else:
            unlimited_dims = [var.T.name]
        ds.to_netcdf(
            path=var.dest_path,
            mode='w',
            **self.save_dataset_kwargs,
            unlimited_dims=unlimited_dims
        )
        ds.close()

    def process(self, var):
        """Top-level wrapper for doing all preprocessing of data files.
        """
        # load dataset
        try:
            ds = self.read_dataset(var)
            ds = xr_parser.DatasetParser().parse(ds, var)
        except Exception as exc:
            raise util.DataPreprocessError((f"Error in read/parse data for "
                f"{var.full_name}."), var) from exc
        # execute functions
        for f in self.functions:
            try:
                _log.debug("Preprocess %s: call %s",
                    var.full_name, f.__class__.__name__)
                ds = f.process(var, ds)
            except Exception as exc:
                raise util.DataPreprocessError((f"Preprocessing on {var.full_name} "
                    f"failed at {f.__class__.__name__}."), var) from exc
        # write dataset
        try:
            self.write_dataset(var, ds)
        except Exception as exc:
            raise util.DataPreprocessError((f"Error in writing data for "
                f"{var.full_name}."), var) from exc
        del ds  # shouldn't be necessary
        _log.debug("Successful preprocessor exit on %s.", var)
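
# Standalone illustration (an assumption for documentation purposes; not called
# anywhere by the framework) of the attrs-vs-encoding conflict that
# clean_output_encoding() works around: when the same key (here '_FillValue')
# appears in both .attrs and .encoding, xarray's to_netcdf() raises a
# ValueError even if the two values agree, so the attribute copy must be
# deleted before writing.
def _example_attrs_encoding_conflict(out_path):
    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(3.0), dims="time", name="tas")
    da.attrs["_FillValue"] = -999.0
    da.encoding["_FillValue"] = -999.0
    try:
        da.to_netcdf(out_path)  # ValueError: "failed to prevent overwriting ..."
    except ValueError:
        del da.attrs["_FillValue"]  # what _clean_dict() does, after comparing values
        da.to_netcdf(out_path)      # now writes cleanly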