Example #1
class GFDL_GCP_FileDataSourceBase(
    data_manager.OnTheFlyDirectoryHierarchyQueryMixin,
    GCPFetchMixin,
    data_manager.DataframeQueryDataSourceBase
):
    """Base class for DataSources that access data on GFDL's internal filesystems
    using GCP, and which may be invoked via frepp.
    """
    _DiagnosticClass = GfdlDiagnostic
    _PreprocessorClass = preprocessor.DefaultPreprocessor

    _FileRegexClass = util.abstract_attribute()
    _DirectoryRegex = util.abstract_attribute()
    _AttributesClass = util.abstract_attribute()
    _fetch_method = 'auto' # symlink if not on /archive, else gcp

    def __init__(self, case_dict, parent):
        self.catalog = None
        super(GFDL_GCP_FileDataSourceBase, self).__init__(case_dict, parent)

        config = core.ConfigManager()
        self.frepp_mode = config.get('frepp', False)
        self.dry_run = config.get('dry_run', False)
        self.timeout = config.get('file_transfer_timeout', 0)

        if self.frepp_mode:
            paths = core.PathManager()
            self.overwrite = True
            # this flag normally controls whether the config file and .tar output
            # are overwritten; in frepp mode we do want them overwritten
            self.file_overwrite = True
            # if overwrite=False, WK_DIR & OUT_DIR will have been set to a
            # unique name in the parent's init. Set them back so they will be
            # overwritten.
            d = paths.model_paths(self, overwrite=True)
            self.MODEL_WK_DIR = d.MODEL_WK_DIR
            self.MODEL_OUT_DIR = d.MODEL_OUT_DIR
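
# --- Illustrative sketch, not part of the original source ---
# The class above calls super().__init__(), which Python dispatches through the
# method resolution order of its mixin bases (cooperative multiple inheritance).
# A minimal, self-contained demonstration of that pattern; all class names below
# are hypothetical placeholders, not MDTF classes:

class _QueryMixin:
    def __init__(self, case_dict):
        print("query mixin setup")
        super().__init__(case_dict)

class _FetchMixin:
    def __init__(self, case_dict):
        print("fetch mixin setup")
        super().__init__(case_dict)

class _DataSourceBase:
    def __init__(self, case_dict):
        print("base setup for", case_dict.get("CASENAME"))

class _ExampleDataSource(_QueryMixin, _FetchMixin, _DataSourceBase):
    pass

# _ExampleDataSource({"CASENAME": "test"}) prints one line per base class,
# in method-resolution order.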
Example #2
class DaskMultiFilePreprocessor(MDTFPreprocessorBase):
    """A :class:`MDTFPreprocessorBase` that uses xarray's dask support to
    preprocess model data provided as one or more netCDF files per
    variable.
    """
    _file_preproc_functions = util.abstract_attribute()

    def __init__(self, data_mgr, pod):
        super(DaskMultiFilePreprocessor, self).__init__(data_mgr, pod)
        # initialize PreprocessorFunctionBase objects
        self.file_preproc_functions = \
            [cls_(data_mgr, pod) for cls_ in self._file_preproc_functions]

    def edit_request(self, data_mgr, pod):
        """Edit POD's data request, based on the child class's functionality. If
        the child class has a function that can transform data in format X to
        format Y and the POD requests X, this method should insert a
        backup/fallback request for Y.
        """
        for func in self.file_preproc_functions:
            func.edit_request(data_mgr, pod)
        super(DaskMultiFilePreprocessor, self).edit_request(data_mgr, pod)

    def read_dataset(self, var):
        """Open multi-file Dataset specified by the ``local_data`` attribute of
        *var*, wrapping xarray `open_mfdataset()
        <https://xarray.pydata.org/en/stable/generated/xarray.open_mfdataset.html>`__.
        """
        def _file_preproc(ds):
            for f in self.file_preproc_functions:
                ds = f.process(var, ds)
            return ds

        assert var.local_data
        if len(var.local_data) == 1:
            ds = self.read_one_file(var, var.local_data)
            return _file_preproc(ds)
        else:
            assert not var.is_static  # just to be safe
            var.log.debug("Loaded multi-file dataset of %d files:\n%s",
                          len(var.local_data),
                          '\n'.join(4 * ' ' + f"'{f}'"
                                    for f in var.local_data),
                          tags=util.ObjectLogTag.IN_FILE)
            return xr.open_mfdataset(
                var.local_data,
                combine="by_coords",
                # only time-dependent variables and coords are concat'ed:
                data_vars="minimal",
                coords="minimal",
                # all non-concat'ed vars must be the same; global attrs can differ
                # from file to file; values in ds are taken from first file
                compat="equals",
                join="exact",  # raise ValueError if non-time dims conflict
                parallel=True,  # use dask
                preprocess=_file_preproc,
                **self.open_dataset_kwargs)
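
# --- Illustrative sketch, not part of the original source ---
# A minimal, self-contained demonstration of the xr.open_mfdataset() call
# pattern used in read_dataset() above: a per-file preprocess hook plus strict
# "by_coords" combination. The file paths and the toy hook are placeholders.

import xarray as xr

def _demo_multi_file_open(paths):
    def _per_file(ds):
        # stand-in for the PreprocessorFunctionBase.process() calls above
        ds.attrs["per_file_hook"] = "applied"
        return ds

    return xr.open_mfdataset(
        paths,
        combine="by_coords",
        data_vars="minimal",  # only concatenate variables that depend on time
        coords="minimal",
        compat="equals",      # non-concatenated variables must match exactly
        join="exact",         # raise if non-time dimensions disagree
        parallel=True,        # open each file in a dask task
        preprocess=_per_file,
    )

# Example usage with two toy yearly files (written here only for illustration):
# for yr in (2000, 2001):
#     times = xr.cftime_range(f"{yr}-01-01", periods=12, freq="MS")
#     xr.Dataset({"tas": ("time", [float(m) for m in range(12)])},
#                coords={"time": times}).to_netcdf(f"tas_{yr}.nc")
# ds = _demo_multi_file_open([f"tas_{yr}.nc" for yr in (2000, 2001)])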
class MDTFPreprocessorBase(metaclass=util.MDTFABCMeta):
    """Base class for preprocessing data after it's been fetched, in order to 
    put it into a format expected by PODs. The only functionality implemented 
    here is parsing data axes and CF attributes; all other functionality is 
    provided by :class:`PreprocessorFunctionBase` functions.
    """
    _functions = util.abstract_attribute()

    def __init__(self, data_mgr, pod):
        self.WK_DIR = data_mgr.MODEL_WK_DIR
        self.convention = data_mgr.convention
        self.pod_convention = pod.convention

        # HACK only used for _FillValue workaround in clean_output_encoding
        self.output_to_ncl = ('ncl' in pod.runtime_requirements)

        # initialize PreprocessorFunctionBase objects
        self.functions = [cls_(data_mgr, pod) for cls_ in self._functions]

    def edit_request(self, data_mgr, pod):
        """Edit POD's data request, based on the child class's functionality. If
        the child class has a function that can transform data in format X to 
        format Y and the POD requests X, this method should insert a 
        backup/fallback request for Y.
        """
        for func in self.functions:
            func.edit_request(data_mgr, pod)

    # arguments passed to xr.open_dataset and xr.open_mfdataset
    open_dataset_kwargs = {
        "engine": "netcdf4",
        "decode_cf": False,     # all decoding done by DatasetParser
        "decode_coords": False, # so disable it here
        "decode_times": False,
        "use_cftime": False
    }
    # arguments passed to xr.to_netcdf
    save_dataset_kwargs = {
        "engine": "netcdf4",
        "format": "NETCDF4_CLASSIC" # NETCDF3* not supported by this engine (?)
    }

    def read_one_file(self, var, path_list):
        """Open a single file with xr.open_dataset, after verifying that
        *path_list* contains exactly one path.
        """
        if len(path_list) != 1:
            raise ValueError(f"{var.full_name}: Expected one file, got {path_list}.")
        _log.debug("xr.open_dataset on %s", path_list[0])
        return xr.open_dataset(
            path_list[0], 
            **self.open_dataset_kwargs
        )

    @abc.abstractmethod
    def read_dataset(self, var):
        pass # return ds

    def clean_output_encoding(self, var, ds):
        """Xarray .to_netcdf raises an error if attributes set on a variable have
        the same name as those used in its encoding, even if their values are the
        same. Delete these attributes from the attrs dict prior to writing, after
        checking equality of values.
        """
        def _clean_dict(obj):
            name = getattr(obj, 'name', 'dataset')
            encoding = getattr(obj, 'encoding', dict())
            attrs = getattr(obj, 'attrs', dict())
            for k,v in encoding.items():
                if k in attrs:
                    if isinstance(attrs[k], str) and isinstance(v, str):
                        compare_ = (attrs[k].lower() != v.lower())
                    else:
                        compare_ = (attrs[k] != v)
                    if compare_ and k.lower() != 'source':
                        _log.warning("Conflict in '%s' attribute of %s: %s != %s.",
                            k, name, v, attrs[k])
                    del attrs[k]   
            
        for vv in ds.variables.values():
            _clean_dict(vv)
        _clean_dict(ds)

        if not getattr(var, 'is_static', True):
            t_coord = var.T
            ds_T = ds[t_coord.name]
            # ensure we set time units in as many places as possible
            if 'units' in ds_T.attrs and 'units' not in ds_T.encoding:
                ds_T.encoding['units'] = ds_T.attrs['units']
            if t_coord.has_bounds:
                ds[t_coord.bounds].encoding['units'] = ds_T.encoding['units']

        for k, v in ds.variables.items():
            # First condition: unset _FillValue attribute for all independent 
            # variables (coordinates and their bounds) as per CF convention but 
            # contrary to xarray default; see 
            # https://github.com/pydata/xarray/issues/1598.
            # Second condition: 'NaN' not a valid _FillValue in NCL for any
            # variable; see 
            # https://www.ncl.ucar.edu/Support/talk_archives/2012/1689.html
            old_fillvalue = v.encoding.get('_FillValue', np.nan)
            if k != var.translation.name \
                or (self.output_to_ncl and np.isnan(old_fillvalue)):
                v.encoding['_FillValue'] = None
                if '_FillValue' in v.attrs:
                    del v.attrs['_FillValue']
        return ds

    def write_dataset(self, var, ds):
        # TODO: remove any netcdf Variables that were present in file (and ds) 
        # but not needed for request
        path_str = util.abbreviate_path(var.dest_path, self.WK_DIR, '$WK_DIR')
        _log.info("Writing to %s", path_str)
        os.makedirs(os.path.dirname(var.dest_path), exist_ok=True)
        _log.debug("xr.Dataset.to_netcdf on %s", var.dest_path)
        ds = self.clean_output_encoding(var, ds)
        if var.is_static:
            unlimited_dims = []
        else:
            unlimited_dims = [var.T.name]

        ds.to_netcdf(
            path=var.dest_path,
            mode='w',
            **self.save_dataset_kwargs,
            unlimited_dims=unlimited_dims
        )
        ds.close()

    def process(self, var):
        """Top-level wrapper for doing all preprocessing of data files.
        """
        # load dataset
        try:
            ds = self.read_dataset(var)
            ds = xr_parser.DatasetParser().parse(ds, var)
        except Exception as exc:
            raise util.DataPreprocessError((f"Error in read/parse data for "
                f"{var.full_name}."), var) from exc
        # execute functions
        for f in self.functions:
            try:
                _log.debug("Preprocess %s: call %s", var.full_name, f.__class__.__name__)
                ds = f.process(var, ds)
            except Exception as exc:
                raise util.DataPreprocessError((f"Preprocessing on {var.full_name} "
                    f"failed at {f.__class__.__name__}."), var) from exc
        # write dataset
        try:
            self.write_dataset(var, ds)
        except Exception as exc:
            raise util.DataPreprocessError((f"Error in writing data for "
                f"{var.full_name}."), var) from exc
        del ds # shouldn't be necessary
        _log.debug("Successful preprocessor exit on %s.", var)