def open_mf_wrf_dataset(paths, chunks=None, compat='no_conflicts', lock=None,
                        preprocess=None):
    """Open multiple WRF files as a single WRF dataset.

    Requires dask to be installed. Note that if your files are sliced by time,
    certain diagnostic variables computed out of accumulated variables (e.g.
    PRCP) won't be available, because they are not computable lazily.

    This code is adapted from xarray's open_mfdataset function. The xarray
    license is reproduced in the salem/licenses directory.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form `path/to/my/files/*.nc` or an
        explicit list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``. By default,
        chunks will be chosen to load entire input files into memory at once.
        This has a major impact on performance: please see xarray's full
        documentation for more details.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.

    Returns
    -------
    xarray.Dataset
    """
    if isinstance(paths, str):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    # TODO: current workaround to dask thread problems
    import dask
    dask.config.set(scheduler='single-threaded')

    if lock is None:
        lock = NETCDF4_PYTHON_LOCK
    try:
        datasets = [open_wrf_dataset(p, chunks=chunks or {}, lock=lock)
                    for p in paths]
    except TypeError as err:
        if 'lock' not in str(err):
            raise
        # New xarray backends
        datasets = [open_wrf_dataset(p, chunks=chunks or {}) for p in paths]

    orig_datasets = datasets

    def ds_closer():
        for ods in orig_datasets:
            ods.close()

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    try:
        combined = xr.combine_nested(datasets, concat_dim='time',
                                     compat=compat)
    except AttributeError:
        combined = xr.auto_combine(datasets, concat_dim='time', compat=compat)
    combined.attrs = datasets[0].attrs

    try:
        combined.set_close(ds_closer)
    except AttributeError:
        from xarray.backends.api import _MultiFileCloser
        mfc = _MultiFileCloser([ods._file_obj for ods in orig_datasets])
        combined._file_obj = mfc

    # drop accumulated vars if needed (TODO: make this not hard coded)
    vns = ['PRCP', 'PRCP_C', 'PRCP_NC']
    vns = [vn for vn in vns if vn in combined.variables]
    try:
        combined = combined.drop_vars(vns)
    except AttributeError:
        combined = combined.drop(vns)

    return combined
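
# Example usage -- a minimal sketch only: the file pattern, chunk size and the
# 'T2' variable name below are illustrative placeholders, not prescribed by the
# function above.
ds = open_mf_wrf_dataset('/data/wrf/wrfout_d01_2008-10-*',
                         chunks={'time': 10})
t2_mean = ds['T2'].mean(dim='time')  # lazy; evaluated on .load() or .values
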
def open_mfbpchdataset(paths, concat_dim='time', compat='no_conflicts',
                       preprocess=None, lock=None, **kwargs):
    """ Open multiple bpch files as a single dataset.

    You must have dask installed for this to work, as this greatly
    simplifies issues relating to multi-file I/O.

    Also, please note that this is not a very performant routine. I/O is
    still limited by the fact that we need to manually scan/read through
    each bpch file so that we can figure out what its contents are, since
    that metadata isn't saved anywhere. So this routine will actually
    sequentially load Datasets for each bpch file, then concatenate them
    along the "time" axis. You may wish to simply process each file
    individually, coerce to NetCDF, and then ingest through xarray as
    normal.

    Parameters
    ----------
    paths : list of strs
        Filenames to load; order doesn't matter as they will be
        lexicographically sorted before we read in the data
    concat_dim : str, default='time'
        Dimension to concatenate Datasets over. We default to "time" since
        this is how GEOS-Chem splits output files
    compat : str (optional)
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable (optional)
        A pre-processing function to apply to each Dataset prior to
        concatenation
    lock : False, True, or threading.Lock (optional)
        Passed to :py:func:`dask.array.from_array`. By default, xarray
        employs a per-variable lock when reading data from NetCDF files,
        but this model has not yet been extended or implemented for bpch
        files and so this is not actually used. However, it is likely
        necessary before dask's multi-threaded backend can be used
    **kwargs : optional
        Additional arguments to pass to :py:func:`xbpch.open_bpchdataset`.

    """
    from xarray.backends.api import _MultiFileCloser

    # TODO: Include file locks?

    # Check for dask
    dask = kwargs.pop('dask', False)
    if not dask:
        raise ValueError(
            "Reading multiple files without dask is not supported")
    kwargs['dask'] = True

    # Add th
    if isinstance(paths, str):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError("No paths to files were passed into open_mfbpchdataset")

    datasets = [open_bpchdataset(filename, **kwargs) for filename in paths]
    bpch_objs = [ds._file_obj for ds in datasets]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    # Concatenate over time
    combined = xr.auto_combine(datasets, compat=compat, concat_dim=concat_dim)
    combined._file_obj = _MultiFileCloser(bpch_objs)
    combined.attrs = datasets[0].attrs

    ts = get_timestamp()
    fns_str = " ".join(paths)
    combined.attrs['history'] = (
        "{}: Processed/loaded by xbpch-{} from {}".format(ts, ver, fns_str))

    return combined
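
# Example usage -- a sketch only: the bpch file names below are placeholders,
# and tracerinfo_file/diaginfo_file are assumed keywords forwarded via **kwargs
# to xbpch.open_bpchdataset. dask=True is mandatory, as enforced above.
ds_bpch = open_mfbpchdataset(
    ['ND49_20160101.bpch', 'ND49_20160102.bpch'],
    dask=True,
    tracerinfo_file='tracerinfo.dat',
    diaginfo_file='diaginfo.dat',
)
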
def open_mf_wrf_dataset(paths, chunks=None, compat='no_conflicts', lock=None,
                        preprocess=None):
    """Open multiple WRF files as a single WRF dataset.

    Requires dask to be installed. Note that if your files are sliced by time,
    certain diagnostic variables computed out of accumulated variables (e.g.
    PRCP) won't be available, because they are not computable lazily.

    This code is adapted from xarray's open_mfdataset function. The xarray
    license is reproduced in the salem/licenses directory.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``. By default,
        chunks will be chosen to load entire input files into memory at once.
        This has a major impact on performance: please see xarray's full
        documentation for more details.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.

    Returns
    -------
    xarray.Dataset
    """
    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    # TODO: current workaround to dask thread problems
    dask.set_options(get=dask.async.get_sync)

    if lock is None:
        lock = _default_lock(paths[0], 'netcdf4')
    datasets = [open_wrf_dataset(p, chunks=chunks or {}, lock=lock)
                for p in paths]
    file_objs = [ds._file_obj for ds in datasets]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    # TODO: add compat=compat when xarray 0.9.0 is out
    combined = xr.auto_combine(datasets, concat_dim='time')
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    # drop accumulated vars if needed (TODO: make this not hard coded)
    vns = ['PRCP', 'PRCP_C', 'PRCP_NC']
    vns = [vn for vn in vns if vn in combined.variables]
    combined = combined.drop(vns)

    return combined
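
# Compatibility note (sketch): this older variant relies on APIs removed from
# current dask/xarray (dask.set_options with dask.async.get_sync, xr.auto_combine,
# and _default_lock). The newer version of this function above already uses the
# modern equivalents, roughly:
#
#     dask.config.set(scheduler='single-threaded')   # replaces dask.set_options(get=...)
#     xr.combine_nested(datasets, concat_dim='time', # replaces xr.auto_combine(...)
#                       compat=compat)
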
def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts', preprocess=None, engine=None,
                   lock=None, **kwargs):
    '''Open multiple files as a single dataset.

    This function is adapted from the xarray function of the same name. The
    main difference is that instead of failing on files that do not exist,
    this function keeps processing. Requires dask to be installed. Attributes
    from the first dataset file are used for the combined dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``. By default,
        chunks will be chosen to load entire input files into memory at once.
        This has a major impact on performance: please see the full
        documentation for more details.
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You
        only need to provide this argument if the dimension along which you
        want to concatenate is not a dimension in the original datasets,
        e.g., if you want to stack a collection of 2D arrays along a third
        dimension. By default, xarray attempts to infer this argument by
        examining component files. Set ``concat_dim=None`` explicitly to
        disable concatenation.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default
        engine is chosen based on available dependencies, with a preference
        for 'netcdf4'.
    autoclose : bool, optional
        If True, automatically close files to avoid OS Error of too many
        files being open. However, this option doesn't work with streams,
        e.g., BytesIO.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    auto_combine
    open_dataset
    '''
    filterwarnings('ignore', 'elementwise comparison failed;')
    filterwarnings('ignore', 'numpy equal will not check object')

    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)
    datasets = [_open_dataset(p, engine=engine, chunks=chunks or {},
                              lock=lock, **kwargs) for p in paths]
    file_objs = [ds._file_obj for ds in datasets if ds is not None]

    if isinstance(concat_dim, pd.Index):
        name = concat_dim.name
        concat_dim = concat_dim.take(
            [ind for ind, ds in enumerate(datasets) if ds is not None])
        concat_dim.name = name

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets if ds is not None]

    if concat_dim is _CONCAT_DIM_DEFAULT:
        combined = auto_combine(datasets, compat=compat)
    else:
        combined = auto_combine(datasets, concat_dim=concat_dim,
                                compat=compat)
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    return combined
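
# Example usage -- a sketch only: the file names below are placeholders. Per the
# docstring and the `ds is not None` filtering above, files that do not exist
# are skipped rather than raising, and a pandas Index passed as concat_dim is
# trimmed to match the files actually opened.
import pandas as pd

dates = pd.date_range('2000-01-01', '2000-01-05', name='time')
paths = ['ocean_{:%Y%m%d}.nc'.format(d) for d in dates]
ds_all = open_mfdataset(paths, concat_dim=dates, chunks={'time': 1})
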