def open_mf_wrf_dataset(paths, chunks=None, compat='no_conflicts', lock=None,
                        preprocess=None):
    """Open multiple WRF files as a single WRF dataset.

    Requires dask to be installed. Note that if your files are sliced by
    time, certain diagnostic variables computed from accumulated variables
    (e.g. PRCP) won't be available, because they cannot be computed lazily.

    This code is adapted from xarray's open_mfdataset function. The xarray
    license is reproduced in the salem/licenses directory.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``. By default,
        chunks will be chosen to load entire input files into memory at
        once. This has a major impact on performance: please see xarray's
        full documentation for more details.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.

    Returns
    -------
    xarray.Dataset
    """
    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    # TODO: current workaround to dask thread problems
    dask.set_options(get=dask.async.get_sync)

    if lock is None:
        lock = _default_lock(paths[0], 'netcdf4')
    datasets = [open_wrf_dataset(p, chunks=chunks or {}, lock=lock)
                for p in paths]
    file_objs = [ds._file_obj for ds in datasets]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    # TODO: add compat=compat when xarray 0.9 is out
    combined = xr.auto_combine(datasets, concat_dim='time')
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    # drop accumulated vars if needed (TODO: make this not hard coded)
    vns = ['PRCP', 'PRCP_C', 'PRCP_NC']
    vns = [vn for vn in vns if vn in combined.variables]
    combined = combined.drop(vns)

    return combined
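# A minimal usage sketch of ``open_mf_wrf_dataset``: open a set of
# time-sliced WRF output files as one lazy dataset and reduce a variable
# over time. The glob pattern, the chunk sizes and the ``T2`` variable are
# illustrative assumptions, not names fixed by this module.
def _example_open_mf_wrf_dataset():
    ds = open_mf_wrf_dataset('path/to/wrfout_d01_*.nc', chunks={'time': 1})
    # Variables are dask-backed, so this reduction stays lazy until
    # .load() or .compute() is called on the result.
    return ds['T2'].mean(dim='time')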
def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
                   compat='no_conflicts', preprocess=None, engine=None,
                   lock=None, **kwargs):
    '''Open multiple files as a single dataset.

    This function is adapted from the xarray function of the same name.
    The main difference is that instead of failing on files that do not
    exist, this function keeps processing.

    Requires dask to be installed.

    Attributes from the first dataset file are used for the combined
    dataset.

    Parameters
    ----------
    paths : str or sequence
        Either a string glob in the form "path/to/my/files/*.nc" or an
        explicit list of files to open.
    chunks : int or dict, optional
        Dictionary with keys given by dimension names and values given by
        chunk sizes. In general, these should divide the dimensions of each
        dataset. If int, chunk each dimension by ``chunks``. By default,
        chunks will be chosen to load entire input files into memory at
        once. This has a major impact on performance: please see the full
        documentation for more details.
    concat_dim : None, str, DataArray or Index, optional
        Dimension to concatenate files along. This argument is passed on to
        :py:func:`xarray.auto_combine` along with the dataset objects. You
        only need to provide this argument if the dimension along which you
        want to concatenate is not a dimension in the original datasets,
        e.g., if you want to stack a collection of 2D arrays along a third
        dimension. By default, xarray attempts to infer this argument by
        examining component files. Set ``concat_dim=None`` explicitly to
        disable concatenation.
    compat : {'identical', 'equals', 'broadcast_equals', 'no_conflicts'}, optional
        String indicating how to compare variables of the same name for
        potential conflicts when merging:

        - 'broadcast_equals': all values must be equal when variables are
          broadcast against each other to ensure common dimensions.
        - 'equals': all values and dimensions must be the same.
        - 'identical': all values, dimensions and attributes must be the
          same.
        - 'no_conflicts': only values which are not null in both datasets
          must be equal. The returned dataset then contains the combination
          of all non-null values.
    preprocess : callable, optional
        If provided, call this function on each dataset prior to
        concatenation.
    engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio'}, optional
        Engine to use when reading files. If not provided, the default
        engine is chosen based on available dependencies, with a preference
        for 'netcdf4'.
    autoclose : bool, optional
        If True, automatically close files to avoid an OS error from having
        too many files open at once. However, this option doesn't work with
        streams, e.g., BytesIO.
    lock : False, True or threading.Lock, optional
        This argument is passed on to :py:func:`dask.array.from_array`. By
        default, a per-variable lock is used when reading data from netCDF
        files with the netcdf4 and h5netcdf engines to avoid issues with
        concurrent access when using dask's multithreaded backend.
    **kwargs : optional
        Additional arguments passed on to :py:func:`xarray.open_dataset`.

    Returns
    -------
    xarray.Dataset

    See Also
    --------
    auto_combine
    open_dataset
    '''
    filterwarnings('ignore', 'elementwise comparison failed;')
    filterwarnings('ignore', 'numpy equal will not check object')

    if isinstance(paths, basestring):
        paths = sorted(glob(paths))
    if not paths:
        raise IOError('no files to open')

    if lock is None:
        lock = _default_lock(paths[0], engine)

    datasets = [_open_dataset(p, engine=engine, chunks=chunks or {},
                              lock=lock, **kwargs) for p in paths]
    file_objs = [ds._file_obj for ds in datasets if ds is not None]

    if isinstance(concat_dim, pd.Index):
        # keep only the index entries matching files that could be opened
        name = concat_dim.name
        concat_dim = concat_dim.take(
            [ind for ind, ds in enumerate(datasets) if ds is not None])
        concat_dim.name = name

    # drop entries for files that could not be opened before combining
    datasets = [ds for ds in datasets if ds is not None]

    if preprocess is not None:
        datasets = [preprocess(ds) for ds in datasets]

    if concat_dim is _CONCAT_DIM_DEFAULT:
        combined = auto_combine(datasets, compat=compat)
    else:
        combined = auto_combine(datasets, concat_dim=concat_dim,
                                compat=compat)
    combined._file_obj = _MultiFileCloser(file_objs)
    combined.attrs = datasets[0].attrs

    return combined
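# A minimal usage sketch of ``open_mfdataset``: pass an explicit pandas
# Index as ``concat_dim`` so that files which cannot be opened simply drop
# out of the concatenation along with their index entries. The file names,
# dates and chunk sizes below are illustrative assumptions only.
def _example_open_mfdataset():
    dates = pd.Index(pd.date_range('2000-01-01', periods=3, freq='D'),
                     name='time')
    paths = ['data/file_2000-01-01.nc',
             'data/file_2000-01-02.nc',
             'data/file_2000-01-03.nc']
    # Missing files are skipped and the matching entries of ``dates`` are
    # removed before the remaining datasets are combined.
    return open_mfdataset(paths, concat_dim=dates, chunks={'time': 1})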