def parse_dir(path, depth=12, ext='*.nc', **kwargs):
    """Retrieve all netCDF files under ``path`` (including sub-directories)
    and parse their names according to a given pattern (``file_fmt``).

    Parameters
    ----------
    path : str or pathlib.Path
        Root directory to search; a leading ``~`` is expanded.
    depth : int, optional
        Maximum directory depth, passed to ``find -maxdepth``.
    ext : str, optional
        Case-insensitive filename glob, passed to ``find -iname``.
    **kwargs
        ``file_fmt``: explicit filename template. If absent, the template is
        guessed from the first file found via ``CMIPparser``.
        ``cmip_version``: forwarded to ``CMIPparser`` when guessing.

    Returns
    -------
    pd.DataFrame
        One row per file: the fields parsed from the filename plus a
        ``path`` column with the full path. Empty frame if nothing matched.
    """
    path = Path(path).expanduser()
    # Pass an argument list with shell=False so `path`/`ext` can never be
    # interpreted by a shell (the old f-string + shell=True was injectable).
    cmd = ['find', path.as_posix(), '-maxdepth', str(depth), '-iname', ext]
    find_res = subprocess.run(cmd, capture_output=True, text=True)
    # NOTE: splitting on whitespace means paths containing spaces are not
    # supported (same limitation as the original implementation).
    all_files = find_res.stdout.split()
    if not all_files:
        # Nothing matched: previously this crashed on all_files[0] / zip(*[]).
        return pd.DataFrame()

    _dirs, files = zip(*map(os.path.split, all_files))

    # Guess the filename template from the first file unless given explicitly.
    cmip_version = kwargs.pop('cmip_version', None)
    if 'file_fmt' in kwargs:
        file_fmt = kwargs.pop('file_fmt')
    else:
        parser = CMIPparser(cmip_version, guess_by=all_files[0])
        file_fmt = parser.filename_template

    # TODO: try-except? Or how can you filter out those files that don't match file_fmt?
    # TODO: how to take into account gridspec files and normal temporal files in the same directory?
    # TODO: Check cmip.py at https://github.com/NCAR/intake-esm-datastore/blob/master/builders/cmip.py
    rev_dict = reverse_formats(file_fmt, files)
    rev_dict['path'] = all_files
    return pd.DataFrame.from_dict(rev_dict)
def _open_files(self, files):
    """Open each file with rasterio and concatenate along ``self.dim``.

    When ``self.pattern`` is set, every field parsed from the file names
    is attached as a coordinate along the concat dimension, with each
    file's value repeated across that file's extent along the dimension.
    """
    arrays = []
    for f in files:
        arrays.append(xr.open_rasterio(f, chunks=self.chunks, **self._kwargs))
    combined = xr.concat(arrays, dim=self.dim)

    coords = {}
    if self.pattern:
        parsed = reverse_formats(self.pattern, files)
        for field, values in parsed.items():
            pieces = []
            for idx, value in enumerate(values):
                # A file may span several entries along `dim` (default 1);
                # repeat the parsed value over that extent.
                extent = arrays[idx].sizes.get(self.dim, 1)
                pieces.append(xr.DataArray(np.full(extent, value), dims=self.dim))
            coords[field] = xr.concat(pieces, dim=self.dim)

    return combined.assign_coords(**coords).chunk(self.chunks)
def _open_files(self, files):
    """
    This function is called when the data source refers to more than one
    file either as a list or a glob. It sets up the dask graph for opening
    the files, attaching any fields parsed from ``self.pattern`` as
    coordinates along the concat dimension(s).

    Parameters
    ----------
    files : iter
        List of file objects (each exposing a ``.path`` attribute).
    """
    import pandas as pd
    from xarray import DataArray
    # NOTE(review): multireader appears to concatenate along a dimension
    # named 'dim_0' when concat_dim is a list — the MultiIndex branch below
    # relies on that name; confirm against multireader's implementation.
    out = multireader(files, self.chunks, self.concat_dim, **self._kwargs)
    if not self.pattern:
        # No filename pattern: nothing to parse, return the raw concat.
        return out
    coords = {}
    filenames = [f.path for f in files]
    # Map each template field -> list of per-file parsed values.
    field_values = reverse_formats(self.pattern, filenames)
    if isinstance(self.concat_dim, list):
        # Multiple concat dims: every one must be parseable from the pattern,
        # since their values come from the filenames.
        if not set(field_values.keys()).issuperset(set(self.concat_dim)):
            raise KeyError('All concat_dims should be in pattern.')
        # One MultiIndex tuple per file, levels ordered like concat_dim.
        index = pd.MultiIndex.from_tuples(
            zip(*(field_values[dim] for dim in self.concat_dim)),
            names=self.concat_dim)
        # Remaining (non-concat) fields become plain coordinates along the
        # provisional 'dim_0' dimension. ('dim_0') is just the string, which
        # xarray accepts as a single dim name.
        coords = {
            k: DataArray(v, dims=('dim_0'))
            for k, v in field_values.items()
            if k not in self.concat_dim
        }
        out = (
            out.assign_coords(dim_0=index, **coords)  # use the index
            .unstack().chunk(self.chunks))  # unstack along new index
        # Put the concat dims first, then everything else in existing order.
        return out.transpose(
            *self.concat_dim,  # reorder dims
            *filter(lambda x: x not in self.concat_dim, out.dims))
    else:
        # Single concat dim: attach every parsed field along it directly.
        coords = {
            k: DataArray(v, dims=self.concat_dim)
            for k, v in field_values.items()
        }
        return out.assign_coords(**coords).chunk(self.chunks)
def test_roundtrip_reverse_formats(pattern):
    """Formatting the parsed fields back through `pattern` must reproduce each path."""
    parsed = reverse_formats(pattern, paths)
    field_names = list(parsed)
    for idx, expected in enumerate(paths):
        format_kwargs = {name: parsed[name][idx] for name in field_names}
        assert pattern.format(**format_kwargs) == expected