def read_hdf4(filename, name=None, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object from an HDF4 file via ``pyhdf``

    Parameters
    ----------
    filename : str
        path of the HDF4 file to be read

    name : str
        name of the dataset to read.  If ``None``, every dataset in the
        file which has more than one dimension, or whose first dimension
        is not named like the dataset itself, is considered a data
        variable; if more than one such dataset exists, an exception is
        raised.

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs
        passed on to ``get_coordinate_slices`` together with the
        coordinate arrays in order to select the part of the array which
        is read.

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    HDF4Error
        if the file cannot be opened (re-raised after printing a message)
    AttributeError
        if *name* is ``None`` and the file holds more than one
        non-coordinate dataset
    ValueError
        if latitudes have to be reversed but latitude is neither the
        first nor the second axis of the data array
    """
    import pyhdf.SD as SD
    from pyhdf.error import HDF4Error

    try:
        _file = SD.SD(filename)
    except HDF4Error:
        print("Cannot open file: %s" % filename)
        raise

    # from here on the file is open: release it on *every* exit path,
    # including the ``coords_only`` early return and any exception
    try:
        # find out which dataset to read
        if name is None:
            variables = []
            for d in _file.datasets():
                var = _file.select(d)
                if len(var.dimensions()) > 1 or var.dim(0).info()[0] != d:
                    variables.append(d)
            if len(variables) > 1:
                raise AttributeError(
                    "There is more than one non-coordinate "
                    "variable in the file, and you didn't "
                    "specify which one you want me to read!"
                )
            name = variables[0]

        # open dataset
        sds = _file.select(name)

        # open the coordinate variables; element 1 of each full dimension
        # record is used as the axis position of that dimension
        dims = sds.dimensions(full=True)
        dimorder = {dims[k][1]: k for k in dims}
        coordinates = OrderedDict()
        for d in range(len(dimorder)):
            coordinates[dimorder[d]] = _file.select(dimorder[d])[:]

        # coordinate slicing
        slices = get_coordinate_slices(coordinates, kwargs)

        # slice the coordinate arrays themselves
        for i, c in enumerate(list(coordinates.keys())):
            coordinates[c] = coordinates[c][slices[i]]
        if coords_only:
            return coordinates

        # read requested slice from disk
        data = sds[slices]

        # a dataset without a fill value makes pyhdf raise instead of
        # returning None; treat that case as "nothing to mask"
        try:
            fill = sds.getfillvalue()
        except HDF4Error:
            fill = None
        if fill is not None and not np.isnan(fill):
            data = np.where(data != fill, data, np.nan)

        # make sure latitudes go from S to N (only if there is a latitude
        # axis with at least two entries to compare)
        lats = coordinates.get("latitude")
        if lats is not None and len(lats) > 1 and lats[0] > lats[-1]:
            coordinates["latitude"] = lats[::-1]
            for axis, dimname in dimorder.items():
                if dimname != "latitude":
                    continue
                if axis == 0:
                    data = np.flipud(data)
                elif axis == 1:
                    data = np.fliplr(data)
                else:
                    raise ValueError(
                        "flipping data array for ascending "
                        "coordinates only works with 2d arrays!"
                    )

        return gridded_array(data, coordinates, name)
    finally:
        _file.end()
def read_hdf4(filename, name=None, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object from an HDF4 file via ``pyhdf``

    Parameters
    ----------
    filename : str
        path of the HDF4 file to be read

    name : str
        name of the dataset to read.  If ``None``, every dataset in the
        file which has more than one dimension, or whose first dimension
        is not named like the dataset itself, is considered a data
        variable; if more than one such dataset exists, an exception is
        raised.

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs
        passed on to ``get_coordinate_slices`` together with the
        coordinate arrays in order to select the part of the array which
        is read.

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    HDF4Error
        if the file cannot be opened (re-raised after printing a message)
    AttributeError
        if *name* is ``None`` and the file holds more than one
        non-coordinate dataset
    ValueError
        if latitudes have to be reversed but latitude is neither the
        first nor the second axis of the data array
    """
    import pyhdf.SD as SD
    from pyhdf.error import HDF4Error

    try:
        _file = SD.SD(filename)
    except HDF4Error:
        print("Cannot open file: %s" % filename)
        raise

    # from here on the file is open: release it on *every* exit path,
    # including the ``coords_only`` early return and any exception
    try:
        # find out which dataset to read
        if name is None:
            variables = []
            for d in _file.datasets():
                var = _file.select(d)
                if len(var.dimensions()) > 1 or var.dim(0).info()[0] != d:
                    variables.append(d)
            if len(variables) > 1:
                raise AttributeError(
                    "There is more than one non-coordinate "
                    "variable in the file, and you didn't "
                    "specify which one you want me to read!"
                )
            name = variables[0]

        # open dataset
        sds = _file.select(name)

        # open the coordinate variables; element 1 of each full dimension
        # record is used as the axis position of that dimension
        dims = sds.dimensions(full=True)
        dimorder = {dims[k][1]: k for k in dims}
        coordinates = OrderedDict()
        for d in range(len(dimorder)):
            coordinates[dimorder[d]] = _file.select(dimorder[d])[:]

        # coordinate slicing
        slices = get_coordinate_slices(coordinates, kwargs)

        # slice the coordinate arrays themselves
        for i, c in enumerate(list(coordinates.keys())):
            coordinates[c] = coordinates[c][slices[i]]
        if coords_only:
            return coordinates

        # read requested slice from disk
        data = sds[slices]

        # a dataset without a fill value makes pyhdf raise instead of
        # returning None; treat that case as "nothing to mask"
        try:
            fill = sds.getfillvalue()
        except HDF4Error:
            fill = None
        if fill is not None and not np.isnan(fill):
            data = np.where(data != fill, data, np.nan)

        # make sure latitudes go from S to N (only if there is a latitude
        # axis with at least two entries to compare)
        lats = coordinates.get("latitude")
        if lats is not None and len(lats) > 1 and lats[0] > lats[-1]:
            coordinates["latitude"] = lats[::-1]
            for axis, dimname in dimorder.items():
                if dimname != "latitude":
                    continue
                if axis == 0:
                    data = np.flipud(data)
                elif axis == 1:
                    data = np.fliplr(data)
                else:
                    raise ValueError(
                        "flipping data array for ascending "
                        "coordinates only works with 2d arrays!"
                    )

        return gridded_array(data, coordinates, name)
    finally:
        _file.end()
def read_gdal(filename, band=1, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object via the GDAL library

    Parameters
    ----------
    filename : str
        path of the raster file to be read

    band : int
        if more than one array is contained in the file, chose the one
        with the RasterBand id ``band`` (starting at 1).

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs : tuple
        slicing of the input array can be specified using *kwargs*. The
        name of the argument must match the name of the coordinate
        variable (here: ``longitude`` and ``latitude``), and the
        argument's value must be a tuple of
        ``(lower_bound, upper_bound)`` of the coordinate variable. One
        or both of the bounds can be ``None``, in which case the bound
        will be set to include all data in that direction.

        .. note:: The bounds given as *kwargs* are **inclusive** bounds.

        .. warning:: Passing ``None`` as upper and/or lower bound is not
           supported yet

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    IOError
        if GDAL cannot open *filename*

    Notes
    -----
    The coordinate arrays hold the pixel *centers*, computed from the
    origin and the pixel sizes of the geotransform; the two remaining
    geotransform terms are ignored.

    .. todo:: **TODO** read ``AREA_OR_POINT`` from raster band definition
    """
    from osgeo import gdal
    from osgeo.gdalconst import GA_ReadOnly

    _file = gdal.Open(filename, GA_ReadOnly)
    if _file is None:
        # without GDAL exceptions enabled, a failed open returns None
        # instead of raising; fail here with a meaningful message
        raise IOError("Cannot open file %s via GDAL" % filename)

    # read coordinates
    minlon, lonstep, _unused0, maxlat, _unused1, latstep = _file.GetGeoTransform()
    nlon = _file.RasterXSize
    nlat = _file.RasterYSize
    coordinates = OrderedDict()
    coordinates["longitude"] = np.linspace(
        minlon + 0.5 * lonstep, minlon + (nlon - 0.5) * lonstep, nlon
    )
    coordinates["latitude"] = np.linspace(
        maxlat + 0.5 * latstep, maxlat + (nlat - 0.5) * latstep, nlat
    )

    # coordinate slicing
    slices = get_coordinate_slices(coordinates, kwargs)

    # slice the coordinate arrays themselves
    for i, c in enumerate(list(coordinates.keys())):
        coordinates[c] = coordinates[c][slices[i]]
    if coords_only:
        return coordinates

    # find out which rasterband to read (kept in its own name so that the
    # integer parameter ``band`` is not overwritten)
    rasterband = _file.GetRasterBand(band)
    data = rasterband.ReadAsArray()
    fill = rasterband.GetNoDataValue()
    if fill is not None and not np.isnan(fill):
        data = np.where(data != fill, data, np.nan)

    # TODO: check if data and lats need to be reordered
    # NOTE(review): the slices are built in (longitude, latitude) order,
    # while a raster array is usually laid out as (rows, columns) --
    # confirm that the axis order matches what ``get_coordinate_slices``
    # and ``gridded_array`` expect.

    # cut the requested slice out of the full band
    data = data[slices]
    return gridded_array(data, coordinates, "")
def read_hdf5(filename, name=None, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object from a pytables HDF5 file

    Parameters
    ----------
    filename : str
        path of the h5 file to be read

    name : str
        Full path to the data node in the HDF5 file. If *name* does not
        start with a *slash* ``/``, ``read_hdf5`` will attempt to read
        the node with name *name* in the group ``/data``. If ``None``,
        ``read_hdf5`` will attempt to find exactly one array in the
        group ``/data`` and read this; otherwise, an exception is
        raised.

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs : tuple
        slicing of the input array can be specified using *kwargs*. The
        name of the argument must match the name of the coordinate
        variable in the opened file, and the argument's value must be a
        tuple of ``(lower_bound, upper_bound)`` of the coordinate
        variable. One or both of the bounds can be ``None``, in which
        case the bound will be set to include all data in that
        direction.

        .. note:: The bounds given as *kwargs* are **inclusive** bounds.

        .. warning:: Passing ``None`` as upper and/or lower bound is not
           supported yet

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    ValueError
        if the requested dataset cannot be found or is ambiguous
    AttributeError
        if no complete set of coordinate arrays can be found

    Note
    ----
    ``read_hdf5`` expects to find the names of the coordinate dimensions
    in a data attribute named ``COORDINATES``. It will first search for
    nodes with these names in the group of the dataset, then in the
    group ``/coordinates``, and finally in the root group.

    Coordinates named ``time``, ``date`` or ``datetime`` are converted
    to ``datetime64[us]``: arrays of dtype ``S20`` are parsed as
    strings, anything else is interpreted as seconds since the epoch
    (UTC).
    """
    import tables as tb
    import pkg_resources

    pkg_resources.require("numpy>=1.7.1")  # needed for datetime stuff

    # PyTables 3 renamed the camelCase API to snake_case names; prefer the
    # new names and fall back to the old ones so that both keep working
    _open_file = getattr(tb, "open_file", None) or tb.openFile
    _fd = _open_file(filename, "r")

    # from here on the file is open: close it on *every* exit path,
    # including the ``coords_only`` early return and any exception
    try:
        _get_node = getattr(_fd, "get_node", None) or _fd.getNode
        _list_nodes = getattr(_fd, "list_nodes", None) or _fd.listNodes

        if str(name).startswith("/"):
            try:
                _ds = _get_node(str(name))
            except Exception:
                raise ValueError("I cannot read the dataset at node %s" % name)
        else:
            try:
                _nodes = _list_nodes("/data")
            except Exception:
                raise ValueError(
                    "You didn't specify a full path to the dataset "
                    "you want me to read, but there is no group "
                    "/data in the file."
                )
            if name is None and len(_nodes) != 1:
                raise ValueError(
                    "You didn't provide a dataset name, and this "
                    "file contains more or less than exactly one "
                    "dataset in the group /data."
                )
            # support multiple datasets per file via "name" parameter
            if name is None:
                _ds = _nodes[0]
            else:
                matches = [v for v in _nodes if v.name == name]
                if not matches:
                    raise ValueError(
                        "You asked me to read dataset %s from group "
                        "/data, but this dataset doesn't exist" % name
                    )
                _ds = matches[0]

        # read coordinates
        _dsgroup = _ds._v_parent._v_pathname
        coord_names = _ds.attrs.COORDINATES

        def _read_coords_from_group(grp, coord_names):
            """Read all arrays *coord_names* from the group *grp*."""
            coordinates = OrderedDict()
            for c in coord_names:
                # strip a trailing slash so that the root group "/" does
                # not lead to a path with a double slash
                values = _get_node("%s/%s" % (grp.rstrip("/"), c))[:]
                # TODO: make the list of time labels generic
                # TODO: allow for setting timzeone in variable attrs
                if c in ("time", "date", "datetime"):
                    if values.dtype == np.dtype("S20"):
                        values = np.asarray(values, dtype="datetime64[us]")
                    else:
                        # seconds since the epoch (UTC) -> microseconds;
                        # vectorised, because np.datetime64() cannot
                        # convert a list of datetime objects
                        micros = np.round(np.asarray(values, dtype="float64") * 1e6)
                        values = micros.astype("int64").astype("datetime64[us]")
                coordinates[c] = values
            return coordinates

        # take the first group which holds *all* coordinate arrays
        for grp_ in (_dsgroup, "/coordinates", ""):
            try:
                coordinates = _read_coords_from_group(grp_, coord_names)
            except tb.NoSuchNodeError:
                continue
            break
        else:
            raise AttributeError(
                "I cannot find any coordinate variable data "
                "for the requeted data object"
            )

        # coordinate slicing
        slices = get_coordinate_slices(coordinates, kwargs)

        # slice the coordinate arrays themselves
        for i, c in enumerate(coord_names):
            coordinates[c] = coordinates[c][slices[i]]
        if coords_only:
            return coordinates

        # read requested slice from disk
        data = _ds[slices]
        return gridded_array(data, coordinates, _ds.name)
    finally:
        _fd.close()
def read_netcdf4(filename, name=None, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object from a netCDF file

    Parameters
    ----------
    filename : str
        path of the netCDF file to be read

    name : str
        if more than one array is contained in the file, chose the one
        with name ``name``.  If ``name`` contains slashes ``/``, these
        slashes will be interpreted as group path.

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs : tuple
        slicing of the input array can be specified using *kwargs*. The
        name of the argument must match the name of the coordinate
        variable in the opened file, and the argument's value must be a
        tuple of ``(lower_bound, upper_bound)`` of the coordinate
        variable. One or both of the bounds can be ``None``, in which
        case the bound will be set to include all data in that
        direction.

        .. note:: The bounds given as *kwargs* are **inclusive** bounds.

        .. warning:: Passing ``None`` as upper and/or lower bound is not
           supported yet

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    IOError
        if the file cannot be opened
    AttributeError
        if *name* is ``None`` and the file holds more than one
        non-coordinate variable

    Notes
    -----
    This function can only read files where no two 1-dimensional
    variables share the same dimension.

    .. todo:: Implement climatologies according to CF-conventions
    """
    import netCDF4

    try:
        _file = netCDF4.Dataset(filename, "r")
    except Exception:
        raise IOError("Cannot open netCDF4 file %s" % filename)

    # from here on the file is open: close it on *every* exit path,
    # including the ``coords_only`` early return and any exception
    try:
        # entangle dimension / variable mess; each entry is used as
        # ``(name of the coordinate variable, name of the coordinate)``
        dimensions = _guess_netcdf_dimensions(_file)

        # Which data variables are in the file?
        if name is None:
            # datavars is all variable labels which are not label of a
            # dimension variable; additionally, we remove some typical
            # variable names which arise from netcdf conventions
            skipped = set(n for (n, s) in dimensions.values())
            skipped.update(["climatology_bounds", "crs"])
            datavars = [v for v in _file.variables.keys() if v not in skipped]
            if len(datavars) > 1:
                raise AttributeError(
                    "There is more than one non-coordinate "
                    "variable in the file, and you didn't "
                    "specify which one you want me to read!"
                )
            name = datavars[0]

        # traverse groups, if the name contains a group path
        grouppath = name.split("/")
        container = _file
        for g in grouppath[:-1]:
            container = container.groups[g]
        datavar = container.variables[grouppath[-1]]

        # Read coordinates, in the order of the dimensions of the data
        coordinates = OrderedDict()
        for dim in datavar.dimensions:
            # the coordinate variable need not be named like its
            # dimension, so values *and* attributes are both taken from
            # the variable which belongs to the dimension
            coordvar = _file.variables[dimensions[dim][0]]
            label = str(dimensions[dim][1])
            values = coordvar[:]
            if label in ("time", "date", "datetime"):  # TODO
                if "calendar" in coordvar.ncattrs():
                    _calendar = coordvar.getncattr("calendar")
                else:
                    _calendar = "standard"
                tmpdates = netCDF4.num2date(
                    values, coordvar.getncattr("units"), _calendar
                )
                values = np.array(
                    [np.datetime64(tmpdates[i]) for i in range(tmpdates.size)]
                )
            coordinates[label] = values

        # coordinate slicing
        slices = get_coordinate_slices(coordinates, kwargs)

        # slice the coordinate arrays themselves
        for i, c in enumerate(list(coordinates.keys())):
            coordinates[c] = coordinates[c][slices[i]]
        if coords_only:
            return coordinates

        # read requested slice from disk
        data = datavar[slices]

        # mask array
        if "_FillValue" in datavar.ncattrs():
            _fill = datavar.getncattr("_FillValue")
        else:
            _fill = None
        if _fill is not None and not np.isnan(_fill):
            data = np.where(data != _fill, data, np.nan)

        if "standard_name" in datavar.ncattrs():
            dataname = datavar.standard_name
        else:
            dataname = name
        return gridded_array(data, coordinates, dataname)
    finally:
        _file.close()
def read_gdal(filename, band=1, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object via the GDAL library

    Parameters
    ----------
    filename : str
        path of the raster file to be read

    band : int
        if more than one array is contained in the file, chose the one
        with the RasterBand id ``band`` (starting at 1).

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs : tuple
        slicing of the input array can be specified using *kwargs*. The
        name of the argument must match the name of the coordinate
        variable (here: ``longitude`` and ``latitude``), and the
        argument's value must be a tuple of
        ``(lower_bound, upper_bound)`` of the coordinate variable. One
        or both of the bounds can be ``None``, in which case the bound
        will be set to include all data in that direction.

        .. note:: The bounds given as *kwargs* are **inclusive** bounds.

        .. warning:: Passing ``None`` as upper and/or lower bound is not
           supported yet

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    IOError
        if GDAL cannot open *filename*

    Notes
    -----
    The coordinate arrays hold the pixel *centers*, computed from the
    origin and the pixel sizes of the geotransform; the two remaining
    geotransform terms are ignored.

    .. todo:: **TODO** read ``AREA_OR_POINT`` from raster band definition
    """
    from osgeo import gdal
    from osgeo.gdalconst import GA_ReadOnly

    _file = gdal.Open(filename, GA_ReadOnly)
    if _file is None:
        # without GDAL exceptions enabled, a failed open returns None
        # instead of raising; fail here with a meaningful message
        raise IOError("Cannot open file %s via GDAL" % filename)

    # read coordinates
    minlon, lonstep, _unused0, maxlat, _unused1, latstep = _file.GetGeoTransform()
    nlon = _file.RasterXSize
    nlat = _file.RasterYSize
    coordinates = OrderedDict()
    coordinates["longitude"] = np.linspace(
        minlon + 0.5 * lonstep, minlon + (nlon - 0.5) * lonstep, nlon
    )
    coordinates["latitude"] = np.linspace(
        maxlat + 0.5 * latstep, maxlat + (nlat - 0.5) * latstep, nlat
    )

    # coordinate slicing
    slices = get_coordinate_slices(coordinates, kwargs)

    # slice the coordinate arrays themselves
    for i, c in enumerate(list(coordinates.keys())):
        coordinates[c] = coordinates[c][slices[i]]
    if coords_only:
        return coordinates

    # find out which rasterband to read (kept in its own name so that the
    # integer parameter ``band`` is not overwritten)
    rasterband = _file.GetRasterBand(band)
    data = rasterband.ReadAsArray()
    fill = rasterband.GetNoDataValue()
    if fill is not None and not np.isnan(fill):
        data = np.where(data != fill, data, np.nan)

    # TODO: check if data and lats need to be reordered
    # NOTE(review): the slices are built in (longitude, latitude) order,
    # while a raster array is usually laid out as (rows, columns) --
    # confirm that the axis order matches what ``get_coordinate_slices``
    # and ``gridded_array`` expect.

    # cut the requested slice out of the full band
    data = data[slices]
    return gridded_array(data, coordinates, "")
def read_hdf5(filename, name=None, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object from a pytables HDF5 file

    Parameters
    ----------
    filename : str
        path of the h5 file to be read

    name : str
        Full path to the data node in the HDF5 file. If *name* does not
        start with a *slash* ``/``, ``read_hdf5`` will attempt to read
        the node with name *name* in the group ``/data``. If ``None``,
        ``read_hdf5`` will attempt to find exactly one array in the
        group ``/data`` and read this; otherwise, an exception is
        raised.

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs : tuple
        slicing of the input array can be specified using *kwargs*. The
        name of the argument must match the name of the coordinate
        variable in the opened file, and the argument's value must be a
        tuple of ``(lower_bound, upper_bound)`` of the coordinate
        variable. One or both of the bounds can be ``None``, in which
        case the bound will be set to include all data in that
        direction.

        .. note:: The bounds given as *kwargs* are **inclusive** bounds.

        .. warning:: Passing ``None`` as upper and/or lower bound is not
           supported yet

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    ValueError
        if the requested dataset cannot be found or is ambiguous
    AttributeError
        if no complete set of coordinate arrays can be found

    Note
    ----
    ``read_hdf5`` expects to find the names of the coordinate dimensions
    in a data attribute named ``COORDINATES``. It will first search for
    nodes with these names in the group of the dataset, then in the
    group ``/coordinates``, and finally in the root group.

    Coordinates named ``time``, ``date`` or ``datetime`` are converted
    to ``datetime64[us]``: arrays of dtype ``S20`` are parsed as
    strings, anything else is interpreted as seconds since the epoch
    (UTC).
    """
    import tables as tb
    import pkg_resources

    pkg_resources.require("numpy>=1.7.1")  # needed for datetime stuff

    # PyTables 3 renamed the camelCase API to snake_case names; prefer the
    # new names and fall back to the old ones so that both keep working
    _open_file = getattr(tb, "open_file", None) or tb.openFile
    _fd = _open_file(filename, "r")

    # from here on the file is open: close it on *every* exit path,
    # including the ``coords_only`` early return and any exception
    try:
        _get_node = getattr(_fd, "get_node", None) or _fd.getNode
        _list_nodes = getattr(_fd, "list_nodes", None) or _fd.listNodes

        if str(name).startswith("/"):
            try:
                _ds = _get_node(str(name))
            except Exception:
                raise ValueError("I cannot read the dataset at node %s" % name)
        else:
            try:
                _nodes = _list_nodes("/data")
            except Exception:
                raise ValueError(
                    "You didn't specify a full path to the dataset "
                    "you want me to read, but there is no group "
                    "/data in the file."
                )
            if name is None and len(_nodes) != 1:
                raise ValueError(
                    "You didn't provide a dataset name, and this "
                    "file contains more or less than exactly one "
                    "dataset in the group /data."
                )
            # support multiple datasets per file via "name" parameter
            if name is None:
                _ds = _nodes[0]
            else:
                matches = [v for v in _nodes if v.name == name]
                if not matches:
                    raise ValueError(
                        "You asked me to read dataset %s from group "
                        "/data, but this dataset doesn't exist" % name
                    )
                _ds = matches[0]

        # read coordinates
        _dsgroup = _ds._v_parent._v_pathname
        coord_names = _ds.attrs.COORDINATES

        def _read_coords_from_group(grp, coord_names):
            """Read all arrays *coord_names* from the group *grp*."""
            coordinates = OrderedDict()
            for c in coord_names:
                # strip a trailing slash so that the root group "/" does
                # not lead to a path with a double slash
                values = _get_node("%s/%s" % (grp.rstrip("/"), c))[:]
                # TODO: make the list of time labels generic
                # TODO: allow for setting timzeone in variable attrs
                if c in ("time", "date", "datetime"):
                    if values.dtype == np.dtype("S20"):
                        values = np.asarray(values, dtype="datetime64[us]")
                    else:
                        # seconds since the epoch (UTC) -> microseconds;
                        # vectorised, because np.datetime64() cannot
                        # convert a list of datetime objects
                        micros = np.round(np.asarray(values, dtype="float64") * 1e6)
                        values = micros.astype("int64").astype("datetime64[us]")
                coordinates[c] = values
            return coordinates

        # take the first group which holds *all* coordinate arrays
        for grp_ in (_dsgroup, "/coordinates", ""):
            try:
                coordinates = _read_coords_from_group(grp_, coord_names)
            except tb.NoSuchNodeError:
                continue
            break
        else:
            raise AttributeError(
                "I cannot find any coordinate variable data "
                "for the requeted data object"
            )

        # coordinate slicing
        slices = get_coordinate_slices(coordinates, kwargs)

        # slice the coordinate arrays themselves
        for i, c in enumerate(coord_names):
            coordinates[c] = coordinates[c][slices[i]]
        if coords_only:
            return coordinates

        # read requested slice from disk
        data = _ds[slices]
        return gridded_array(data, coordinates, _ds.name)
    finally:
        _fd.close()
def read_netcdf4(filename, name=None, coords_only=False, **kwargs):
    """Read a ``gridded_array`` object from a netCDF file

    Parameters
    ----------
    filename : str
        path of the netCDF file to be read

    name : str
        if more than one array is contained in the file, chose the one
        with name ``name``.  If ``name`` contains slashes ``/``, these
        slashes will be interpreted as group path.

    coords_only : bool
        if ``True``, return only the coordinate arrays; no actual data
        is read

    kwargs : tuple
        slicing of the input array can be specified using *kwargs*. The
        name of the argument must match the name of the coordinate
        variable in the opened file, and the argument's value must be a
        tuple of ``(lower_bound, upper_bound)`` of the coordinate
        variable. One or both of the bounds can be ``None``, in which
        case the bound will be set to include all data in that
        direction.

        .. note:: The bounds given as *kwargs* are **inclusive** bounds.

        .. warning:: Passing ``None`` as upper and/or lower bound is not
           supported yet

    Returns
    -------
    out : gridded_array
        or an ``OrderedDict`` of coordinate arrays if *coords_only* is
        ``True``

    Raises
    ------
    IOError
        if the file cannot be opened
    AttributeError
        if *name* is ``None`` and the file holds more than one
        non-coordinate variable

    Notes
    -----
    This function can only read files where no two 1-dimensional
    variables share the same dimension.

    .. todo:: Implement climatologies according to CF-conventions
    """
    import netCDF4

    try:
        _file = netCDF4.Dataset(filename, "r")
    except Exception:
        raise IOError("Cannot open netCDF4 file %s" % filename)

    # from here on the file is open: close it on *every* exit path,
    # including the ``coords_only`` early return and any exception
    try:
        # entangle dimension / variable mess; each entry is used as
        # ``(name of the coordinate variable, name of the coordinate)``
        dimensions = _guess_netcdf_dimensions(_file)

        # Which data variables are in the file?
        if name is None:
            # datavars is all variable labels which are not label of a
            # dimension variable; additionally, we remove some typical
            # variable names which arise from netcdf conventions
            skipped = set(n for (n, s) in dimensions.values())
            skipped.update(["climatology_bounds", "crs"])
            datavars = [v for v in _file.variables.keys() if v not in skipped]
            if len(datavars) > 1:
                raise AttributeError(
                    "There is more than one non-coordinate "
                    "variable in the file, and you didn't "
                    "specify which one you want me to read!"
                )
            name = datavars[0]

        # traverse groups, if the name contains a group path
        grouppath = name.split("/")
        container = _file
        for g in grouppath[:-1]:
            container = container.groups[g]
        datavar = container.variables[grouppath[-1]]

        # Read coordinates, in the order of the dimensions of the data
        coordinates = OrderedDict()
        for dim in datavar.dimensions:
            # the coordinate variable need not be named like its
            # dimension, so values *and* attributes are both taken from
            # the variable which belongs to the dimension
            coordvar = _file.variables[dimensions[dim][0]]
            label = str(dimensions[dim][1])
            values = coordvar[:]
            if label in ("time", "date", "datetime"):  # TODO
                if "calendar" in coordvar.ncattrs():
                    _calendar = coordvar.getncattr("calendar")
                else:
                    _calendar = "standard"
                tmpdates = netCDF4.num2date(
                    values, coordvar.getncattr("units"), _calendar
                )
                values = np.array(
                    [np.datetime64(tmpdates[i]) for i in range(tmpdates.size)]
                )
            coordinates[label] = values

        # coordinate slicing
        slices = get_coordinate_slices(coordinates, kwargs)

        # slice the coordinate arrays themselves
        for i, c in enumerate(list(coordinates.keys())):
            coordinates[c] = coordinates[c][slices[i]]
        if coords_only:
            return coordinates

        # read requested slice from disk
        data = datavar[slices]

        # mask array
        if "_FillValue" in datavar.ncattrs():
            _fill = datavar.getncattr("_FillValue")
        else:
            _fill = None
        if _fill is not None and not np.isnan(_fill):
            data = np.where(data != _fill, data, np.nan)

        if "standard_name" in datavar.ncattrs():
            dataname = datavar.standard_name
        else:
            dataname = name
        return gridded_array(data, coordinates, dataname)
    finally:
        _file.close()