def convert(indir, out_fname, compression, filter_opts):
    """
    Convert GA's ozone TIFFs to HDF5.
    The TIFFs will be converted into HDF5 IMAGE Datasets,
    and contained within a single HDF5 file.
    """
    # convert to Path object
    indir = Path(indir)

    # create empty or copy the user supplied filter options
    if not filter_opts:
        filter_opts = dict()
    else:
        filter_opts = filter_opts.copy()

    with h5py.File(str(out_fname), 'w') as fid:
        for fname in indir.glob('*.tif'):
            with rasterio.open(str(fname)) as rds:
                # the files have small dimensions, so store as a single chunk
                if 'chunks' not in filter_opts:
                    filter_opts['chunks'] = (rds.height, rds.width)

                attrs = {
                    'description': 'Ozone data compiled by Geoscience Australia',
                    'geotransform': rds.transform.to_gdal(),
                    'crs_wkt': rds.crs.wkt
                }

                # output
                dname = fname.stem
                write_h5_image(rds.read(1), dname, fid, compression, attrs,
                               filter_opts)
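# A minimal usage sketch for the converter above. Hedged: the import location
# of H5CompressionFilter is an assumption (it is expected alongside
# write_h5_image), and the directory/file names are placeholders.
from wagl.hdf5 import H5CompressionFilter  # assumed import location

convert(
    indir="/data/ozone/tiffs",        # directory containing the *.tif inputs
    out_fname="ozone.h5",             # single HDF5 file holding all datasets
    compression=H5CompressionFilter.LZF,
    filter_opts=None,                 # None -> one chunk per image (see above)
)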
def test_write_h5_image_minmax(self):
    """
    Test the IMAGE_MINMAXRANGE attribute is correct.
    """
    minmax = numpy.array([self.image_data.min(), self.image_data.max()])
    fname = "test_write_h5_image_minmax.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        hdf5.write_h5_image(self.image_data, "image", fid)
        test = fid["image"].attrs["IMAGE_MINMAXRANGE"]
        self.assertTrue((minmax == test).all())
def convert(indir, out_h5: h5py.Group, compression, filter_opts):
    """
    Convert GA's ozone TIFFs to HDF5.
    The TIFFs will be converted into HDF5 IMAGE Datasets,
    and contained within a single HDF5 file.
    """
    # convert to Path object
    indir = Path(indir)

    dataset_names = []
    metadata = []

    # create empty or copy the user supplied filter options
    if not filter_opts:
        filter_opts = dict()
    else:
        filter_opts = filter_opts.copy()

    for fname in sorted(indir.glob("*.tif"), key=_month_sort):
        with rasterio.open(str(fname)) as rds:
            # the files have small dimensions, so store as a single chunk
            if "chunks" not in filter_opts:
                filter_opts["chunks"] = (rds.height, rds.width)

            attrs = {
                "description": "Ozone data compiled by Geoscience Australia",
                "geotransform": rds.transform.to_gdal(),
                "crs_wkt": rds.crs.wkt,
            }

            # output
            dname = fname.stem
            write_h5_image(rds.read(1), dname, out_h5, compression, attrs,
                           filter_opts)

        # src checksum; used to help derive fallback uuid
        with fname.open("rb") as src:
            src_checksum = generate_md5sum(src).hexdigest()

        dataset_names.append(dname)
        metadata.append({
            "id": str(
                generate_fallback_uuid(PRODUCT_HREF, path=str(dname),
                                       md5=src_checksum))
        })

    return metadata, dataset_names
def test_write_h5_image(self):
    """
    Test the write_h5_image function.
    """
    data = self.image_data
    fname = "test_write_h5_image.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        self.assertIsNone(hdf5.write_h5_image(data, "image", fid))
def save_as_h5_dataset(self, out_group, acq, product,
                       compression=H5CompressionFilter.LZF, filter_opts=None):
    """
    Save the PQ result and attribute information in a HDF5
    `IMAGE` Class dataset.
    """
    if filter_opts is None:
        fopts = {}
    else:
        fopts = filter_opts.copy()
    fopts['chunks'] = acq.tile_size

    attrs = self.aux_data.copy()
    attrs['crs_wkt'] = self.geobox.crs.ExportToWkt()
    attrs['geotransform'] = self.geobox.transform.to_gdal()

    dname = DatasetName.PQ_FMT.value.format(product=product.value)
    write_h5_image(self.array, dname, out_group, compression, attrs, fopts)
def test_write_h5_image_attributes(self):
    """
    Test the image attributes of the write_h5_image function.
    """
    attrs = {
        "CLASS": "IMAGE",
        "IMAGE_VERSION": "1.2",
        "DISPLAY_ORIGIN": "UL"
    }
    fname = "test_write_h5_image_attributes.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        hdf5.write_h5_image(self.image_data, "image", fid)
        test = {k: v for k, v in fid["image"].attrs.items()}

        # assertDictEqual can't compare a numpy array, so test elsewhere
        del test["IMAGE_MINMAXRANGE"]

        self.assertDictEqual(test, attrs)
def _convert_2d(rds, fid, dataset_name, compression, filter_opts):
    """
    Private routine for converting the 2D GRIB file to HDF5.
    """
    attrs = {
        'geotransform': rds.transform.to_gdal(),
        'crs_wkt': rds.crs.wkt,
        'history': 'Converted to HDF5'
    }
    data = rds.read(1)
    write_h5_image(data, dataset_name, fid, compression, attrs, filter_opts)

    # add dimension labels, but should we also include dimension scales?
    dataset = fid[dataset_name]
    dataset.dims[0].label = 'Y'
    dataset.dims[1].label = 'X'

    # metadata
    metadata = metadata_dataframe(rds)
    write_dataframe(metadata, 'METADATA', fid, compression)
def convert_format(self, dataset_name, group, attrs=None,
                   compression=H5CompressionFilter.LZF, filter_opts=None):
    """
    Convert the HDF file to a HDF5 dataset.
    """
    if attrs is None:
        attrs = {}

    # Get the UL corner of the UL pixel co-ordinate
    ul_lon = self.ul[0]
    ul_lat = self.ul[1]

    # pixel size x & y
    pixsz_x = self.delta_lon
    pixsz_y = self.delta_lat

    # Setup the projection; assuming geographic WGS84
    # (Tests have shown that this appears to be the case)
    # (unfortunately it is not explicitly defined in the HDF file)
    sr = osr.SpatialReference()
    sr.SetWellKnownGeogCS("WGS84")
    prj = sr.ExportToWkt()

    # Setup the geobox
    dims = self.data[0].shape
    res = (abs(pixsz_x), abs(pixsz_y))
    geobox = GriddedGeoBox(shape=dims, origin=(ul_lon, ul_lat),
                           pixelsize=res, crs=prj)

    # Write the dataset
    attrs['description'] = 'Converted BRDF data from H4 to H5.'
    attrs['crs_wkt'] = prj
    attrs['geotransform'] = geobox.transform.to_gdal()
    write_h5_image(self.data[0], dataset_name, group, compression, attrs,
                   filter_opts)
def test_write_h5_image_multiband(self):
    """
    Test the {BAND}_MINMAXRANGE attribute is correct.
    """
    band_names = ["ISO", "VOL", "GEO"]
    dtype = numpy.dtype([(bname, "int16") for bname in band_names])
    dataset = numpy.ndarray(shape=self.image_data.shape, dtype=dtype)
    for bname in band_names:
        dataset[bname] = self.image_data
    minmax = numpy.array([self.image_data.min(), self.image_data.max()])
    fname = "test_write_h5_multi.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        hdf5.write_h5_image(dataset, "image", fid)
        self.assertFalse("IMAGE_MINMAXRANGE" in fid["image"].attrs)
        for bname in band_names:
            test = fid["image"].attrs["{}_MINMAXRANGE".format(bname)]
            self.assertTrue((minmax == test).all())
def prwtr_average(indir, outdir, compression=H5CompressionFilter.LZF,
                  filter_opts=None):
    """
    Take the 4 hourly daily average from all files.
    """
    df = build_index(indir)

    # grouping
    groups = df.groupby([df.index.month, df.index.day, df.index.hour])

    # create directories as needed
    out_fname = Path(outdir).joinpath("pr_wtr_average.h5")
    if not out_fname.parent.exists():
        out_fname.parent.mkdir(parents=True)

    # create output file
    with h5py.File(str(out_fname), 'w') as fid:
        # the data is ordered so we can safely use BAND-1 = Jan-1
        for band_index, item in enumerate(groups):
            grp_name, grp_df = item

            # synthesised leap year timestamp (use year 2000)
            fmt = "2000 {:02d} {:02d} {:02d}"
            dtime = datetime.strptime(fmt.format(*grp_name), "%Y %m %d %H")

            # mean
            mean, geobox, chunks = calculate_average(grp_df)

            # dataset name format "%B-%d/%H%M" eg FEBRUARY-06/1800 for Feb 6th 1800 hrs
            dname = "AVERAGE/{}".format(dtime.strftime("%B-%d/%H%M").upper())

            # dataset description
            description = ("Average data for {year_month} {hour}00 hours, "
                           "over the time period {dt_min} to {dt_max}")
            description = description.format(
                year_month=dtime.strftime("%B-%d"),
                hour=dtime.strftime("%H"),
                dt_min=grp_df.index.min(),
                dt_max=grp_df.index.max())

            # dataset attributes
            attrs = {
                "description": description,
                "timestamp": dtime,
                "date_format": "2000 %B-%d/%H%M",
                "band_name": "BAND-{}".format(band_index + 1),
                "geotransform": geobox.transform.to_gdal(),
                "crs_wkt": geobox.crs.ExportToWkt()
            }

            # create empty or copy the user supplied filter options
            if not filter_opts:
                f_opts = dict()
            else:
                f_opts = filter_opts.copy()

            # use original chunks if none are provided
            if 'chunks' not in f_opts:
                f_opts['chunks'] = chunks

            # write
            write_h5_image(mean, dname, fid, attrs=attrs,
                           compression=compression, filter_opts=f_opts)
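# A quick self-contained illustration of the dataset naming scheme used above
# (standard library only; the date is an arbitrary example).
from datetime import datetime

dtime = datetime.strptime("2000 02 06 18", "%Y %m %d %H")
dname = "AVERAGE/{}".format(dtime.strftime("%B-%d/%H%M").upper())
assert dname == "AVERAGE/FEBRUARY-06/1800"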
def convert_tile(fname, out_fname, compression, filter_opts):
    """
    Convert a MCD43A1 HDF4 tile into HDF5.
    Global and dataset level metadata are copied across.

    :param fname:
        A str containing the MCD43A1 filename.

    :param out_fname:
        A str containing the output filename for the HDF5 file.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None. Content is written directly to disk.
    """
    with h5py.File(out_fname, 'w') as fid:
        with rasterio.open(fname) as ds:
            # global attributes
            attach_attributes(fid, ds.tags())

            # find and convert every subdataset (sds)
            for sds_name in ds.subdatasets:
                with rasterio.open(sds_name) as sds:
                    ds_name = Path(sds_name.replace(':', '/')).name

                    # create empty or copy the user supplied filter options
                    if not filter_opts:
                        f_opts = dict()
                    else:
                        f_opts = filter_opts.copy()

                    # use sds native chunks if none are provided
                    if 'chunks' not in f_opts:
                        f_opts['chunks'] = list(sds.block_shapes[0])

                    # modify to have 3D chunks if we have a multiband sds
                    if sds.count == 3:
                        # note: a user-supplied 3D chunk eg (2, 256, 340)
                        # would break the insert, so coerce to a list first
                        chunks = list(f_opts['chunks'])
                        chunks.insert(0, 1)
                        f_opts['chunks'] = tuple(chunks)
                    else:
                        f_opts['chunks'] = tuple(f_opts['chunks'])

                    # subdataset attributes and spatial attributes
                    attrs = sds.tags()
                    attrs['geotransform'] = sds.transform.to_gdal()
                    attrs['crs_wkt'] = sds.crs.wkt

                    # ensure a single band sds is read as 2D not 3D
                    data = sds.read() if sds.count == 3 else sds.read(1)

                    # write to disk as an IMAGE Class Dataset
                    write_h5_image(data, ds_name, fid, attrs=attrs,
                                   compression=compression,
                                   filter_opts=f_opts)
def convert_tile(fname, out_h5: h5py.Group, compression, filter_opts):
    """
    Convert a MCD43A1 HDF4 tile into HDF5.
    Global and dataset level metadata are copied across.

    :param fname:
        A str containing the MCD43A1 filename.

    :param out_h5:
        A h5py.Group to write the output data to.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None. Content is written directly to disk.
    """
    # read the geo-spatial information beforehand
    # relying on gdal to parse it
    geospatial = {}
    with rasterio.open(fname) as ds:
        for sds_name in ds.subdatasets:
            with rasterio.open(sds_name) as sds:
                band_name = sds_name.split(":")[-1]
                geospatial[band_name] = {
                    "geotransform": sds.transform.to_gdal(),
                    "crs_wkt": sds.crs.wkt,
                }

    # convert data
    with netCDF4.Dataset(fname) as ds:
        ds.set_auto_scale(False)

        # global attributes
        global_attrs = {key: ds.getncattr(key) for key in ds.ncattrs()}
        attach_attributes(out_h5, global_attrs)

        # find and convert every subdataset (sds)
        for sds_name in sorted(ds.variables, key=_brdf_netcdf_band_orderer):
            sds = ds.variables[sds_name]

            # create empty or copy the user supplied filter options
            if not filter_opts:
                f_opts = dict()
            else:
                f_opts = filter_opts.copy()

            # recreate each dataset as a 2-dimensional dataset
            dim1, dim2, *_ = sds.shape
            if "chunks" not in f_opts:
                assert dim1 == 2400 and dim2 == 2400
                f_opts["chunks"] = (240, 240)
            else:
                # retain only the first two dimensions of user supplied chunks
                f_opts["chunks"] = (f_opts["chunks"][0], f_opts["chunks"][1])

            # subdataset attributes and spatial attributes
            attrs = {key: sds.getncattr(key) for key in sds.ncattrs()}
            attrs.update(geospatial[sds_name])

            in_arr = sds[:]
            if len(in_arr.shape) == 3:
                data = numpy.ndarray(shape=(dim1, dim2), dtype=OUT_DTYPE)
                for idx, band_name in enumerate(OUT_DTYPE.names):
                    data[band_name] = in_arr[:, :, idx]
            else:
                data = in_arr

            # write to disk as an IMAGE Class Dataset
            write_h5_image(
                data,
                sds_name,
                out_h5,
                attrs=attrs,
                compression=compression,
                filter_opts=f_opts,
            )
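# A hedged usage sketch for the netCDF-based converter above. The input
# filename is a placeholder, and the H5CompressionFilter import location is
# an assumption (as in the earlier sketch).
import h5py
from wagl.hdf5 import H5CompressionFilter  # assumed import location

with h5py.File("mcd43a1_tile.h5", "w") as fid:
    convert_tile(
        "MCD43A1.A2018001.h29v12.006.hdf",  # hypothetical input tile
        fid,                                # any h5py.Group works here
        H5CompressionFilter.LZF,
        {"chunks": (240, 240)},             # explicit 2D chunks
    )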
def fallback(indir, outdir, compression=H5CompressionFilter.LZF,
             filter_opts: Optional[Dict] = None):
    """
    Take the 4 hourly daily average from all files.
    """
    df = _build_index(indir)

    # grouping
    groups = df.groupby([df.index.month, df.index.day, df.index.hour])

    # create directories as needed
    out_fname = Path(outdir).joinpath("pr_wtr.eatm.average.h5")
    out_fname.parent.mkdir(exist_ok=True, parents=True)

    # Set one creation datetime for all datasets
    creation_dt = datetime.utcnow().replace(tzinfo=timezone.utc)

    # create output file
    with h5utils.atomic_h5_write(out_fname, "w", track_order=True) as fid:
        # the data is ordered so we can safely use BAND-1 = Jan-1
        for band_index, item in enumerate(groups):
            grp_name, grp_df = item

            # synthesised leap year timestamp (use year 2000)
            fmt = "2000 {:02d} {:02d} {:02d} +0000"
            dtime = datetime.strptime(fmt.format(*grp_name), "%Y %m %d %H %z")

            # mean
            mean, geobox, chunks = _average(grp_df)

            # dataset name format "%B-%d/%H%M" eg FEBRUARY-06/1800 for Feb 6th 1800 hrs
            dname = "AVERAGE/{}".format(dtime.strftime("%B-%d/%H%M").upper())

            # dataset description
            description = ("Average data for {year_month} {hour} hours, "
                           "over the time period {dt_min} to {dt_max}")
            description = description.format(
                year_month=dtime.strftime("%B-%d"),
                hour=dtime.strftime("%H%M"),
                dt_min=grp_df.index.min().date(),
                dt_max=grp_df.index.max().date(),
            )

            # dataset attributes
            attrs = {
                "description": description,
                "timestamp": dtime,
                "date_format": "2000 %B-%d/%H%M",
                "band_name": "BAND-{}".format(band_index + 1),
                "geotransform": geobox.transform.to_gdal(),
                "crs_wkt": geobox.crs.ExportToWkt(),
            }

            # create empty or copy the user supplied filter options
            if not filter_opts:
                f_opts = dict()
            else:
                f_opts = filter_opts.copy()

            # use original chunks if none are provided
            if "chunks" not in f_opts:
                f_opts["chunks"] = chunks

            # write
            h5utils.create_groups(fid, dname.rsplit("/", 1)[0],
                                  track_order=True)
            write_h5_image(mean, dname, fid, attrs=attrs,
                           compression=compression, filter_opts=f_opts)

            # Generate metadata
            lineage_ids, src_md = _get_lineage_md(grp_df)
            md = generate_fallback_metadata(
                dname,
                lineage_ids=lineage_ids,
                obs_dt=dtime,
                start_dt=grp_df.index.min().replace(tzinfo=timezone.utc),
                end_dt=grp_df.index.max().replace(tzinfo=timezone.utc),
                creation_dt=creation_dt,
                src_md=src_md,
            )
            h5utils.write_h5_md(fid, [md], [dname], track_order=True)
def convert_file(fname, out_h5: h5py.Group, compression,
                 filter_opts: Optional[Dict] = None):
    """
    Convert a PR_WTR NetCDF file into HDF5.

    :param fname:
        A str containing the PR_WTR filename.

    :param out_h5:
        A h5py.Group to write output datasets to.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None. Content is written directly to disk.
    """
    with rasterio.open(fname) as ds:
        name_fmt = "BAND-{}"

        # global attributes
        # TODO update the history attrs
        # TODO remove the NC_GLOBAL str and just have plain attr names
        g_attrs = ds.tags()

        # get timestamp info
        origin = g_attrs.pop("time#units").replace("hours since ", "")
        hours = json.loads(
            g_attrs.pop("NETCDF_DIM_time_VALUES").replace("{", "[").replace(
                "}", "]"))
        df = pandas.DataFrame({
            "timestamp": pandas.to_datetime(hours, unit="h", origin=origin),
            "band_name": [name_fmt.format(i + 1) for i in range(ds.count)],
        })
        df["dataset_name"] = df.timestamp.dt.strftime("%Y/%B-%d/%H%M")
        df["dataset_name"] = df["dataset_name"].str.upper()

        # create a timestamp and band name index table dataset
        desc = "Timestamp and Band Name index information."
        attrs = {"description": desc}
        write_dataframe(df, "INDEX", out_h5, compression, attrs=attrs)

        attach_attributes(out_h5, g_attrs)

        # process every band
        for i in range(1, ds.count + 1):
            ds_name = df.iloc[i - 1].dataset_name

            # create empty or copy the user supplied filter options
            if not filter_opts:
                f_opts = dict()
            else:
                f_opts = filter_opts.copy()

            # band attributes
            # TODO remove NETCDF tags
            # TODO add fillvalue attr
            attrs = ds.tags(i)
            attrs["timestamp"] = df.iloc[i - 1]["timestamp"].replace(
                tzinfo=timezone.utc)
            attrs["band_name"] = df.iloc[i - 1]["band_name"]
            attrs["geotransform"] = ds.transform.to_gdal()
            attrs["crs_wkt"] = CRS.ExportToWkt()

            # use ds native chunks if none are provided
            if "chunks" not in f_opts:
                try:
                    # block_shapes is 0-indexed (one entry per band)
                    f_opts["chunks"] = ds.block_shapes[i - 1]
                except IndexError:
                    print("Chunk error: {}".format(fname))
                    f_opts["chunks"] = (73, 144)

            # write to disk as an IMAGE Class Dataset
            write_h5_image(
                ds.read(i),
                ds_name,
                out_h5,
                attrs=attrs,
                compression=compression,
                filter_opts=f_opts,
            )
def convert_file(fname, out_fname, compression, filter_opts):
    """
    Convert a PR_WTR NetCDF file into HDF5.

    :param fname:
        A str containing the PR_WTR filename.

    :param out_fname:
        A str containing the output filename for the HDF5 file.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None. Content is written directly to disk.
    """
    with h5py.File(out_fname, 'w') as fid:
        with rasterio.open(fname) as ds:
            name_fmt = 'BAND-{}'

            # global attributes
            # TODO update the history attrs
            # TODO remove the NC_GLOBAL str and just have plain attr names
            g_attrs = ds.tags()

            # get timestamp info
            origin = g_attrs.pop('time#units').replace('hours since ', '')
            hours = json.loads(
                g_attrs.pop('NETCDF_DIM_time_VALUES').replace('{', '[').replace('}', ']')
            )
            df = pandas.DataFrame(
                {
                    'timestamp': pandas.to_datetime(hours, unit='h', origin=origin),
                    'band_name': [name_fmt.format(i + 1) for i in range(ds.count)]
                }
            )
            df['dataset_name'] = df.timestamp.dt.strftime('%Y/%B-%d/%H%M')
            df['dataset_name'] = df['dataset_name'].str.upper()

            # create a timestamp and band name index table dataset
            desc = "Timestamp and Band Name index information."
            attrs = {
                'description': desc
            }
            write_dataframe(df, 'INDEX', fid, compression, attrs=attrs)

            attach_attributes(fid, g_attrs)

            # process every band
            for i in range(1, ds.count + 1):
                ds_name = df.iloc[i - 1].dataset_name

                # create empty or copy the user supplied filter options
                if not filter_opts:
                    f_opts = dict()
                else:
                    f_opts = filter_opts.copy()

                # band attributes
                # TODO remove NETCDF tags
                # TODO add fillvalue attr
                attrs = ds.tags(i)
                attrs['timestamp'] = df.iloc[i - 1]['timestamp']
                attrs['band_name'] = df.iloc[i - 1]['band_name']
                attrs['geotransform'] = ds.transform.to_gdal()
                attrs['crs_wkt'] = CRS.ExportToWkt()

                # use ds native chunks if none are provided
                if 'chunks' not in f_opts:
                    try:
                        # block_shapes is 0-indexed (one entry per band)
                        f_opts['chunks'] = ds.block_shapes[i - 1]
                    except IndexError:
                        print("Chunk error: {}".format(fname))
                        f_opts['chunks'] = (73, 144)

                # write to disk as an IMAGE Class Dataset
                write_h5_image(ds.read(i), ds_name, fid, attrs=attrs,
                               compression=compression, filter_opts=f_opts)
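# The brace swapping above turns GDAL's "{v1,v2,...}" metadata string into a
# JSON list. A minimal self-contained check (the hour values are made up):
import json

raw = "{1569072,1569078,1569084}"  # hypothetical NETCDF_DIM_time_VALUES tag
hours = json.loads(raw.replace("{", "[").replace("}", "]"))
assert hours == [1569072, 1569078, 1569084]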
def interpolate(acq, coefficient, ancillary_group, satellite_solar_group,
                coefficients_group, out_group=None,
                compression=H5CompressionFilter.LZF, filter_opts=None,
                method=Method.SHEARB):
    # TODO: more docstrings
    """Perform interpolation."""
    if method not in Method:
        msg = 'Interpolation method {} not available.'
        raise ValueError(msg.format(method.name))

    geobox = acq.gridded_geo_box()
    cols, rows = geobox.get_shape_xy()

    # read the relevant tables into DataFrames
    coordinator = read_h5_table(ancillary_group, DatasetName.COORDINATOR.value)
    boxline = read_h5_table(satellite_solar_group, DatasetName.BOXLINE.value)

    if coefficient in Workflow.NBAR.atmos_coefficients:
        dataset_name = DatasetName.NBAR_COEFFICIENTS.value
    elif coefficient in Workflow.SBT.atmos_coefficients:
        dataset_name = DatasetName.SBT_COEFFICIENTS.value
    else:
        msg = "Factor name not found in available coefficients: {}"
        raise ValueError(msg.format(Workflow.STANDARD.atmos_coefficients))

    coefficients = read_h5_table(coefficients_group, dataset_name)

    coord = np.zeros((coordinator.shape[0], 2), dtype='int')
    map_x = coordinator.map_x.values
    map_y = coordinator.map_y.values
    coord[:, 1], coord[:, 0] = (map_x, map_y) * ~geobox.transform
    centre = boxline.bisection_index.values
    start = boxline.start_index.values
    end = boxline.end_index.values

    band_records = coefficients.band_name == acq.band_name
    samples = coefficients[coefficient.value][band_records].values

    func_map = {Method.BILINEAR: sheared_bilinear_interpolate,
                Method.FBILINEAR: fortran_bilinear_interpolate,
                Method.SHEAR: sheared_bilinear_interpolate,
                Method.SHEARB: sheared_bilinear_interpolate,
                Method.RBF: rbf_interpolate}

    args = [cols, rows, coord, samples, start, end, centre]

    if method == Method.BILINEAR:
        args.extend([False, False])
    elif method == Method.SHEARB:
        args.extend([True, True])

    result = func_map[method](*args)

    # setup the output file/group as needed
    if out_group is None:
        fid = h5py.File('interpolated-coefficients.h5', driver='core',
                        backing_store=False)
    else:
        fid = out_group

    if GroupName.INTERP_GROUP.value not in fid:
        fid.create_group(GroupName.INTERP_GROUP.value)

    if filter_opts is None:
        filter_opts = {}
    else:
        filter_opts = filter_opts.copy()
    filter_opts['chunks'] = acq.tile_size

    group = fid[GroupName.INTERP_GROUP.value]
    fmt = DatasetName.INTERPOLATION_FMT.value
    dset_name = fmt.format(coefficient=coefficient.value,
                           band_name=acq.band_name)
    no_data = -999
    attrs = {'crs_wkt': geobox.crs.ExportToWkt(),
             'geotransform': geobox.transform.to_gdal(),
             'no_data_value': no_data,
             'interpolation_method': method.name,
             'band_id': acq.band_id,
             'band_name': acq.band_name,
             'alias': acq.alias,
             'coefficient': coefficient.value}
    desc = ("Contains the interpolated result of coefficient {} "
            "for band {} from sensor {}.")
    attrs['description'] = desc.format(coefficient.value, acq.band_id,
                                       acq.sensor_id)

    # convert any NaN's to -999 (for float data, NaN would be more ideal ...)
    result[~np.isfinite(result)] = no_data
    write_h5_image(result, dset_name, group, compression, attrs, filter_opts)

    if out_group is None:
        return fid
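# For reference, the "(map_x, map_y) * ~geobox.transform" idiom above relies
# on affine.Affine: applying the inverted geotransform maps world coordinates
# to fractional (column, row) pixel indices. A standalone sketch with a
# made-up 25 m geotransform (UL corner at easting 600000, northing 7000000):
from affine import Affine

transform = Affine(25.0, 0.0, 600000.0, 0.0, -25.0, 7000000.0)
col, row = ~transform * (600125.0, 6999900.0)
assert (round(col, 6), round(row, 6)) == (5.0, 4.0)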
def image_residual(ref_fid, test_fid, pathname, out_fid,
                   compression=H5CompressionFilter.LZF, save_inputs=False,
                   filter_opts=None):
    """
    Undertake residual analysis for IMAGE CLASS Datasets.
    A histogram and a cumulative histogram of the residuals are
    calculated and recorded as TABLE CLASS Datasets.
    Any NaN's in IMAGE datasets will be handled automatically.

    :param ref_fid:
        A h5py file object (essentially the root Group), containing
        the reference data.

    :param test_fid:
        A h5py file object (essentially the root Group), containing
        the test data.

    :param pathname:
        A `str` containing the pathname to the IMAGE Dataset.

    :param out_fid:
        A h5py file object (essentially the root Group), opened for
        writing the output data.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param save_inputs:
        A `bool` indicating whether or not to save the input datasets
        used for evaluating the residuals alongside the results.
        Default is False.

    :filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None; this routine must return None so that the HDF5 visit
        routine continues traversing the file.
    """
    def evaluate(ref_dset, test_dset):
        """
        Evaluate the image residual.
        Caters for boolean types.
        TODO: geobox intersection if dimensions are different.
        TODO: handle no data values
        TODO: handle classification datasets
        TODO: handle bitwise datasets
        """
        if ref_dset.dtype.name == 'bool':
            result = numpy.logical_xor(ref_dset, test_dset).astype('uint8')
        else:
            result = ref_dset[:] - test_dset
        return result

    class_name = 'IMAGE'
    ref_dset = ref_fid[pathname]
    test_dset = test_fid[pathname]

    # ignore no data values for the time being
    residual = evaluate(ref_dset, test_dset)
    min_residual = numpy.nanmin(residual)
    max_residual = numpy.nanmax(residual)
    pct_difference = (residual != 0).sum() / residual.size * 100

    if filter_opts is None:
        fopts = {}
    else:
        fopts = filter_opts.copy()
    fopts['chunks'] = ref_dset.chunks

    geobox = GriddedGeoBox.from_dataset(ref_dset)

    # output residual
    attrs = {
        'crs_wkt': geobox.crs.ExportToWkt(),
        'geotransform': geobox.transform.to_gdal(),
        'description': 'Residual',
        'min_residual': min_residual,
        'max_residual': max_residual,
        'percent_difference': pct_difference
    }

    base_dname = pbasename(pathname)
    group_name = ref_dset.parent.name.strip('/')
    dname = ppjoin('RESULTS', class_name, 'RESIDUALS', group_name, base_dname)
    write_h5_image(residual, dname, out_fid, compression, attrs, fopts)

    # residuals distribution
    h = distribution(residual)
    hist = h['histogram']

    attrs = {
        'description': 'Frequency distribution of the residuals',
        'omin': h['omin'],
        'omax': h['omax']
    }
    dtype = numpy.dtype([('bin_locations', h['loc'].dtype.name),
                         ('residuals_distribution', hist.dtype.name)])
    table = numpy.zeros(hist.shape, dtype=dtype)
    table['bin_locations'] = h['loc']
    table['residuals_distribution'] = hist

    # output
    del fopts['chunks']
    dname = ppjoin('RESULTS', class_name, 'FREQUENCY-DISTRIBUTIONS',
                   group_name, base_dname)
    write_h5_table(table, dname, out_fid, compression, attrs=attrs,
                   filter_opts=fopts)

    # cumulative distribution
    h = distribution(numpy.abs(residual))
    hist = h['histogram']
    cdf = numpy.cumsum(hist / hist.sum())

    attrs = {
        'description': 'Cumulative distribution of the residuals',
        'omin': h['omin'],
        'omax': h['omax'],
        '90th_percentile': h['loc'][numpy.searchsorted(cdf, 0.9)],
        '99th_percentile': h['loc'][numpy.searchsorted(cdf, 0.99)]
    }
    dtype = numpy.dtype([('bin_locations', h['loc'].dtype.name),
                         ('cumulative_distribution', cdf.dtype.name)])
    table = numpy.zeros(cdf.shape, dtype=dtype)
    table['bin_locations'] = h['loc']
    table['cumulative_distribution'] = cdf

    # output
    dname = ppjoin('RESULTS', class_name, 'CUMULATIVE-DISTRIBUTIONS',
                   group_name, base_dname)
    write_h5_table(table, dname, out_fid, compression=compression,
                   attrs=attrs, filter_opts=fopts)

    if save_inputs:
        # copy the reference data
        out_grp = out_fid.require_group(ppjoin('REFERENCE-DATA', group_name))
        ref_fid.copy(ref_dset, out_grp)

        # copy the test data
        out_grp = out_fid.require_group(ppjoin('TEST-DATA', group_name))
        test_fid.copy(test_dset, out_grp)