def test_dataframe_attributes(self):
    """
    Test the attributes that get created for a dataframe.
    """
    attrs = {'CLASS': 'TABLE',
             'FIELD_0_NAME': 'index',
             'FIELD_1_NAME': 'float_data',
             'FIELD_2_NAME': 'integer_data',
             'TITLE': 'Table',
             'VERSION': '0.2',
             'float_data_dtype': 'float64',
             'index_dtype': 'int64',
             'index_names': numpy.array(['index'], dtype=object),
             'integer_data_dtype': 'int64',
             'metadata': '`Pandas.DataFrame` converted to HDF5 compound datatype.',  # pylint: disable=line-too-long
             'nrows': 10,
             'python_type': '`Pandas.DataFrame`'}

    df = pandas.DataFrame(self.table_data)
    fname = 'test_dataframe_attributes.h5'
    # an explicit file mode is required by h5py when creating a new file
    with h5py.File(fname, 'w', **self.memory_kwargs) as fid:
        hdf5.write_dataframe(df, 'dataframe', fid)
        test = {k: v for k, v in fid['dataframe'].attrs.items()}
        self.assertDictEqual(test, attrs)
def test_dataframe_attributes(self):
    """
    Test the attributes that get created for a dataframe.
    """
    attrs = {
        "CLASS": "TABLE",
        "FIELD_0_NAME": "index",
        "FIELD_1_NAME": "float_data",
        "FIELD_2_NAME": "integer_data",
        "TITLE": "Table",
        "VERSION": "0.2",
        "float_data_dtype": "float64",
        "index_dtype": "int64",
        "index_names": numpy.array(["index"], dtype=object),
        "integer_data_dtype": "int64",
        "metadata": "`Pandas.DataFrame` converted to HDF5 compound datatype.",  # pylint: disable=line-too-long  # noqa: E501
        "nrows": 10,
        "python_type": "`Pandas.DataFrame`",
    }
    df = pandas.DataFrame(self.table_data)
    fname = "test_dataframe_attributes.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        hdf5.write_dataframe(df, "dataframe", fid)
        test = {k: v for k, v in fid["dataframe"].attrs.items()}
        self.assertDictEqual(test, attrs)
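# NOTE: the tests in this section reference a `table_data` fixture and
# `memory_kwargs` that are not defined here. A minimal sketch of what the
# fixture might look like follows; the class name and exact values are
# assumptions, inferred from the asserted attributes (nrows=10, float64,
# int64) and the in-memory `core` driver pattern used elsewhere in this
# section.
import unittest

import numpy


class TableTests(unittest.TestCase):  # hypothetical test-case class
    def setUp(self):
        # ten rows of float64 and int64 data, matching the asserted attrs
        self.table_data = {
            "float_data": numpy.random.random(10),
            "integer_data": numpy.random.randint(0, 10, (10,)),
        }
        # keep the test files in memory rather than on disk
        self.memory_kwargs = {"driver": "core", "backing_store": False}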
def convert(aerosol_path, output_filename, compression, filter_opts):
    """
    Converts all the .pix and .cmp files found in `aerosol_path`
    to a HDF5 file.
    """
    # define a case switch
    func = {'pix': read_pix, 'cmp': read_cmp}

    # create the output file
    with h5py.File(output_filename, 'w') as fid:
        pattern = ['*.pix', '*.cmp']
        for p in pattern:
            search = pjoin(aerosol_path, p)
            files = glob.glob(search)
            for fname in files:
                pth, ext = splitext(fname)
                ext = ext.split(".")[-1]
                grp_name = basename(pth)
                out_path = ppjoin(ext, grp_name)

                # read/write
                df, extents = func[ext](fname)
                attrs = {'extents': wkt.dumps(extents),
                         'source filename': fname}
                write_dataframe(df, out_path, fid, compression=compression,
                                attrs=attrs, filter_opts=filter_opts)
def table_results(table_group, compression=H5CompressionFilter.LZF,
                  filter_opts=None):
    """
    Combine the residual results of each TABLE Dataset into a
    single TABLE Dataset.
    """
    # potentially could just use visit...
    paths = find(table_group, 'TABLE')

    equivalent = []
    products = []
    name = []

    for pth in paths:
        dset = table_group[pth]
        equivalent.append(dset.attrs['equal'])
        products.append(pbasename(dset.parent.name))
        name.append(pbasename(dset.name))

    df = pandas.DataFrame({
        'product': products,
        'dataset_name': name,
        'equivalent': equivalent
    })

    # output
    write_dataframe(df, 'TABLE-EQUIVALENCY', table_group, compression,
                    title='EQUIVALENCY-RESULTS', filter_opts=filter_opts)
def scalar_results(scalar_group, compression=H5CompressionFilter.LZF, filter_opts=None):
    """
    Combine the residual results of each SCALAR Dataset into a
    single TABLE Dataset.
    """
    # potentially could just use visit...
    paths = find(scalar_group, "SCALAR")

    equivalent = []
    products = []
    name = []

    for pth in paths:
        dset = scalar_group[pth]
        equivalent.append(dset[()])
        products.append(pbasename(dset.parent.name))
        name.append(pbasename(dset.name))

    df = pandas.DataFrame(
        {"product": products, "dataset_name": name, "equivalent": equivalent}
    )

    # output
    write_dataframe(
        df,
        "SCALAR-EQUIVALENCY",
        scalar_group,
        compression,
        title="EQUIVALENCY-RESULTS",
        filter_opts=filter_opts,
    )
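# NOTE: `find` is used by the SCALAR, TABLE and IMAGE collation routines in
# this section but is not defined here. A minimal re-implementation for
# illustration only, assuming it returns the pathnames (relative to the
# given group) of all datasets whose CLASS attribute matches:
def find(h5_group, dataset_class=""):
    """Collect pathnames of datasets whose CLASS attribute matches."""
    paths = []

    def _visitor(name, obj):
        if not isinstance(obj, h5py.Dataset):
            return
        cls = obj.attrs.get("CLASS")
        if isinstance(cls, bytes):  # attributes may round-trip as bytes
            cls = cls.decode("utf-8")
        if cls == dataset_class:
            paths.append(name)

    h5_group.visititems(_visitor)
    return paths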
def run(aerosol_path, output_filename):
    """
    Converts all the .pix and .cmp files found in `aerosol_path`
    to a HDF5 file.
    """
    # define a case switch
    func = {"pix": read_pix, "cmp": read_cmp}

    # create the output file
    fid = h5py.File(output_filename, "w")

    pattern = ["*.pix", "*.cmp"]
    for p in pattern:
        search = pjoin(aerosol_path, p)
        files = glob.glob(search)
        for fname in files:
            pth, ext = splitext(fname)
            ext = ext.split(".")[-1]
            grp_name = basename(pth)
            out_path = ppjoin(ext, grp_name)

            # read/write
            df, extents = func[ext](fname)
            attrs = {"extents": wkt.dumps(extents), "source filename": fname}
            write_dataframe(df, out_path, fid, attrs=attrs)

    fid.close()
def image_results(image_group, compression=H5CompressionFilter.LZF,
                  filter_opts=None):
    """
    Combine the residual results of each IMAGE Dataset into a
    single TABLE Dataset.
    """
    # potentially could just use visit...
    img_paths = find(image_group, 'IMAGE')

    min_ = []
    max_ = []
    percent = []
    pct_90 = []
    pct_99 = []
    resid_paths = []
    hist_paths = []
    chist_paths = []
    products = []
    name = []

    for pth in img_paths:
        hist_pth = pth.replace('RESIDUALS', 'FREQUENCY-DISTRIBUTIONS')
        chist_pth = pth.replace('RESIDUALS', 'CUMULATIVE-DISTRIBUTIONS')

        resid_paths.append(ppjoin(image_group.name, pth))
        hist_paths.append(ppjoin(image_group.name, hist_pth))
        chist_paths.append(ppjoin(image_group.name, chist_pth))

        dset = image_group[pth]
        min_.append(dset.attrs['min_residual'])
        max_.append(dset.attrs['max_residual'])
        percent.append(dset.attrs['percent_difference'])
        products.append(pbasename(dset.parent.name))
        name.append(pbasename(dset.name))

        dset = image_group[chist_pth]
        pct_90.append(dset.attrs['90th_percentile'])
        pct_99.append(dset.attrs['99th_percentile'])

    df = pandas.DataFrame({
        'product': products,
        'dataset_name': name,
        'min_residual': min_,
        'max_residual': max_,
        'percent_difference': percent,
        '90th_percentile': pct_90,
        '99th_percentile': pct_99,
        'residual_image_pathname': resid_paths,
        'residual_histogram_pathname': hist_paths,
        'residual_cumulative_pathname': chist_paths
    })

    # output
    write_dataframe(df, 'IMAGE-RESIDUALS', image_group, compression,
                    title='RESIDUALS-TABLE', filter_opts=filter_opts)
def query(
    outdir,
    product_name_test,
    product_name_reference,
    db_env_test,
    db_env_reference,
    time,
    lon,
    lat,
    additional_filters,
):
    """
    Database querying of test and reference products.
    """
    outdir = Path(outdir)
    log_fname = outdir.joinpath(DirectoryNames.LOGS.value, LogNames.QUERY.value)

    if not log_fname.parent.exists():
        log_fname.parent.mkdir(parents=True)

    with open(log_fname, "w") as fobj:
        structlog.configure(
            logger_factory=structlog.PrintLoggerFactory(fobj),
            processors=LOG_PROCESSORS,
        )

        results = query_products(
            product_name_test,
            product_name_reference,
            db_env_test,
            db_env_reference,
            time,
            lon,
            lat,
            additional_filters,
        )

        results_fname = outdir.joinpath(
            DirectoryNames.RESULTS.value, FileNames.RESULTS.value
        )

        dataset_name = DatasetNames.QUERY.value

        _LOG.info(
            "saving results of query",
            out_fname=str(results_fname),
            dataset_name=dataset_name,
        )

        with h5py.File(str(results_fname), "w") as fid:
            write_dataframe(results, dataset_name, fid)
def test_dataframe_roundtrip(self):
    """
    Test that the pandas dataframe roundtrips, i.e. save to HDF5
    and is read back into a dataframe seamlessly.
    Float, integer, datetime and string datatypes will be tested.
    """
    df = pandas.DataFrame(self.table_data)
    df['timestamps'] = pandas.date_range('1/1/2000', periods=10, freq='D')
    df['string_data'] = ['period {}'.format(i) for i in range(10)]
    fname = 'test_dataframe_roundtrip.h5'
    # an explicit file mode is required by h5py when creating a new file
    with h5py.File(fname, 'w', **self.memory_kwargs) as fid:
        hdf5.write_dataframe(df, 'dataframe', fid)
        self.assertTrue(df.equals(hdf5.read_h5_table(fid, 'dataframe')))
def convert(aerosol_path, out_h5: h5py.Group, compression, filter_opts):
    """
    Converts all the .pix and .cmp files found in `aerosol_path`
    to a HDF5 file.
    """
    # define a case switch
    func = {"pix": read_pix, "cmp": read_cmp}

    dataset_names = []
    metadata = []
    pattern = ["*.pix", "*.cmp"]
    for p in pattern:
        for search_path in aerosol_path.glob(p):
            _path = search_path.resolve()
            fname, ext = _path.stem, _path.suffix[1:]  # exclude the period from ext
            out_path = ppjoin(ext, fname)

            # read/write
            df, extents = func[ext](_path)

            # src checksum; used to help derive fallback uuid
            with _path.open("rb") as src:
                src_checksum = generate_md5sum(src).hexdigest()

            attrs = {"extents": wkt.dumps(extents), "source filename": str(_path)}
            write_dataframe(
                df,
                out_path,
                out_h5,
                compression=compression,
                attrs=attrs,
                filter_opts=filter_opts,
            )
            dataset_names.append(out_path)
            metadata.append(
                {
                    "id": str(
                        generate_fallback_uuid(
                            PRODUCT_HREF, path=str(_path.stem), md5=src_checksum
                        )
                    )
                }
            )

    return metadata, dataset_names
def _convert_4d(rds, fid, dataset_name, compression, filter_opts):
    """
    Private routine for converting the multiples of 37 layer
    atmospheric data in the GRIB file to HDF5.
    For a month's worth of data, the dimensions become:

        * (day, atmospheric level, y, x)
    """
    attrs = {
        "geotransform": rds.transform.to_gdal(),
        "crs_wkt": rds.crs.wkt,
        "history": "Converted to HDF5",
    }

    # band groups of 37, nrows to process (ytile)
    band_groups = range(1, rds.count + 1, 37)
    ytile = filter_opts["chunks"][2]
    dims = (len(band_groups), 37, rds.height, rds.width)

    # the tiles are re-used for every band group, so materialise the generator
    tiles = list(generate_tiles(rds.width, rds.height, rds.width, ytile))

    # dataset creation options
    kwargs = compression.config(**filter_opts).dataset_compression_kwargs()
    kwargs["shape"] = dims
    kwargs["dtype"] = rds.dtypes[0]

    dataset = fid.create_dataset(dataset_name, **kwargs)
    attach_image_attributes(dataset, attrs)

    # add dimension labels, but should we also include dimension scales?
    dataset.dims[0].label = "Day"
    dataset.dims[1].label = "Atmospheric Level"
    dataset.dims[2].label = "Y"
    dataset.dims[3].label = "X"

    # process by spatial tile containing 37 atmospheric layers for 1 day
    for i, bg in enumerate(band_groups):
        bands = list(range(bg, bg + 37))
        for tile in tiles:
            idx = (
                i,  # index the i'th day; rds.read returns a (level, y, x) block
                slice(None),
                slice(tile[0][0], tile[0][1]),
                slice(tile[1][0], tile[1][1]),
            )
            dataset[idx] = rds.read(bands, window=tile)

    # metadata
    metadata = metadata_dataframe(rds)
    write_dataframe(metadata, "METADATA", fid, compression)
def test_write_dataframe(self):
    """
    Test the write_dataframe function.
    """
    df = pandas.DataFrame(self.table_data)
    fname = "test_write_dataframe.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        self.assertIsNone(hdf5.write_dataframe(df, "dataframe", fid))
def query_filesystem(
    outdir,
    product_pathname_test,
    product_pathname_reference,
    glob_pattern_test,
    glob_pattern_reference,
):
    """
    Filesystem querying of test and reference products.
    """
    outdir = Path(outdir)
    log_fname = outdir.joinpath(DirectoryNames.LOGS.value, LogNames.QUERY.value)

    if not log_fname.parent.exists():
        log_fname.parent.mkdir(parents=True)

    with open(log_fname, "w") as fobj:
        structlog.configure(logger_factory=structlog.PrintLoggerFactory(fobj))

        results = query_via_filepath(
            product_pathname_test,
            product_pathname_reference,
            glob_pattern_test,
            glob_pattern_reference,
        )

        results_fname = outdir.joinpath(
            DirectoryNames.RESULTS.value, FileNames.RESULTS.value
        )

        dataset_name = DatasetNames.QUERY.value

        _LOG.info(
            "saving results of query",
            out_fname=str(results_fname),
            dataset_name=dataset_name,
        )

        if not results_fname.parent.exists():
            results_fname.parent.mkdir(parents=True)

        with h5py.File(str(results_fname), "w") as fid:
            write_dataframe(results, dataset_name, fid)
def _convert_3d(rds, fid, dataset_name, compression, filter_opts):
    """
    Private routine for converting the 37 layer atmospheric data
    in the GRIB file to HDF5.
    """
    # basic metadata to attach to the dataset
    attrs = {
        'geotransform': rds.transform.to_gdal(),
        'crs_wkt': rds.crs.wkt,
        'history': 'Converted to HDF5'
    }

    # bands list, nrows to process (ytile)
    bands = list(range(1, rds.count + 1))
    ytile = filter_opts['chunks'][1]
    dims = (rds.count, rds.height, rds.width)

    # dataset creation options
    kwargs = compression.config(**filter_opts).dataset_compression_kwargs()
    kwargs['shape'] = dims
    kwargs['dtype'] = rds.dtypes[0]

    dataset = fid.create_dataset(dataset_name, **kwargs)
    attach_image_attributes(dataset, attrs)

    # add dimension labels, but should we also include dimension scales?
    dataset.dims[0].label = 'Atmospheric Level'
    dataset.dims[1].label = 'Y'
    dataset.dims[2].label = 'X'

    # process by tile
    for tile in generate_tiles(rds.width, rds.height, rds.width, ytile):
        idx = (
            slice(None),
            slice(tile[0][0], tile[0][1]),
            slice(tile[1][0], tile[1][1])
        )
        dataset[idx] = rds.read(bands, window=tile)

    # metadata
    metadata = metadata_dataframe(rds)
    write_dataframe(metadata, 'METADATA', fid, compression)
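# NOTE: `generate_tiles` is used by the GRIB converters above but is not
# shown in this section. A sketch of the assumed semantics, yielding
# ((ystart, yend), (xstart, xend)) index windows (a form rasterio also
# accepts for `window=`); with xtile set to the full image width, as the
# callers above do, the tiles are full-width row strips:
def generate_tiles(samples, lines, xtile, ytile):
    """Yield ((ystart, yend), (xstart, xend)) windows covering the image."""
    for ystart in range(0, lines, ytile):
        yend = min(ystart + ytile, lines)
        for xstart in range(0, samples, xtile):
            xend = min(xstart + xtile, samples)
            yield ((ystart, yend), (xstart, xend))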
def _convert_2d(rds, fid, dataset_name, compression, filter_opts):
    """
    Private routine for converting the 2D GRIB file to HDF5.
    """
    attrs = {
        'geotransform': rds.transform.to_gdal(),
        'crs_wkt': rds.crs.wkt,
        'history': 'Converted to HDF5'
    }
    data = rds.read(1)
    write_h5_image(data, dataset_name, fid, compression, attrs, filter_opts)

    # add dimension labels, but should we also include dimension scales?
    dataset = fid[dataset_name]
    dataset.dims[0].label = 'Y'
    dataset.dims[1].label = 'X'

    # metadata
    metadata = metadata_dataframe(rds)
    write_dataframe(metadata, 'METADATA', fid, compression)
def test_dataframe_roundtrip(self):
    """
    Test that the pandas dataframe roundtrips, i.e. save to HDF5
    and is read back into a dataframe seamlessly.
    Float, integer, datetime and string datatypes will be tested.
    """
    df = pandas.DataFrame(self.table_data)
    df["timestamps"] = pandas.date_range("1/1/2000", periods=10, freq="D", tz="UTC")
    df["string_data"] = ["period {}".format(i) for i in range(10)]
    fname = "test_dataframe_roundtrip.h5"
    with h5py.File(fname, "w", **self.memory_kwargs) as fid:
        hdf5.write_dataframe(df, "dataframe", fid)

        # Apply conversion to no timezone that occurs in serialisation to hdf5
        # Numpy is timezone naive; pandas has timezone support
        df["timestamps"] = df["timestamps"].dt.tz_convert(None)

        self.assertTrue(df.equals(hdf5.read_h5_table(fid, "dataframe")))
def comparison(outdir: Union[str, Path], proc_info: bool) -> None:
    """
    Test and Reference product intercomparison evaluation.
    """
    outdir = Path(outdir)
    if proc_info:
        log_fname = outdir.joinpath(
            DirectoryNames.LOGS.value, LogNames.PROC_INFO_INTERCOMPARISON.value
        )
    else:
        log_fname = outdir.joinpath(
            DirectoryNames.LOGS.value, LogNames.MEASUREMENT_INTERCOMPARISON.value
        )

    out_stream = MPIStreamIO(str(log_fname))
    structlog.configure(
        processors=DEFAULT_PROCESSORS, logger_factory=MPILoggerFactory(out_stream)
    )

    # processor info
    rank = COMM.Get_rank()
    n_processors = COMM.Get_size()

    results_fname = outdir.joinpath(
        DirectoryNames.RESULTS.value, FileNames.RESULTS.value
    )

    with h5py.File(str(results_fname), "r") as fid:
        dataframe = read_h5_table(fid, DatasetNames.QUERY.value)

    if rank == 0:
        index = dataframe.index.values.tolist()
        blocks = scatter(index, n_processors)

        # some basic attribute information
        doc: Union[Granule, None] = load_odc_metadata(
            Path(dataframe.iloc[0].yaml_pathname_reference)
        )
        attrs: Dict[str, Any] = {
            "framing": doc.framing,
            "thematic": False,
            "proc-info": False,
        }
    else:
        blocks = None
        doc = None
        attrs = dict()

    COMM.Barrier()

    # equally partition the work across all processors
    indices = COMM.scatter(blocks, root=0)

    if proc_info:
        attrs["proc-info"] = True
        if rank == 0:
            _LOG.info("processing proc-info documents")

        gqa_dataframe, ancillary_dataframe = _process_proc_info(
            dataframe.iloc[indices], rank
        )

        if rank == 0:
            _LOG.info("saving gqa dataframe results to tables")

            if not results_fname.parent.exists():
                results_fname.parent.mkdir(parents=True)

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(
                    DatasetGroups.INTERCOMPARISON.value,
                    DatasetNames.GQA_RESULTS.value,
                )
                write_dataframe(gqa_dataframe, str(dataset_name), fid, attrs=attrs)

            _LOG.info("saving ancillary dataframe results to tables")

            if not results_fname.parent.exists():
                results_fname.parent.mkdir(parents=True)

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(
                    DatasetGroups.INTERCOMPARISON.value,
                    DatasetNames.ANCILLARY_RESULTS.value,
                )
                write_dataframe(
                    ancillary_dataframe, str(dataset_name), fid, attrs=attrs
                )

            _LOG.info("saving software versions dataframe to tables")

            with h5py.File(str(results_fname), "a") as fid:
                dataset_name = PPath(DatasetNames.SOFTWARE_VERSIONS.value)
                software_attrs = {"description": "ARD Pipeline software versions"}
                software_df = compare_proc_info.compare_software(dataframe)
                write_dataframe(
                    software_df, str(dataset_name), fid, attrs=software_attrs
                )
    else:
        if rank == 0:
            _LOG.info("processing odc-metadata documents")

        results = _process_odc_doc(dataframe.iloc[indices], rank)

        if rank == 0:
            # save each table
            _LOG.info("saving dataframes to tables")

            with h5py.File(str(results_fname), "a") as fid:
                attrs["thematic"] = False
                write_dataframe(
                    results[0],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.GENERAL_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

                attrs["thematic"] = True
                write_dataframe(
                    results[1],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.FMASK_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

                write_dataframe(
                    results[2],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.CONTIGUITY_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

                write_dataframe(
                    results[3],
                    str(
                        PPath(
                            DatasetGroups.INTERCOMPARISON.value,
                            DatasetNames.SHADOW_RESULTS.value,
                        )
                    ),
                    fid,
                    attrs=attrs,
                )

    if rank == 0:
        workflow = "proc-info field" if proc_info else "product measurement"
        msg = f"{workflow} comparison processing finished"
        _LOG.info(msg)
def convert_file(fname, out_h5: h5py.Group, compression,
                 filter_opts: Optional[Dict] = None):
    """
    Convert a PR_WTR NetCDF file into HDF5.

    :param fname:
        A str containing the PR_WTR filename.

    :param out_h5:
        A h5py.Group to write output datasets to.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None. Content is written directly to disk.
    """
    with rasterio.open(fname) as ds:
        name_fmt = "BAND-{}"

        # global attributes
        # TODO update the history attrs
        # TODO remove the NC_GLOBAL str and just have plain attr names
        g_attrs = ds.tags()

        # get timestamp info
        origin = g_attrs.pop("time#units").replace("hours since ", "")
        hours = json.loads(
            g_attrs.pop("NETCDF_DIM_time_VALUES").replace("{", "[").replace("}", "]")
        )
        df = pandas.DataFrame(
            {
                "timestamp": pandas.to_datetime(hours, unit="h", origin=origin),
                "band_name": [name_fmt.format(i + 1) for i in range(ds.count)],
            }
        )
        df["dataset_name"] = df.timestamp.dt.strftime("%Y/%B-%d/%H%M")
        df["dataset_name"] = df["dataset_name"].str.upper()

        # create a timestamp and band name index table dataset
        desc = "Timestamp and Band Name index information."
        attrs = {"description": desc}
        write_dataframe(df, "INDEX", out_h5, compression, attrs=attrs)

        attach_attributes(out_h5, g_attrs)

        # process every band
        for i in range(1, ds.count + 1):
            ds_name = df.iloc[i - 1].dataset_name

            # create empty or copy the user supplied filter options
            if not filter_opts:
                f_opts = dict()
            else:
                f_opts = filter_opts.copy()

            # band attributes
            # TODO remove NETCDF tags
            # TODO add fillvalue attr
            attrs = ds.tags(i)
            attrs["timestamp"] = df.iloc[i - 1]["timestamp"].replace(
                tzinfo=timezone.utc
            )
            attrs["band_name"] = df.iloc[i - 1]["band_name"]
            attrs["geotransform"] = ds.transform.to_gdal()
            attrs["crs_wkt"] = CRS.ExportToWkt()

            # use ds native chunks if none are provided
            if "chunks" not in f_opts:
                try:
                    # block_shapes is a zero-based list of (height, width)
                    f_opts["chunks"] = ds.block_shapes[i - 1]
                except IndexError:
                    print("Chunk error: {}".format(fname))
                    f_opts["chunks"] = (73, 144)

            # write to disk as an IMAGE Class Dataset
            write_h5_image(
                ds.read(i),
                ds_name,
                out_h5,
                attrs=attrs,
                compression=compression,
                filter_opts=f_opts,
            )
def calculate_coefficients(atmospheric_results_group, out_group,
                           compression=H5CompressionFilter.LZF,
                           filter_opts=None):
    """
    Calculate the atmospheric coefficients from the MODTRAN output,
    which are used in the BRDF and atmospheric correction.
    Coefficients are computed for each band, for each coordinate, and
    for each atmospheric coefficient. The atmospheric coefficients can
    be found in `Workflow.STANDARD.atmos_coefficients`.

    :param atmospheric_results_group:
        The root HDF5 `Group` that contains the atmospheric results
        from each MODTRAN run.

    :param out_group:
        If set to None (default) then the results will be returned
        as an in-memory hdf5 file, i.e. the `core` driver. Otherwise,
        a writeable HDF5 `Group` object.
        The datasets will be formatted to the HDF5 TABLE specification
        and the dataset names will be as follows:

        * DatasetName.NBAR_COEFFICIENTS (if Workflow.STANDARD or Workflow.NBAR)
        * DatasetName.SBT_COEFFICIENTS (if Workflow.STANDARD or Workflow.SBT)

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        An opened `h5py.File` object, that is either in-memory using the
        `core` driver, or on disk.
    """
    nbar_coefficients = pd.DataFrame()
    sbt_coefficients = pd.DataFrame()

    channel_data = channel_solar_angle = upward = downward = None

    # Initialise the output group/file
    if out_group is None:
        fid = h5py.File('atmospheric-coefficients.h5', 'w', driver='core',
                        backing_store=False)
    else:
        fid = out_group

    res = atmospheric_results_group
    npoints = res.attrs['npoints']
    nbar_atmos = res.attrs['nbar_atmospherics']
    sbt_atmos = res.attrs['sbt_atmospherics']

    for point in range(npoints):
        point_grp = res[POINT_FMT.format(p=point)]
        lonlat = point_grp.attrs['lonlat']
        timestamp = pd.to_datetime(point_grp.attrs['datetime'])
        grp_path = ppjoin(POINT_FMT.format(p=point), ALBEDO_FMT)

        if nbar_atmos:
            channel_path = ppjoin(grp_path.format(a=Albedos.ALBEDO_0.value),
                                  DatasetName.CHANNEL.value)
            channel_data = read_h5_table(res, channel_path)

            channel_solar_angle_path = ppjoin(
                grp_path.format(a=Albedos.ALBEDO_0.value),
                DatasetName.SOLAR_ZENITH_CHANNEL.value
            )
            channel_solar_angle = read_h5_table(res, channel_solar_angle_path)

        if sbt_atmos:
            dname = ppjoin(grp_path.format(a=Albedos.ALBEDO_TH.value),
                           DatasetName.UPWARD_RADIATION_CHANNEL.value)
            upward = read_h5_table(res, dname)

            dname = ppjoin(grp_path.format(a=Albedos.ALBEDO_TH.value),
                           DatasetName.DOWNWARD_RADIATION_CHANNEL.value)
            downward = read_h5_table(res, dname)

        kwargs = {'channel_data': channel_data,
                  'solar_zenith_angle': channel_solar_angle,
                  'upward_radiation': upward,
                  'downward_radiation': downward}

        result = coefficients(**kwargs)

        # insert some datetime/geospatial fields
        if result[0] is not None:
            result[0].insert(0, 'POINT', point)
            result[0].insert(1, 'LONGITUDE', lonlat[0])
            result[0].insert(2, 'LATITUDE', lonlat[1])
            result[0].insert(3, 'DATETIME', timestamp)
            nbar_coefficients = nbar_coefficients.append(result[0])

        if result[1] is not None:
            result[1].insert(0, 'POINT', point)
            result[1].insert(1, 'LONGITUDE', lonlat[0])
            result[1].insert(2, 'LATITUDE', lonlat[1])
            result[1].insert(3, 'DATETIME', pd.to_datetime(timestamp))
            sbt_coefficients = sbt_coefficients.append(result[1])

    nbar_coefficients.reset_index(inplace=True)
    sbt_coefficients.reset_index(inplace=True)

    attrs = {'npoints': npoints}
    description = "Coefficients derived from the VNIR solar irradiation."
    attrs['description'] = description
    dname = DatasetName.NBAR_COEFFICIENTS.value

    if GroupName.COEFFICIENTS_GROUP.value not in fid:
        fid.create_group(GroupName.COEFFICIENTS_GROUP.value)

    group = fid[GroupName.COEFFICIENTS_GROUP.value]

    if nbar_atmos:
        write_dataframe(nbar_coefficients, dname, group, compression,
                        attrs=attrs, filter_opts=filter_opts)

    description = "Coefficients derived from the THERMAL solar irradiation."
    attrs['description'] = description
    dname = DatasetName.SBT_COEFFICIENTS.value

    if sbt_atmos:
        write_dataframe(sbt_coefficients, dname, group, compression,
                        attrs=attrs, filter_opts=filter_opts)

    if out_group is None:
        return fid
def run_modtran(acquisitions, atmospherics_group, workflow, npoints, point,
                albedos, modtran_exe, basedir, out_group,
                compression=H5CompressionFilter.LZF, filter_opts=None):
    """
    Run MODTRAN and channel results.
    """
    lonlat = atmospherics_group[POINT_FMT.format(p=point)].attrs['lonlat']

    # determine the output group/file
    if out_group is None:
        fid = h5py.File('atmospheric-results.h5', 'w', driver='core',
                        backing_store=False)
    else:
        fid = out_group

    # initial attributes
    base_attrs = {'Point': point,
                  'lonlat': lonlat,
                  'datetime': acquisitions[0].acquisition_datetime}

    base_path = ppjoin(GroupName.ATMOSPHERIC_RESULTS_GRP.value,
                       POINT_FMT.format(p=point))

    # what atmospheric calculations have been run and how many points
    group_name = GroupName.ATMOSPHERIC_RESULTS_GRP.value
    if group_name not in fid:
        fid.create_group(group_name)

    fid[group_name].attrs['npoints'] = npoints
    applied = workflow in (Workflow.STANDARD, Workflow.NBAR)
    fid[group_name].attrs['nbar_atmospherics'] = applied
    applied = workflow in (Workflow.STANDARD, Workflow.SBT)
    fid[group_name].attrs['sbt_atmospherics'] = applied

    acqs = acquisitions

    for albedo in albedos:
        base_attrs['Albedo'] = albedo.value
        workpath = pjoin(basedir, POINT_FMT.format(p=point),
                         ALBEDO_FMT.format(a=albedo.value))
        json_mod_infile = pjoin(workpath, ''.join(
            [POINT_ALBEDO_FMT.format(p=point, a=albedo.value), '.json']))

        group_path = ppjoin(base_path, ALBEDO_FMT.format(a=albedo.value))

        subprocess.check_call([modtran_exe, json_mod_infile], cwd=workpath)

        chn_fname = glob.glob(pjoin(workpath, '*.chn'))[0]
        tp6_fname = glob.glob(pjoin(workpath, '*.tp6'))[0]

        if albedo == Albedos.ALBEDO_TH:
            acq = [acq for acq in acqs if
                   acq.band_type == BandType.THERMAL][0]

            channel_data = read_modtran_channel(chn_fname, tp6_fname, acq,
                                                albedo)

            # upward radiation
            attrs = base_attrs.copy()
            dataset_name = DatasetName.UPWARD_RADIATION_CHANNEL.value
            attrs['description'] = ('Upward radiation channel output from '
                                    'MODTRAN')
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data[0], dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

            # downward radiation
            attrs = base_attrs.copy()
            dataset_name = DatasetName.DOWNWARD_RADIATION_CHANNEL.value
            attrs['description'] = ('Downward radiation channel output from '
                                    'MODTRAN')
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data[1], dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)
        else:
            acq = [acq for acq in acqs if
                   acq.band_type == BandType.REFLECTIVE][0]

            # Will require updating to handle JSON output from modtran
            channel_data = read_modtran_channel(chn_fname, tp6_fname, acq,
                                                albedo)

            # channel output
            attrs = base_attrs.copy()
            dataset_name = DatasetName.CHANNEL.value
            attrs['description'] = 'Channel output from MODTRAN'
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data[0], dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

            # solar zenith angle at surface
            attrs = base_attrs.copy()
            dataset_name = DatasetName.SOLAR_ZENITH_CHANNEL.value
            attrs['description'] = ('Solar zenith angle at different '
                                    'atmosphere levels')
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data[1], dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

    # metadata for a given point
    alb_vals = [alb.value for alb in workflow.albedos]
    fid[base_path].attrs['lonlat'] = lonlat
    fid[base_path].attrs['datetime'] = acqs[0].acquisition_datetime.isoformat()
    fid[base_path].attrs.create('albedos', data=alb_vals, dtype=VLEN_STRING)

    if out_group is None:
        return fid
def convert_file(fname, out_fname, compression, filter_opts):
    """
    Convert a PR_WTR NetCDF file into HDF5.

    :param fname:
        A str containing the PR_WTR filename.

    :param out_fname:
        A str containing the output filename for the HDF5 file.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None. Content is written directly to disk.
    """
    with h5py.File(out_fname, 'w') as fid:
        with rasterio.open(fname) as ds:
            name_fmt = 'BAND-{}'

            # global attributes
            # TODO update the history attrs
            # TODO remove the NC_GLOBAL str and just have plain attr names
            g_attrs = ds.tags()

            # get timestamp info
            origin = g_attrs.pop('time#units').replace('hours since ', '')
            hours = json.loads(
                g_attrs.pop('NETCDF_DIM_time_VALUES').replace('{', '[').replace('}', ']')
            )
            df = pandas.DataFrame(
                {
                    'timestamp': pandas.to_datetime(hours, unit='h', origin=origin),
                    'band_name': [name_fmt.format(i + 1) for i in range(ds.count)]
                }
            )
            df['dataset_name'] = df.timestamp.dt.strftime('%Y/%B-%d/%H%M')
            df['dataset_name'] = df['dataset_name'].str.upper()

            # create a timestamp and band name index table dataset
            desc = "Timestamp and Band Name index information."
            attrs = {
                'description': desc
            }
            write_dataframe(df, 'INDEX', fid, compression, attrs=attrs)

            attach_attributes(fid, g_attrs)

            # process every band
            for i in range(1, ds.count + 1):
                ds_name = df.iloc[i - 1].dataset_name

                # create empty or copy the user supplied filter options
                if not filter_opts:
                    f_opts = dict()
                else:
                    f_opts = filter_opts.copy()

                # band attributes
                # TODO remove NETCDF tags
                # TODO add fillvalue attr
                attrs = ds.tags(i)
                attrs['timestamp'] = df.iloc[i - 1]['timestamp']
                attrs['band_name'] = df.iloc[i - 1]['band_name']
                attrs['geotransform'] = ds.transform.to_gdal()
                attrs['crs_wkt'] = CRS.ExportToWkt()

                # use ds native chunks if none are provided
                if 'chunks' not in f_opts:
                    try:
                        # block_shapes is a zero-based list of (height, width)
                        f_opts['chunks'] = ds.block_shapes[i - 1]
                    except IndexError:
                        print("Chunk error: {}".format(fname))
                        f_opts['chunks'] = (73, 144)

                # write to disk as an IMAGE Class Dataset
                write_h5_image(ds.read(i), ds_name, fid, attrs=attrs,
                               compression=compression, filter_opts=f_opts)
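# NOTE: the INDEX table written by the converters above can be used to
# locate the band dataset for a given timestamp when reading the file back.
# A small sketch under that assumption; the function name and filename
# argument are hypothetical:
def band_for_timestamp(h5_fname, when):
    """Return the image array whose INDEX timestamp matches `when` exactly."""
    with h5py.File(h5_fname, "r") as fid:
        index = read_h5_table(fid, "INDEX")
        # each row carries a dataset_name such as '2018/JANUARY-01/0000'
        row = index[index.timestamp == when].iloc[0]
        return fid[row.dataset_name][:]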
def table_residual(ref_fid, test_fid, pathname, out_fid,
                   compression=H5CompressionFilter.LZF, save_inputs=False,
                   filter_opts=None):
    """
    Output a residual TABLE of the numerical columns, ignoring
    columns with the dtype `object`.
    An equivalency test using `pandas.DataFrame.equals` is also
    undertaken which if False, requires further investigation to
    determine the column(s) and row(s) that are different.

    :param ref_fid:
        A h5py file object (essentially the root Group), containing
        the reference data.

    :param test_fid:
        A h5py file object (essentially the root Group), containing
        the test data.

    :param pathname:
        A `str` containing the pathname to the TABLE Dataset.

    :param out_fid:
        A h5py file object (essentially the root Group), opened for
        writing the output data.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param save_inputs:
        A `bool` indicating whether or not to save the input datasets
        used for evaluating the residuals alongside the results.
        Default is False.

    :param filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        None; This routine will only return None or a print statement,
        this is essential for the HDF5 visit routine.
    """
    class_name = 'TABLE'
    ref_df = read_h5_table(ref_fid, pathname)
    test_df = read_h5_table(test_fid, pathname)

    # ignore any `object` dtype columns (mostly just strings)
    cols = [col for col in ref_df.columns
            if ref_df[col].dtype.name != 'object']

    # difference and pandas.DataFrame.equals test
    df = ref_df[cols] - test_df[cols]
    equal = test_df.equals(ref_df)

    # ignored cols
    cols = [col for col in ref_df.columns
            if ref_df[col].dtype.name == 'object']

    # output
    attrs = {
        'description': 'Residuals of numerical columns only',
        'columns_ignored': numpy.array(cols, VLEN_STRING),
        'equivalent': equal
    }
    base_dname = pbasename(pathname)
    group_name = ref_fid[pathname].parent.name.strip('/')
    dname = ppjoin('RESULTS', class_name, 'RESIDUALS', group_name, base_dname)
    write_dataframe(df, dname, out_fid, compression, attrs=attrs,
                    filter_opts=filter_opts)

    if save_inputs:
        # copy the reference data
        out_grp = out_fid.require_group(ppjoin('REFERENCE-DATA', group_name))
        ref_fid.copy(ref_fid[pathname], out_grp)

        # copy the test data
        out_grp = out_fid.require_group(ppjoin('TEST-DATA', group_name))
        test_fid.copy(test_fid[pathname], out_grp)
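# NOTE: the docstring above states that returning None is essential for the
# HDF5 visit routine. A minimal usage sketch under that assumption,
# filtering to TABLE datasets via the CLASS attribute; the real dispatch
# logic is not shown in this section and the filenames are hypothetical:
def compare_tables(ref_fname, test_fname, out_fname):
    with h5py.File(ref_fname, "r") as ref_fid, \
            h5py.File(test_fname, "r") as test_fid, \
            h5py.File(out_fname, "w") as out_fid:

        def _visitor(name, obj):
            cls = obj.attrs.get("CLASS")
            if isinstance(cls, bytes):  # attributes may round-trip as bytes
                cls = cls.decode("utf-8")
            if isinstance(obj, h5py.Dataset) and cls == "TABLE":
                table_residual(ref_fid, test_fid, name, out_fid)

        ref_fid.visititems(_visitor)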
def collate(outdir: Union[str, Path]) -> None:
    """
    Collate the results of the product comparison.
    First the results are merged with the framing geometry, and then
    they're summarised.
    """
    outdir = Path(outdir)
    log_fname = outdir.joinpath(DirectoryNames.LOGS.value, LogNames.COLLATE.value)

    if not log_fname.parent.exists():
        log_fname.parent.mkdir(parents=True)

    with open(log_fname, "w") as fobj:
        structlog.configure(
            logger_factory=structlog.PrintLoggerFactory(fobj),
            processors=LOG_PROCESSORS,
        )

        comparison_results_fname = outdir.joinpath(
            DirectoryNames.RESULTS.value, FileNames.RESULTS.value
        )

        _LOG.info(
            "opening intercomparison results file",
            fname=str(comparison_results_fname),
        )

        with h5py.File(str(comparison_results_fname), "a") as fid:
            grp = fid[DatasetGroups.INTERCOMPARISON.value]

            for dataset_name in grp:
                _LOG.info("reading dataset", dataset_name=dataset_name)
                dataframe = read_h5_table(grp, dataset_name)

                # some important attributes
                framing = grp[dataset_name].attrs["framing"]
                thematic = grp[dataset_name].attrs["thematic"]
                proc_info = grp[dataset_name].attrs["proc-info"]

                _LOG.info(
                    "merging results with framing",
                    framing=framing,
                    dataset_name=dataset_name,
                )
                geo_dataframe = merge_framing(dataframe, framing)

                out_fname = outdir.joinpath(
                    DirectoryNames.RESULTS.value,
                    FileNames[MergeLookup[DatasetNames(dataset_name).name].value].value,
                )

                _LOG.info("saving as GeoJSON", out_fname=str(out_fname))
                geo_dataframe.to_file(str(out_fname), driver="GeoJSONSeq")

                _LOG.info("summarising")
                summary_dataframe = summarise(geo_dataframe, thematic, proc_info)

                out_dname = PPath(
                    DatasetGroups.SUMMARY.value,
                    DatasetNames[SummaryLookup[DatasetNames(dataset_name).name].value].value,
                )

                _LOG.info("saving summary table", out_dataset_name=str(out_dname))
                write_dataframe(summary_dataframe, str(out_dname), fid)
def collect_sbt_ancillary(
    acquisition,
    lonlats,
    ancillary_path,
    invariant_fname=None,
    out_group=None,
    compression=H5CompressionFilter.LZF,
    filter_opts=None,
):
    """
    Collects the ancillary data required for surface brightness
    temperature.

    :param acquisition:
        An instance of an `Acquisition` object.

    :param lonlats:
        A `list` of tuples containing (longitude, latitude) coordinates.

    :param ancillary_path:
        A `str` containing the directory pathname to the ECMWF
        ancillary data.

    :param invariant_fname:
        A `str` containing the file pathname to the invariant
        geopotential data.

    :param out_group:
        If set to None (default) then the results will be returned
        as an in-memory hdf5 file, i.e. the `core` driver. Otherwise,
        a writeable HDF5 `Group` object.

    :param compression:
        The compression filter to use.
        Default is H5CompressionFilter.LZF

    :param filter_opts:
        A dict of key value pairs available to the given configuration
        instance of H5CompressionFilter. For example
        H5CompressionFilter.LZF has the keywords *chunks* and *shuffle*
        available.
        Default is None, which will use the default settings for the
        chosen H5CompressionFilter instance.

    :return:
        An opened `h5py.File` object, that is either in-memory using the
        `core` driver, or on disk.
    """
    # Initialise the output files
    if out_group is None:
        fid = h5py.File("sbt-ancillary.h5", "w", driver="core", backing_store=False)
    else:
        fid = out_group

    fid.attrs["sbt-ancillary"] = True

    dt = acquisition.acquisition_datetime

    description = ("Combined Surface and Pressure Layer data retrieved from "
                   "the ECWMF catalogue.")
    attrs = {"description": description, "Date used for querying ECWMF": dt}

    for i, lonlat in enumerate(lonlats):
        pnt = POINT_FMT.format(p=i)

        # get data located at the surface
        dew = ecwmf_dewpoint_temperature(ancillary_path, lonlat, dt)
        t2m = ecwmf_temperature_2metre(ancillary_path, lonlat, dt)
        sfc_prs = ecwmf_surface_pressure(ancillary_path, lonlat, dt)
        sfc_hgt = ecwmf_elevation(invariant_fname, lonlat)
        sfc_rh = relative_humdity(t2m[0], dew[0])

        # output the scalar data along with the attrs
        dname = ppjoin(pnt, DatasetName.DEWPOINT_TEMPERATURE.value)
        write_scalar(dew[0], dname, fid, dew[1])

        dname = ppjoin(pnt, DatasetName.TEMPERATURE_2M.value)
        write_scalar(t2m[0], dname, fid, t2m[1])

        dname = ppjoin(pnt, DatasetName.SURFACE_PRESSURE.value)
        write_scalar(sfc_prs[0], dname, fid, sfc_prs[1])

        dname = ppjoin(pnt, DatasetName.SURFACE_GEOPOTENTIAL.value)
        write_scalar(sfc_hgt[0], dname, fid, sfc_hgt[1])

        dname = ppjoin(pnt, DatasetName.SURFACE_RELATIVE_HUMIDITY.value)
        attrs = {"description": "Relative Humidity calculated at the surface"}
        write_scalar(sfc_rh, dname, fid, attrs)

        # get the data from each of the pressure levels (1 -> 1000 ISBL)
        gph = ecwmf_geo_potential(ancillary_path, lonlat, dt)
        tmp = ecwmf_temperature(ancillary_path, lonlat, dt)
        rh = ecwmf_relative_humidity(ancillary_path, lonlat, dt)

        dname = ppjoin(pnt, DatasetName.GEOPOTENTIAL.value)
        write_dataframe(gph[0], dname, fid, compression, attrs=gph[1],
                        filter_opts=filter_opts)

        dname = ppjoin(pnt, DatasetName.TEMPERATURE.value)
        write_dataframe(tmp[0], dname, fid, compression, attrs=tmp[1],
                        filter_opts=filter_opts)

        dname = ppjoin(pnt, DatasetName.RELATIVE_HUMIDITY.value)
        write_dataframe(rh[0], dname, fid, compression, attrs=rh[1],
                        filter_opts=filter_opts)

        # combine the surface and higher pressure layers into a single array
        cols = ["GeoPotential_Height", "Pressure", "Temperature",
                "Relative_Humidity"]
        layers = pandas.DataFrame(columns=cols,
                                  index=range(rh[0].shape[0]),
                                  dtype="float64")

        layers["GeoPotential_Height"] = gph[0]["GeoPotential_Height"].values
        layers["Pressure"] = ECWMF_LEVELS[::-1]
        layers["Temperature"] = tmp[0]["Temperature"].values
        layers["Relative_Humidity"] = rh[0]["Relative_Humidity"].values

        # define the surface level
        df = pandas.DataFrame(
            {
                "GeoPotential_Height": sfc_hgt[0],
                "Pressure": sfc_prs[0],
                "Temperature": kelvin_2_celcius(t2m[0]),
                "Relative_Humidity": sfc_rh,
            },
            index=[0],
        )

        # MODTRAN requires the height to be ascending
        # and the pressure to be descending
        wh = ((layers["GeoPotential_Height"] > sfc_hgt[0]) &
              (layers["Pressure"] < sfc_prs[0].round()))
        df = df.append(layers[wh])
        df.reset_index(drop=True, inplace=True)

        dname = ppjoin(pnt, DatasetName.ATMOSPHERIC_PROFILE.value)
        write_dataframe(df, dname, fid, compression, attrs=attrs,
                        filter_opts=filter_opts)

        fid[pnt].attrs["lonlat"] = lonlat

    if out_group is None:
        return fid
def run_modtran(acquisitions, atmospherics_group, workflow, npoints, point,
                albedos, modtran_exe, basedir, out_group,
                compression=H5CompressionFilter.LZF, filter_opts=None):
    """
    Run MODTRAN and return the flux and channel results.
    """
    lonlat = atmospherics_group[POINT_FMT.format(p=point)].attrs['lonlat']

    # determine the output group/file
    if out_group is None:
        fid = h5py.File('atmospheric-results.h5', 'w', driver='core',
                        backing_store=False)
    else:
        fid = out_group

    # initial attributes
    base_attrs = {
        'Point': point,
        'lonlat': lonlat,
        'datetime': acquisitions[0].acquisition_datetime
    }

    base_path = ppjoin(GroupName.ATMOSPHERIC_RESULTS_GRP.value,
                       POINT_FMT.format(p=point))

    # what atmospheric calculations have been run and how many points
    group_name = GroupName.ATMOSPHERIC_RESULTS_GRP.value
    if group_name not in fid:
        fid.create_group(group_name)

    fid[group_name].attrs['npoints'] = npoints
    applied = workflow == Workflow.STANDARD or workflow == Workflow.NBAR
    fid[group_name].attrs['nbar_atmospherics'] = applied
    applied = workflow == Workflow.STANDARD or workflow == Workflow.SBT
    fid[group_name].attrs['sbt_atmospherics'] = applied

    acqs = acquisitions

    for albedo in albedos:
        base_attrs['Albedo'] = albedo.value
        workpath = pjoin(basedir, POINT_FMT.format(p=point),
                         ALBEDO_FMT.format(a=albedo.value))
        group_path = ppjoin(base_path, ALBEDO_FMT.format(a=albedo.value))

        subprocess.check_call([modtran_exe], cwd=workpath)
        chn_fname = glob.glob(pjoin(workpath, '*.chn'))[0]

        if albedo == Albedos.ALBEDO_TH:
            acq = [acq for acq in acqs if
                   acq.band_type == BandType.THERMAL][0]

            channel_data = read_modtran_channel(chn_fname, acq, albedo)

            # upward radiation
            attrs = base_attrs.copy()
            dataset_name = DatasetName.UPWARD_RADIATION_CHANNEL.value
            attrs['description'] = ('Upward radiation channel output from '
                                    'MODTRAN')
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data[0], dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

            # downward radiation
            attrs = base_attrs.copy()
            dataset_name = DatasetName.DOWNWARD_RADIATION_CHANNEL.value
            attrs['description'] = ('Downward radiation channel output from '
                                    'MODTRAN')
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data[1], dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)
        else:
            acq = [acq for acq in acqs if
                   acq.band_type == BandType.REFLECTIVE][0]

            flux_fname = glob.glob(pjoin(workpath, '*_b.flx'))[0]
            flux_data, altitudes = read_modtran_flux(flux_fname)
            channel_data = read_modtran_channel(chn_fname, acq, albedo)

            # output the flux data
            attrs = base_attrs.copy()
            dset_name = ppjoin(group_path, DatasetName.FLUX.value)
            attrs['description'] = 'Flux output from MODTRAN'
            write_dataframe(flux_data, dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

            # output the altitude data
            attrs = base_attrs.copy()
            attrs['description'] = 'Altitudes output from MODTRAN'
            attrs['altitude_levels'] = altitudes.shape[0]
            attrs['units'] = 'km'
            dset_name = ppjoin(group_path, DatasetName.ALTITUDES.value)
            write_dataframe(altitudes, dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

            # accumulate the solar irradiance
            transmittance = albedo == Albedos.ALBEDO_T
            response = acq.spectral_response()
            accumulated = calculate_solar_radiation(flux_data, response,
                                                    altitudes.shape[0],
                                                    transmittance)

            attrs = base_attrs.copy()
            dset_name = ppjoin(group_path, DatasetName.SOLAR_IRRADIANCE.value)
            description = ("Accumulated solar irradiation for point {} "
                           "and albedo {}.")
            attrs['description'] = description.format(point, albedo.value)
            write_dataframe(accumulated, dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

            # channel output
            attrs = base_attrs.copy()
            dataset_name = DatasetName.CHANNEL.value
            attrs['description'] = 'Channel output from MODTRAN'
            dset_name = ppjoin(group_path, dataset_name)
            write_dataframe(channel_data, dset_name, fid, compression,
                            attrs=attrs, filter_opts=filter_opts)

    # metadata for a given point
    alb_vals = [alb.value for alb in workflow.albedos]
    fid[base_path].attrs['lonlat'] = lonlat
    fid[base_path].attrs['datetime'] = acqs[0].acquisition_datetime.isoformat()
    fid[base_path].attrs.create('albedos', data=alb_vals, dtype=VLEN_STRING)

    if out_group is None:
        return fid