def test_timing_logger_context_return():
    """The context manager yields the TimingLogger itself, so `elapsed` is reachable via both references."""
    log_collector = _Logger()
    timing_logger = TimingLogger("Testing", logger=log_collector)
    timing_logger._now = _fake_clock([
        datetime(2019, 12, 12, 10, 10, 10, 0),
        datetime(2019, 12, 12, 11, 12, 13, 0),
    ])
    with timing_logger as timer:
        log_collector("Hello world")
    # 1h 2m 3s between the two fake clock ticks == 3723 seconds.
    expected_seconds = 3723
    assert timing_logger.elapsed.total_seconds() == expected_seconds
    assert timer.elapsed.total_seconds() == expected_seconds
def test_timing_logger_custom():
    """With a custom logger callable, start/end messages are collected in order around the payload."""
    log_collector = _Logger()
    timing_logger = TimingLogger("Testing", logger=log_collector)
    timing_logger._now = _fake_clock([
        datetime(2019, 12, 12, 10, 10, 10, 10000),
        datetime(2019, 12, 12, 11, 12, 13, 14141),
    ])
    with timing_logger:
        log_collector("Hello world")
    expected_logs = [
        "Testing: start 2019-12-12 10:10:10.010000",
        "Hello world",
        "Testing: end 2019-12-12 11:12:13.014141, elapsed 1:02:03.004141",
    ]
    assert log_collector.logs == expected_logs
def main(argv=None):
    """CLI entry point: parse arguments, evaluate the process graph and emit/save the result.

    :param argv: optional argument list; handed to `handle_cli` (which falls back to sys.argv)
    """
    logging.basicConfig(level=logging.INFO)
    process_graph, args = handle_cli(argv)
    _log.info("Evaluating process graph: {pg}".format(pg=safe_repr(process_graph)))
    _setup_local_spark(print=_log.info)

    # Local imports to workaround the pyspark import issues.
    from openeo_driver.ProcessGraphDeserializer import evaluate
    from openeogeotrellis.backend import GeoPySparkBackendImplementation

    env = EvalEnv({
        "version": args.api_version,
        "pyramid_levels": "highest",
        "user": None,  # TODO
        "require_bounds": True,
        "correlation_id": f"cli-pid{os.getpid()}",
        "backend_implementation": GeoPySparkBackendImplementation(use_zookeeper=False),
    })
    with TimingLogger(title="Evaluate process graph", logger=_log):
        result = evaluate(process_graph, env=env)

    # Dispatch on result type with guard-clause returns.
    if isinstance(result, ImageCollectionResult):
        filename = args.output or f"result.{result.format}"
        with TimingLogger(title=f"Saving result to {filename!r}", logger=_log):
            result.save_result(filename)
        return
    if isinstance(result, JSONResult):
        if args.output:
            with open(args.output, "w") as f:
                json.dump(result.prepare_for_json(), f)
        else:
            print(result.prepare_for_json())
        return
    if isinstance(result, dict):
        # TODO: support storing JSON result to file
        print(result)
        return
    # TODO: support more result types
    raise ValueError(result)
def test_timing_logger_fail():
    """When the managed body raises, the TimingLogger logs a 'fail' line instead of 'end'."""
    log_collector = _Logger()
    timing_logger = TimingLogger("Testing", logger=log_collector)
    timing_logger._now = _fake_clock([
        datetime(2019, 12, 12, 10, 10, 10, 10000),
        datetime(2019, 12, 12, 11, 12, 13, 14141),
    ])
    # The exception must propagate out of the context manager; swallow it here.
    try:
        with timing_logger:
            raise ValueError("Hello world")
    except ValueError:
        pass
    expected_logs = [
        "Testing: start 2019-12-12 10:10:10.010000",
        "Testing: fail 2019-12-12 11:12:13.014141, elapsed 1:02:03.004141",
    ]
    assert log_collector.logs == expected_logs
def _debug_show_rdd_info(self, rdd):
    """Log debug statistics for the given RDD of feature records.

    Records are expected to be dicts with a "key" dict (having "col"/"row"/"instant")
    and a "feature" dict (having an "id").

    NOTE: this triggers multiple Spark actions (`count`/`distinct`/`collect`),
    so it is expensive — debugging use only.

    :param rdd: RDD of feature record dicts (see above)
    """
    with TimingLogger(title="Collect RDD info", logger=logger):
        record_count = rdd.count()
        # Distinct values per layout-key dimension; the `collect` is eager inside
        # each comprehension iteration, so the lambda sees the intended `k`.
        key_ranges = {
            k: rdd.map(lambda f: f["key"][k]).distinct().collect()
            for k in ["col", "row", "instant"]
        }
        # Number of distinct product paths referenced by the features.
        paths = rdd.map(lambda f: f["feature"]["id"]).distinct().count()
        logger.info(
            f"RDD info: {record_count} records, {paths} creo paths, key_ranges: {key_ranges}"
        )
def test_timing_logger_basic(caplog):
    """TimingLogger with the default logger: start/end lines land in the captured root log."""
    caplog.set_level(logging.INFO)
    with TimingLogger("Testing"):
        logging.info("Hello world")
    # Expect a "start" line, the payload, then an "end ... elapsed" line, in order.
    expected_pattern = re.compile(r'''
        .*Testing.+start.+\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}.*\n
        .*Hello\ world.*\n
        .*Testing.*end.+\d{4}-\d{2}-\d{2}\ \d{2}:\d{2}:\d{2}.*elapsed.+[0-9.]+\n
        .*
    ''', re.VERBOSE | re.DOTALL)
    captured = caplog.text
    assert expected_pattern.match(captured)
def _creodias_dem_subset_srtm_hgt_unzip(
        bbox: Tuple, bbox_epsg: int, srtm_root="/eodata/auxdata/SRTMGL1/dem"
) -> tempfile.TemporaryDirectory:
    """
    Create subset of Creodias SRTM hgt files covering the given lon-lat bbox to pass to Orfeo
    obtained from unzipping the necessary .SRTMGL1.hgt.zip files
    at /eodata/auxdata/SRTMGL1/dem/ (e.g. N50E003.SRTMGL1.hgt.zip)

    :param bbox: (xmin, ymin, xmax, ymax) bounding box, expressed in `bbox_epsg` coordinates
    :param bbox_epsg: EPSG code of the bbox coordinates
    :param srtm_root: root folder containing the zipped SRTM tiles
    :return: tempfile.TemporaryDirectory to be used as context manager (for automatic cleanup)
    """
    import math

    # Get range of lon-lat tiles to cover
    to_lonlat = pyproj.Transformer.from_crs(crs_from=bbox_epsg, crs_to=4326, always_xy=True)
    bbox_lonlat = shapely.ops.transform(to_lonlat.transform, shapely.geometry.box(*bbox)).bounds
    # SRTM tile names encode the floor of the coordinate (tile N50E003 covers lat [50,51] x lon [3,4]).
    # Use math.floor, not int(): int() truncates toward zero, which selects the wrong
    # tile for negative (southern/western hemisphere) coordinates, e.g. int(-3.5) == -3
    # would pick W003 ([-3,-2]) instead of the correct W004 ([-4,-3]).
    lon_min, lat_min, lon_max, lat_max = [int(math.floor(b)) for b in bbox_lonlat]

    # Unzip to temp dir
    temp_dir = tempfile.TemporaryDirectory(suffix="-openeo-dem-srtm")
    msg = f"Unzip SRTM tiles from {srtm_root} in range lon [{lon_min}:{lon_max}] x lat [{lat_min}:{lat_max}] to {temp_dir}"
    with TimingLogger(title=msg, logger=logger):
        for lon in range(lon_min, lon_max + 1):
            for lat in range(lat_min, lat_max + 1):
                # Something like: N50E003.SRTMGL1.hgt.zip"
                basename = "{ns}{lat:02d}{ew}{lon:03d}.SRTMGL1.hgt".format(
                    ew="E" if lon >= 0 else "W", lon=abs(lon),
                    ns="N" if lat >= 0 else "S", lat=abs(lat))
                zip_filename = pathlib.Path(srtm_root) / (basename + '.zip')
                with zipfile.ZipFile(zip_filename, 'r') as z:
                    logger.info(f"{zip_filename}: {z.infolist()}")
                    z.extractall(temp_dir.name)
    return temp_dir
def _creo_scan_for_band_tiffs(creo_path: pathlib.Path, log_prefix: str) -> Dict[str, pathlib.Path]:
    """
    Scan given creodias path for TIFF files

    :param creo_path: path to product root folder
    :param log_prefix: prefix for logging
    :return: dictionary mapping band name (vv, vh, ...) to tiff path
    """
    with TimingLogger(title=f"{log_prefix} Scan {creo_path}", logger=logger):
        # We expect the desired geotiff files under `creo_path` at location like
        # measurements/s1a-iw-grd-vh-20200606t063717-20200606t063746-032893-03cf5f-002.tiff
        # TODO Get tiff path from manifest instead of assuming this `measurement` file structure?
        band_regex = re.compile(r"^s1[ab]-iw-grd-([hv]{2})-", flags=re.IGNORECASE)
        candidates = (
            (band_regex.match(tiff.name), tiff)
            for tiff in creo_path.glob("measurement/*.tiff")
        )
        band_tiffs = {
            match.group(1).lower(): tiff
            for match, tiff in candidates
            if match
        }
        if not band_tiffs:
            raise OpenEOApiException("No tiffs found")
        logger.info(f"{log_prefix} Detected band tiffs: {band_tiffs}")
        return band_tiffs
def process_product(product: Tuple[str, List[dict]]):
    """Process a whole Creodias product through the Orfeo backscatter pipeline and split into tiles.

    Note: this is a closure — it reads `sar_backscatter_arguments`, `bands`, `tile_size`,
    `result_dtype`, `sar_calibration_lut`, `noise_removal`, `elev_geoid`, `elev_default`
    and `orfeo_memory` from the enclosing scope.

    :param product: (creo_path, features) tuple; each feature dict carries a layout
        "key" ({col,row,instant}), "key_epsg" and extent info.
    :return: list of (geopyspark.SpaceTimeKey, geopyspark.Tile) tuples
    """
    import faulthandler
    faulthandler.enable()
    creo_path, features = product
    # Short ad-hoc product id for logging purposes.
    prod_id = re.sub(r"[^A-Z0-9]", "", creo_path.upper())[-10:]
    log_prefix = f"p{os.getpid()}-prod{prod_id}"
    logger.info(f"{log_prefix} creo path {creo_path}")
    logger.info(
        f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}"
    )
    creo_path = pathlib.Path(creo_path)
    if not creo_path.exists():
        raise OpenEOApiException("Creo path does not exist")

    # Get whole extent of tile layout
    col_min = min(f["key"]["col"] for f in features)
    col_max = max(f["key"]["col"] for f in features)
    cols = col_max - col_min + 1
    row_min = min(f["key"]["row"] for f in features)
    row_max = max(f["key"]["row"] for f in features)
    rows = row_max - row_min + 1
    # All features of one product must share a single time instant.
    instants = set(f["key"]["instant"] for f in features)
    assert len(instants) == 1, f"Not single instant: {instants}"
    instant = instants.pop()
    logger.info(
        f"{log_prefix} Layout key extent: col[{col_min}:{col_max}] row[{row_min}:{row_max}]"
        f" ({cols}x{rows}={cols * rows} tiles) instant[{instant}].")

    layout_extent = get_total_extent(features)
    # All features must share a single CRS.
    key_epsgs = set(f["key_epsg"] for f in features)
    assert len(key_epsgs) == 1, f"Multiple key CRSs {key_epsgs}"
    layout_epsg = key_epsgs.pop()
    layout_width_px = tile_size * (col_max - col_min + 1)
    layout_height_px = tile_size * (row_max - row_min + 1)
    logger.info(
        f"{log_prefix} Layout extent {layout_extent} EPSG {layout_epsg}:"
        f" {layout_width_px}x{layout_height_px}px")

    band_tiffs = S1BackscatterOrfeo._creo_scan_for_band_tiffs(
        creo_path, log_prefix)

    dem_dir_context = S1BackscatterOrfeo._get_dem_dir_context(
        sar_backscatter_arguments=sar_backscatter_arguments,
        extent=layout_extent,
        epsg=layout_epsg)

    msg = f"{log_prefix} Process {creo_path} "
    with TimingLogger(title=msg, logger=logger), dem_dir_context as dem_dir:
        # Allocate numpy array tile
        orfeo_bands = numpy.zeros(
            (len(bands), layout_height_px, layout_width_px),
            dtype=result_dtype)
        # One Orfeo pipeline run per band, over the full layout extent.
        for b, band in enumerate(bands):
            if band.lower() not in band_tiffs:
                raise OpenEOApiException(f"No tiff for band {band}")
            data, nodata = S1BackscatterOrfeoV2._orfeo_pipeline(
                input_tiff=band_tiffs[band.lower()],
                extent=layout_extent,
                extent_epsg=layout_epsg,
                dem_dir=dem_dir,
                extent_width_px=layout_width_px,
                extent_height_px=layout_height_px,
                sar_calibration_lut=sar_calibration_lut,
                noise_removal=noise_removal,
                elev_geoid=elev_geoid,
                elev_default=elev_default,
                log_prefix=f"{log_prefix}-{band}",
                orfeo_memory=orfeo_memory)
            orfeo_bands[b] = data

        if sar_backscatter_arguments.options.get("to_db", False):
            # TODO: keep this "to_db" shortcut feature or drop it
            # and require user to use standard openEO functionality (`apply` based conversion)?
            logger.info(
                f"{log_prefix} Converting backscatter intensity to decibel"
            )
            orfeo_bands = 10 * numpy.log10(orfeo_bands)

        # Split orfeo output in tiles
        logger.info(
            f"{log_prefix} Split {orfeo_bands.shape} in tiles of {tile_size}"
        )
        cell_type = geopyspark.CellType(orfeo_bands.dtype.name)
        tiles = []
        for c in range(col_max - col_min + 1):
            for r in range(row_max - row_min + 1):
                col = col_min + c
                row = row_min + r
                key = geopyspark.SpaceTimeKey(
                    col=col, row=row, instant=_instant_ms_to_day(instant))
                tile = orfeo_bands[:, r * tile_size:(r + 1) * tile_size,
                                   c * tile_size:(c + 1) * tile_size]
                # NOTE(review): `nodata` is the value returned by the *last* band's
                # pipeline call, and the comparison happens after the optional dB
                # conversion above (which would also transform nodata pixels) —
                # confirm this skip-empty-tile check is intended in both cases.
                if not (tile == nodata).all():
                    logger.info(
                        f"{log_prefix} Create Tile for key {key} from {tile.shape}"
                    )
                    tile = geopyspark.Tile(tile, cell_type, no_data_value=nodata)
                    tiles.append((key, tile))
        logger.info(
            f"{log_prefix} Layout extent split in {len(tiles)} tiles")
        return tiles
def process_feature(product: Tuple[str, List[dict]]):
    """Process each feature of a Creodias product through the Orfeo backscatter pipeline, one tile at a time.

    Note: this is a closure — it reads `sar_backscatter_arguments`, `bands`, `tile_size`,
    `result_dtype`, `sar_calibration_lut`, `noise_removal`, `elev_geoid` and
    `elev_default` from the enclosing scope.

    :param product: (creo_path, features) tuple; each feature dict carries a layout
        "key" ({col,row,instant}), "key_extent" and "key_epsg".
    :return: list of (geopyspark.SpaceTimeKey, geopyspark.Tile) tuples
    """
    import faulthandler
    faulthandler.enable()
    creo_path, features = product
    # Short ad-hoc product id for logging purposes.
    prod_id = re.sub(r"[^A-Z0-9]", "", creo_path.upper())[-10:]
    log_prefix = f"p{os.getpid()}-prod{prod_id}"
    # Fix: use the logger instead of bare `print`, consistent with `process_product`,
    # so this message reaches the executor logs.
    logger.info(f"{log_prefix} creo path {creo_path}")
    logger.info(
        f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}"
    )

    layout_extent = get_total_extent(features)
    # All features must share a single CRS.
    key_epsgs = set(f["key_epsg"] for f in features)
    assert len(key_epsgs) == 1, f"Multiple key CRSs {key_epsgs}"
    layout_epsg = key_epsgs.pop()

    dem_dir_context = S1BackscatterOrfeo._get_dem_dir_context(
        sar_backscatter_arguments=sar_backscatter_arguments,
        extent=layout_extent,
        epsg=layout_epsg)

    creo_path = pathlib.Path(creo_path)
    band_tiffs = S1BackscatterOrfeo._creo_scan_for_band_tiffs(
        creo_path, log_prefix)

    resultlist = []
    with dem_dir_context as dem_dir:
        for feature in features:
            col, row, instant = (feature["key"][k]
                                 for k in ["col", "row", "instant"])
            key_ext = feature["key_extent"]
            key_epsg = layout_epsg
            logger.info(
                f"{log_prefix} Feature creo path: {creo_path}, key {key_ext} (EPSG {key_epsg})"
            )
            logger.info(
                f"{log_prefix} sar_backscatter_arguments: {sar_backscatter_arguments!r}"
            )
            if not creo_path.exists():
                raise OpenEOApiException("Creo path does not exist")

            msg = f"{log_prefix} Process {creo_path} and load into geopyspark Tile"
            with TimingLogger(title=msg, logger=logger):
                # Allocate numpy array tile
                tile_data = numpy.zeros(
                    (len(bands), tile_size, tile_size), dtype=result_dtype)
                # One Orfeo pipeline run per band, over this feature's extent only.
                for b, band in enumerate(bands):
                    if band.lower() not in band_tiffs:
                        raise OpenEOApiException(f"No tiff for band {band}")
                    data, nodata = S1BackscatterOrfeo._orfeo_pipeline(
                        input_tiff=band_tiffs[band.lower()],
                        extent=key_ext,
                        extent_epsg=key_epsg,
                        dem_dir=dem_dir,
                        extent_width_px=tile_size,
                        extent_height_px=tile_size,
                        sar_calibration_lut=sar_calibration_lut,
                        noise_removal=noise_removal,
                        elev_geoid=elev_geoid,
                        elev_default=elev_default,
                        log_prefix=f"{log_prefix}-{band}")
                    tile_data[b] = data

                if sar_backscatter_arguments.options.get("to_db", False):
                    # TODO: keep this "to_db" shortcut feature or drop it
                    # and require user to use standard openEO functionality (`apply` based conversion)?
                    logger.info(
                        f"{log_prefix} Converting backscatter intensity to decibel"
                    )
                    tile_data = 10 * numpy.log10(tile_data)

                key = geopyspark.SpaceTimeKey(
                    row=row, col=col, instant=_instant_ms_to_day(instant))
                cell_type = geopyspark.CellType(tile_data.dtype.name)
                logger.info(
                    f"{log_prefix} Create Tile for key {key} from {tile_data.shape}"
                )
                tile = geopyspark.Tile(tile_data, cell_type, no_data_value=nodata)
                resultlist.append((key, tile))
    return resultlist
def _orfeo_pipeline(input_tiff: pathlib.Path, extent: dict, extent_epsg: int,
                    dem_dir: Union[str, None],
                    extent_width_px: int, extent_height_px: int,
                    sar_calibration_lut: str, noise_removal: bool,
                    elev_geoid: str, elev_default: float = None,
                    log_prefix: str = "", orfeo_memory: int = 512):
    """Run the Orfeo orthorectification/backscatter pipeline on a single band tiff.

    The Orfeo pipeline is executed in a child `Process` (it can segfault, which
    would otherwise take down this worker); the result is handed back through a
    shared-memory double array.

    :param input_tiff: path to the band geotiff to process
    :param extent: dict with at least "xmin" and "ymax" keys (in `extent_epsg` coordinates)
    :param extent_epsg: EPSG code of `extent` (expected to be a UTM CRS, see `utm_zone_from_epsg`)
    :param dem_dir: DEM directory for orthorectification (or None)
    :param extent_width_px: output width in pixels
    :param extent_height_px: output height in pixels
    :param sar_calibration_lut: SAR calibration look-up table identifier
    :param noise_removal: whether to apply noise removal
    :param elev_geoid: geoid file for elevation
    :param elev_default: default elevation value
    :param log_prefix: prefix for log messages
    :param orfeo_memory: memory (MB presumably — TODO confirm) passed to the pipeline configuration
    :return: (data, nodata) tuple: numpy array of pipeline output and the nodata value (0)
    """
    logger.info(f"{log_prefix} Input tiff {input_tiff}")

    utm_zone, utm_northhem = utm_zone_from_epsg(extent_epsg)
    logger.info(
        f"{log_prefix} extent {extent} (UTM {utm_zone}, EPSG {extent_epsg})"
    )

    # Optional hard memory cap, taken from the environment.
    max_total_memory_in_bytes = os.environ.get('PYTHON_MAX_MEMORY')
    if max_total_memory_in_bytes:
        set_max_memory(int(max_total_memory_in_bytes))

    with TimingLogger(
            title=f"{log_prefix} Orfeo processing pipeline on {input_tiff}",
            logger=logger):
        # Shared-memory buffer the child process writes its result into.
        arr = multiprocessing.Array(ctypes.c_double,
                                    extent_width_px * extent_height_px)

        ortho_rect = S1BackscatterOrfeo.configure_pipeline(
            dem_dir, elev_default, elev_geoid, input_tiff, log_prefix,
            noise_removal, orfeo_memory, sar_calibration_lut, utm_northhem,
            utm_zone)

        def run():
            # Executed in the child process.
            ortho_rect.SetParameterInt("outputs.sizex", extent_width_px)
            ortho_rect.SetParameterInt("outputs.sizey", extent_height_px)
            ortho_rect.SetParameterInt("outputs.ulx", int(extent["xmin"]))
            ortho_rect.SetParameterInt("outputs.uly", int(extent["ymax"]))
            ortho_rect.Execute()
            # ram = ortho_rect.PropagateRequestedRegion("io.out", myRegion)
            try:
                localdata = ortho_rect.GetImageAsNumpyArray('io.out')
                # NOTE(review): reshape order is (width, height); numpy image arrays are
                # conventionally (rows=height, cols=width) — confirm for non-square extents.
                np.copyto(
                    np.frombuffer(arr.get_obj()).reshape(
                        (extent_width_px, extent_height_px)),
                    localdata)
            except RuntimeError as e:
                # NOTE(review): the error is only logged; the parent still returns the
                # (zero-filled) shared buffer as if it were valid data — confirm intended.
                logger.error(
                    f"Error while running Orfeo toolbox. {input_tiff} {extent} EPSG {extent_epsg} {sar_calibration_lut}",
                    exc_info=True)

        p = Process(target=run, args=())
        p.start()
        p.join()
        # A segfaulting child exits with -SIGSEGV; log it (the buffer stays zero-filled).
        if (p.exitcode == -signal.SIGSEGV):
            logger.error(
                f"Segmentation fault while running Orfeo toolbox. {input_tiff} {extent} EPSG {extent_epsg} {sar_calibration_lut}"
            )

    # Reinterpret the shared buffer as a 2D array in the parent process.
    data = np.reshape(np.frombuffer(arr.get_obj()),
                      (extent_width_px, extent_height_px))
    logger.info(
        f"{log_prefix} Final orfeo pipeline result: shape {data.shape},"
        f" min {numpy.nanmin(data)}, max {numpy.nanmax(data)}")
    return data, 0
elif isinstance(result, ImageCollectionResult): result.imagecollection.download(output_file, bbox="", time="", format=result.format, **result.options) logger.info("wrote image collection to %s" % output_file) elif isinstance(result, JSONResult): with open(output_file, 'w') as f: json.dump(result.prepare_for_json(), f) logger.info("wrote JSON result to %s" % output_file) elif isinstance(result, MultipleFilesResult): result.reduce(output_file, delete_originals=True) logger.info("reduced %d files to %s" % (len(result.files), output_file)) else: with open(output_file, 'w') as f: json.dump(result, f) logger.info("wrote JSON result to %s" % output_file) except Exception as e: logger.exception("error processing batch job") user_facing_logger.exception("error processing batch job") raise e if __name__ == '__main__': _setup_app_logging() with TimingLogger("batch_job.py main", logger=logger): main(sys.argv)