async def create_wm_tile_set_job(
    dataset: str,
    version: str,
    creation_options: RasterTileSetSourceCreationOptions,
    job_name: str,
    parents: Optional[List[Job]] = None,
    use_resampler: bool = False,
) -> Tuple[Job, str]:
    asset_uri = get_asset_uri(
        dataset,
        version,
        AssetType.raster_tile_set,
        creation_options.dict(by_alias=True),
        "epsg:3857",
    )

    # Create an asset record
    asset_options = AssetCreateIn(
        asset_type=AssetType.raster_tile_set,
        asset_uri=asset_uri,
        is_managed=True,
        creation_options=creation_options,
        metadata=RasterTileSetMetadata(),
    ).dict(by_alias=True)

    wm_asset_record = await create_asset(dataset, version, **asset_options)
    logger.debug(f"Created asset for {asset_uri}")

    zoom_level = int(creation_options.grid.strip("zoom_"))

    # TODO: Consider removing the use_resampler argument and changing this
    #  to "if creation_options.calc is None:"
    #  Make sure to test different scenarios when done!
    if use_resampler:
        job = await create_resample_job(
            dataset,
            version,
            creation_options,
            zoom_level,
            job_name,
            callback_constructor(wm_asset_record.asset_id),
            parents=parents,
        )
    else:
        job = await create_pixetl_job(
            dataset,
            version,
            creation_options,
            job_name,
            callback_constructor(wm_asset_record.asset_id),
            parents=parents,
        )

    job = scale_batch_job(job, zoom_level)

    return job, asset_uri
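# Illustrative sketch (not part of the pipeline): how create_wm_tile_set_job
# might be invoked for a single zoom level. All names and option values below
# are hypothetical; a real caller would build the creation options from the
# source asset's options with the grid set to the target web-mercator zoom.
async def _example_create_wm_tile_set() -> Tuple[Job, str]:
    example_co = RasterTileSetSourceCreationOptions(
        pixel_meaning="example",  # hypothetical
        data_type=DataType.uint8,
        band_count=1,
        no_data=None,
        resampling=ResamplingMethod.nearest,
        grid=Grid("zoom_12"),
        compute_stats=False,
        compute_histogram=False,
        source_type=RasterSourceType.raster,
        source_driver=RasterDrivers.geotiff,
        source_uri=["s3://example-bucket/example/v1/raster/epsg-4326/tiles.geojson"],
    )
    # With no calc string, the resampler can be used directly (see TODO above)
    return await create_wm_tile_set_job(
        "example_dataset",
        "v1",
        example_co,
        job_name="create_zoom_12_tile_set",
        use_resampler=True,
    )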
async def static_vector_1x1_asset(
    dataset: str,
    version: str,
    asset_id: UUID,
    input_data: Dict[str, Any],
) -> ChangeLog:
    """Export the vector dataset as a 1x1 degree grid file."""

    #######################
    # Update asset metadata
    #######################

    creation_options = creation_option_factory(
        AssetType.grid_1x1, input_data["creation_options"]
    )

    field_attributes: List[Dict[str, Any]] = await get_field_attributes(
        dataset, version, creation_options
    )

    grid_1x1_uri = get_asset_uri(dataset, version, AssetType.grid_1x1)

    await assets.update_asset(
        asset_id,
        fields=field_attributes,
    )

    ############################
    # Define jobs
    ############################

    # Build the export command from the dataset's field attributes
    command: List[str] = [
        "export_1x1_grid.sh",
        "-d",
        dataset,
        "-v",
        version,
        "-C",
        ",".join([field["field_name"] for field in field_attributes]),
        "-T",
        grid_1x1_uri,
    ]

    export_1x1_grid = PostgresqlClientJob(
        dataset=dataset,
        job_name="export_1x1_grid",
        job_queue=DATA_LAKE_JOB_QUEUE,
        command=command,
        memory=9000,
        environment=reader_secrets,
        callback=callback_constructor(asset_id),
    )

    #######################
    # Execute jobs
    #######################

    log: ChangeLog = await execute([export_1x1_grid])

    return log
async def test_batch_failure():
    dataset = "test"
    version = "v1.1.1"
    creation_options = {
        "source_type": "vector",
        "source_uri": [f"s3://{BUCKET}/{GEOJSON_NAME}"],
        "source_driver": "GeoJSON",
        "zipped": False,
    }

    async with ContextEngine("WRITE"):
        await datasets.create_dataset(dataset)
        await versions.create_version(dataset, version)
        new_asset = await assets.create_asset(
            dataset,
            version,
            asset_type="Database table",
            asset_uri="s3://path/to/file",
            creation_options=creation_options,
        )

    job_env = writer_secrets + [
        {"name": "STATUS_URL", "value": f"http://app_test:{PORT}/tasks"}
    ]
    callback = callback_constructor(new_asset.asset_id)

    # Can't have two parents with the same name
    job1 = PostgresqlClientJob(
        dataset=dataset,
        job_name="job1",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
    )
    job2 = PostgresqlClientJob(
        dataset=dataset,
        job_name="job1",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
    )
    job3 = PostgresqlClientJob(
        dataset=dataset,
        job_name="job3",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
        parents=[job1.job_name, job2.job_name],
    )

    message = ""
    try:
        await execute([job1, job2, job3])
    except TooManyRetriesError as e:
        message = str(e)

    assert message == ""
async def raster_tile_set_asset(
    dataset: str,
    version: str,
    asset_id: UUID,
    input_data: Dict[str, Any],
) -> ChangeLog:

    # If being created as a source (default) asset, creation_options["source_uri"]
    # will be a list. When being created as an auxiliary asset, it will be None.
    # In the latter case we will generate one for pixETL based on the default
    # asset, below.
    co = deepcopy(input_data["creation_options"])

    source_uris: Optional[List[str]] = co.get("source_uri")
    if source_uris is None:
        default_asset: ORMAsset = await get_default_asset(dataset, version)

        if default_asset.creation_options["source_type"] == RasterSourceType.raster:
            co["source_type"] = RasterSourceType.raster
            co["source_uri"] = [tile_uri_to_tiles_geojson(default_asset.asset_uri)]
            co["source_driver"] = RasterDrivers.geotiff

            auxiliary_assets = co.pop("auxiliary_assets", None)
            if auxiliary_assets:
                for aux_asset_id in auxiliary_assets:
                    auxiliary_asset: ORMAsset = await get_asset(aux_asset_id)
                    co["source_uri"].append(
                        tile_uri_to_tiles_geojson(auxiliary_asset.asset_uri)
                    )
        elif default_asset.creation_options["source_type"] == VectorSourceType.vector:
            co["source_type"] = VectorSourceType.vector

    creation_options = PixETLCreationOptions(**co)

    callback: Callback = callback_constructor(asset_id)

    create_raster_tile_set_job: Job = await create_pixetl_job(
        dataset, version, creation_options, "create_raster_tile_set", callback
    )

    log: ChangeLog = await execute([create_raster_tile_set_job])

    return log
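# Illustrative sketch (hypothetical values): the minimal shape of input_data this
# function consumes when creating an auxiliary raster tile set. "source_uri" is
# deliberately omitted so that it is derived from the default asset above; any
# remaining keys are validated by PixETLCreationOptions and are not shown here.
_EXAMPLE_AUXILIARY_RASTER_INPUT_DATA: Dict[str, Any] = {
    "creation_options": {
        "pixel_meaning": "intensity",  # hypothetical
        "grid": "zoom_14",  # hypothetical
        "auxiliary_assets": [],  # optional extra asset IDs merged into source_uri
    }
}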
async def test_batch_scheduler(batch_client, httpd):
    _, logs = batch_client
    httpd_port = httpd.server_port

    ############################
    # Setup test
    ############################

    job_env = writer_secrets + [
        {"name": "STATUS_URL", "value": f"http://app_test:{httpd_port}/tasks"}
    ]

    batch.POLL_WAIT_TIME = 1

    dataset = "test"
    version = "v1.1.1"
    input_data = {
        "source_type": "vector",
        "source_uri": [f"s3://{BUCKET}/{GEOJSON_NAME}"],
        "creation_options": {"src_driver": "GeoJSON", "zipped": False},
        "metadata": {},
    }

    new_asset = await create_asset(
        dataset, version, "Database table", "s3://path/to/file", input_data
    )
    callback = callback_constructor(new_asset.asset_id)

    ############################
    # Test if mocking batch jobs using the different environments works
    ############################

    job1 = PostgresqlClientJob(
        job_name="job1",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        callback=callback,
    )
    job2 = GdalPythonImportJob(
        job_name="job2",
        command=[
            "test_mock_s3_ogr2ogr.sh",
            "-d",
            "test",
            "-v",
            "v1.0.0",
            "-s",
            f"s3://{BUCKET}/{GEOJSON_NAME}",
            "-l",
            "test",
            "-f",
            GEOJSON_NAME,
        ],
        environment=job_env,
        parents=[job1.job_name],
        callback=callback,
    )
    job3 = GdalPythonExportJob(
        job_name="job3",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        parents=[job2.job_name],
        callback=callback,
    )
    job4 = TileCacheJob(
        job_name="job4",
        command=["test_mock_s3_awscli.sh", "-s", f"s3://{BUCKET}/{GEOJSON_NAME}"],
        environment=job_env,
        parents=[job3.job_name],
        callback=callback,
    )

    log = await batch.execute([job1, job2, job3, job4])
    assert log.status == "pending"

    tasks_rows = await tasks.get_tasks(new_asset.asset_id)
    task_ids = [str(task.task_id) for task in tasks_rows]

    # Make sure all jobs completed
    status = await poll_jobs(task_ids)
    assert status == "saved"

    check_callbacks(task_ids, httpd.server_port)
def test_jobs_model():
    callback = callback_constructor(uuid4())

    job = Job(
        dataset="test",
        job_name="test",
        job_queue="test",
        job_definition="test",
        command=["1"],
        environment=[{"name": "TEST", "value": "TEST"}],
        vcpus=1,
        memory=2,
        attempts=1,
        attempt_duration_seconds=1,
        parents=None,
        callback=callback,
    )

    # CORES and MAX_MEM are derived from vcpus and memory and kept in sync
    assert job.environment == [
        {"name": "TEST", "value": "TEST"},
        {"name": "CORES", "value": "1"},
        {"name": "MAX_MEM", "value": "2"},
    ]

    job.vcpus = 45
    assert job.environment == [
        {"name": "TEST", "value": "TEST"},
        {"name": "CORES", "value": "45"},
        {"name": "MAX_MEM", "value": "2"},
    ]

    job.memory = 100
    assert job.environment == [
        {"name": "TEST", "value": "TEST"},
        {"name": "CORES", "value": "45"},
        {"name": "MAX_MEM", "value": "100"},
    ]
async def raster_tile_cache_asset(
    dataset: str,
    version: str,
    asset_id: UUID,
    input_data: Dict[str, Any],
) -> ChangeLog:
    """Generate Raster Tile Cache Assets."""

    # TODO: Refactor to be easier to test

    min_zoom = input_data["creation_options"]["min_zoom"]
    max_zoom = input_data["creation_options"]["max_zoom"]
    max_static_zoom = input_data["creation_options"]["max_static_zoom"]
    implementation = input_data["creation_options"]["implementation"]
    symbology = input_data["creation_options"]["symbology"]
    resampling = input_data["creation_options"]["resampling"]

    # source_asset_id is currently required. Could perhaps make it optional
    # in the case that the default asset is the only one.
    source_asset: ORMAsset = await get_asset(
        input_data["creation_options"]["source_asset_id"]
    )

    # Get the creation options from the original raster tile set asset and
    # overwrite settings. Make sure source_type and source_driver are set in
    # case it is an auxiliary asset.
    new_source_uri = [
        tile_uri_to_tiles_geojson(
            get_asset_uri(
                dataset,
                version,
                AssetType.raster_tile_set,
                source_asset.creation_options,
            )
        ).replace("/geotiff", "/gdal-geotiff")
    ]

    # The first thing we do for each zoom level is reproject the source asset
    # to web-mercator. We don't want the calc string (if any) used to
    # create the source asset to be applied again to the already transformed
    # data, so set it to None.
    source_asset_co = RasterTileSetSourceCreationOptions(
        # TODO: With Python 3.9, we can use the `|` operator here
        #  waiting for https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker/pull/67
        **{
            **source_asset.creation_options,
            **{
                "source_type": RasterSourceType.raster,
                "source_driver": RasterDrivers.geotiff,
                "source_uri": new_source_uri,
                "calc": None,
                "resampling": resampling,
                "compute_stats": False,
                "compute_histogram": False,
                "symbology": Symbology(**symbology),
                "subset": None,
            },
        }
    )

    # If float data type, convert to int in derivative assets for performance
    # FIXME: Make this work for multi-band inputs
    max_zoom_calc = None
    if source_asset_co.data_type == DataType.boolean:
        pass  # So the next line doesn't break
    elif np.issubdtype(np.dtype(source_asset_co.data_type), np.floating):
        logger.info("Source datatype is float subtype, converting to int")
        source_asset_co, max_zoom_calc = convert_float_to_int(
            source_asset.stats, source_asset_co
        )

    assert source_asset_co.symbology is not None
    symbology_function = symbology_constructor[source_asset_co.symbology.type].function

    # We want to make sure that the final RGB asset is named after the
    # implementation of the tile cache and that the source_asset name is not
    # already used by another intermediate asset.
    # TODO: Actually make sure the intermediate assets aren't going to
    #  overwrite any existing assets
    if symbology_function == no_symbology:
        source_asset_co.pixel_meaning = implementation
    else:
        source_asset_co.pixel_meaning = (
            f"{source_asset_co.pixel_meaning}_{implementation}"
        )

    job_list: List[Job] = []
    jobs_dict: Dict[int, Dict[str, Job]] = dict()

    for zoom_level in range(max_zoom, min_zoom - 1, -1):
        jobs_dict[zoom_level] = dict()

        if zoom_level == max_zoom:
            source_reprojection_parent_jobs: List[Job] = []
        else:
            source_reprojection_parent_jobs = [
                jobs_dict[zoom_level + 1]["source_reprojection_job"]
            ]

        (
            source_reprojection_job,
            source_reprojection_uri,
        ) = await reproject_to_web_mercator(
            dataset,
            version,
            source_asset_co,
            zoom_level,
            max_zoom,
            source_reprojection_parent_jobs,
            max_zoom_resampling=PIXETL_DEFAULT_RESAMPLING,
            max_zoom_calc=max_zoom_calc,
            use_resampler=max_zoom_calc is None,
        )
        jobs_dict[zoom_level]["source_reprojection_job"] = source_reprojection_job
        job_list.append(source_reprojection_job)

        symbology_jobs: List[Job]
        symbology_uri: str

        symbology_co = source_asset_co.copy(deep=True)
        symbology_jobs, symbology_uri = await symbology_function(
            dataset,
            version,
            implementation,
            symbology_co,
            zoom_level,
            max_zoom,
            jobs_dict,
        )
        job_list += symbology_jobs

        bit_depth: int = symbology_constructor[source_asset_co.symbology.type].bit_depth

        if zoom_level <= max_static_zoom:
            tile_cache_job: Job = await create_tile_cache(
                dataset,
                version,
                symbology_uri,
                zoom_level,
                implementation,
                callback_constructor(asset_id),
                [*symbology_jobs, source_reprojection_job],
                bit_depth,
            )
            job_list.append(tile_cache_job)

    log: ChangeLog = await execute(job_list)

    return log
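# Illustrative sketch (hypothetical values): the creation_options keys read by
# raster_tile_cache_asset above. The symbology payload varies by symbology type,
# and source_asset_id must reference an existing raster tile set asset.
_EXAMPLE_TILE_CACHE_CREATION_OPTIONS: Dict[str, Any] = {
    "min_zoom": 0,
    "max_zoom": 14,
    "max_static_zoom": 9,
    "implementation": "default",
    "symbology": {"type": "date_conf_intensity"},  # hypothetical symbology type
    "resampling": "average",
    "source_asset_id": "11111111-2222-3333-4444-555555555555",  # hypothetical UUID
}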
async def _merge_assets(
    dataset: str,
    version: str,
    pixel_meaning: str,
    asset1_uri: str,
    asset2_uri: str,
    zoom_level: int,
    parents: List[Job],
    calc_str: str = "np.ma.array([A, B, C, D])",
    band_count: int = 4,
) -> Tuple[List[Job], str]:
    """Create an RGBA-encoded raster tile set from two source assets,
    potentially using a custom merge function (the default works for 3+1 band
    sources, such as RGB + Intensity as Alpha)."""

    encoded_co = RasterTileSetSourceCreationOptions(
        pixel_meaning=pixel_meaning,
        data_type=DataType.uint8,  # FIXME: Revisit for 16-bit assets
        band_count=band_count,
        no_data=None,
        resampling=ResamplingMethod.nearest,
        grid=Grid(f"zoom_{zoom_level}"),
        compute_stats=False,
        compute_histogram=False,
        source_type=RasterSourceType.raster,
        source_driver=RasterDrivers.geotiff,
        source_uri=[asset1_uri, asset2_uri],
        calc=calc_str,
        photometric=PhotometricType.rgb,
    )

    asset_uri = get_asset_uri(
        dataset,
        version,
        AssetType.raster_tile_set,
        encoded_co.dict(by_alias=True),
        "epsg:3857",
    )

    logger.debug(
        f"ATTEMPTING TO CREATE MERGED ASSET WITH THESE CREATION OPTIONS: {encoded_co}"
    )

    # Create an asset record
    asset_options = AssetCreateIn(
        asset_type=AssetType.raster_tile_set,
        asset_uri=asset_uri,
        is_managed=True,
        creation_options=encoded_co,
        metadata=RasterTileSetMetadata(),
    ).dict(by_alias=True)

    asset = await create_asset(dataset, version, **asset_options)
    logger.debug(
        f"ZOOM LEVEL {zoom_level} MERGED ASSET CREATED WITH ASSET_ID {asset.asset_id}"
    )

    callback = callback_constructor(asset.asset_id)

    pixetl_job = await create_pixetl_job(
        dataset,
        version,
        encoded_co,
        job_name=f"merge_assets_zoom_{zoom_level}",
        callback=callback,
        parents=parents,
    )

    pixetl_job = scale_batch_job(pixetl_job, zoom_level)

    return (
        [pixetl_job],
        tile_uri_to_tiles_geojson(asset_uri),
    )
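# Illustrative sketch of what the default calc string evaluates to, assuming
# pixETL exposes the first asset's three bands as A, B, C and the second
# asset's single band as D (an assumption based on the docstring above):
# np.ma.array([A, B, C, D]) stacks them into one 4-band (RGBA) masked array.
# Standalone numpy example with toy 1x2 bands:
#
#     import numpy as np
#
#     A = np.ma.masked_array([[255, 0]])   # red band of asset 1
#     B = np.ma.masked_array([[128, 64]])  # green band of asset 1
#     C = np.ma.masked_array([[0, 255]])   # blue band of asset 1
#     D = np.ma.masked_array([[200, 0]])   # intensity band of asset 2, used as alpha
#
#     rgba = np.ma.array([A, B, C, D])
#     assert rgba.shape == (4, 1, 2)  # bands stacked along the first axis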
async def _create_colormapped_asset(
    dataset: str,
    version: str,
    pixel_meaning: str,
    source_asset_co: RasterTileSetSourceCreationOptions,
    zoom_level: int,
    jobs_dict: Dict,
) -> Tuple[List[Job], str]:
    wm_source_co = source_asset_co.copy(
        deep=True, update={"grid": f"zoom_{zoom_level}"}
    )

    wm_source_uri: str = tile_uri_to_tiles_geojson(
        get_asset_uri(
            dataset,
            version,
            AssetType.raster_tile_set,
            wm_source_co.dict(by_alias=True),
            "epsg:3857",
        )
    )

    colormap_co = wm_source_co.copy(
        deep=True,
        update={
            "source_uri": [wm_source_uri],
            "calc": None,
            "resampling": PIXETL_DEFAULT_RESAMPLING,
            "pixel_meaning": pixel_meaning,
        },
    )

    colormap_asset_uri = get_asset_uri(
        dataset,
        version,
        AssetType.raster_tile_set,
        colormap_co.dict(by_alias=True),
        "epsg:3857",
    )

    # Create an asset record
    colormap_asset_model = AssetCreateIn(
        asset_type=AssetType.raster_tile_set,
        asset_uri=colormap_asset_uri,
        is_managed=True,
        creation_options=colormap_co,
    ).dict(by_alias=True)

    colormap_asset_record = await create_asset(dataset, version, **colormap_asset_model)
    logger.debug(
        f"Created asset record for {colormap_asset_uri} "
        f"with creation options: {colormap_co}"
    )

    parents = [jobs_dict[zoom_level]["source_reprojection_job"]]
    job_name = sanitize_batch_job_name(
        f"{dataset}_{version}_{pixel_meaning}_{zoom_level}"
    )

    # Apply the colormap
    gdaldem_job = await create_gdaldem_job(
        dataset,
        version,
        colormap_co,
        job_name,
        callback_constructor(colormap_asset_record.asset_id),
        parents=parents,
    )
    gdaldem_job = scale_batch_job(gdaldem_job, zoom_level)

    return [gdaldem_job], colormap_asset_uri