def cli(slack_url, version: bool = False):
    """
    Check all dead-letter queues that the user is allowed to access
    """
    if version:
        click.echo(__version__)

    log = setup_logging()

    check_deadletter_queues(slack_url=slack_url, log=log)
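
# Hedged sketch of how an entry point like the one above is typically wired up
# with click; the command and option names here are illustrative assumptions,
# not taken from this repository.
@click.command("check-dead-queues-example")
@click.option("--slack-url", default=None, help="Slack webhook URL to notify")
@click.option("--version", is_flag=True, default=False)
def _example_cli(slack_url, version):
    # Delegates to the same helpers used above
    if version:
        click.echo(__version__)
    check_deadletter_queues(slack_url=slack_url, log=setup_logging())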
def cli(tile_string, workdir, s3_bucket, update_metadata, s3_path):
    """
    Example command:

    download-alos-palsar --tile-string 2020/N10E010 -w /tmp/download -s example-bucket -p alos_palsar_mosaic
    """
    log = setup_logging()

    # Strip the "s3://" scheme (if present) and trailing slashes before joining.
    # Note: str.lstrip("s3://") strips characters, not a prefix, so it would
    # mangle bucket names that start with "s", "3", ":" or "/".
    bucket = s3_bucket.rstrip("/")
    if bucket.startswith("s3://"):
        bucket = bucket[len("s3://"):]
    s3_destination = bucket + "/" + s3_path.rstrip("/")

    run_one(tile_string, Path(workdir), s3_destination, update_metadata, log)
def send_messages(
    idx: int,
    queue_name: str,
    max_workers: int = 2,
    limit: int = None,
    slack_url: str = None,
) -> None:
    """
    Publish a list of missing scenes to a specific queue and, once finished,
    optionally notify Slack of the result.

    :param limit: (int) optional limit of messages to be read from the report
    :param max_workers: (int) total number of pods used for the task. This number is used
        to split the number of scenes equally among the pods
    :param idx: (int) sequential index which will be used to define the range of scenes
        that the pod will work with
    :param queue_name: (str) queue to send the messages to
    :param slack_url: (str) optional Slack URL in case you want to send a Slack notification
    """
    log = setup_logging()

    latest_report = find_latest_report(
        report_folder_path=S3_BUCKET_PATH, not_contains="orphaned"
    )

    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")

    log.info(f"Limited: {int(limit) if limit else 'No limit'}")
    log.info(f"Number of workers: {max_workers}")

    files = read_report_missing_scenes(report_path=latest_report, limit=limit)

    log.info(f"Number of scenes found {len(files)}")
    log.info(f"Example scenes: {files[0:10]}")

    # Split scenes equally among the workers
    split_list_scenes = split_list_equally(
        list_to_split=files, num_inter_lists=int(max_workers)
    )

    # If the index is beyond the number of chunks, this extra pod isn't necessary
    if len(split_list_scenes) <= idx:
        log.warning(f"Worker {idx} Skipped!")
        sys.exit(0)

    log.info(f"Executing worker {idx}")
    messages = prepare_message(scene_paths=split_list_scenes[idx], log=log)

    queue = get_queue(queue_name=queue_name)

    batch = []
    failed = 0
    sent = 0
    error_list = []
    for message in messages:
        try:
            batch.append(message)
            if len(batch) == 10:
                publish_messages(queue=queue, messages=batch)
                batch = []
                sent += 10
        except Exception as exc:
            failed += 1
            error_list.append(exc)
            batch = []

    if len(batch) > 0:
        publish_messages(queue=queue, messages=batch)
        sent += len(batch)

    environment = "DEV" if "dev" in queue_name else "PDS"
    error_flag = ":red_circle:" if failed > 0 else ""

    message = dedent(
        f"{error_flag}*Sentinel 2 GAP Filler - {environment}*\n"
        f"Sent Messages: {sent}\n"
        f"Failed Messages: {failed}\n"
    )
    if slack_url is not None:
        send_slack_notification(slack_url, "S2 Gap Filler", message)

    log.info(message)

    if failed > 0:
        sys.exit(1)
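
# The worker split above hinges on split_list_equally. A minimal sketch of the
# assumed behaviour (not the project's implementation): scenes are dealt out
# round-robin into num_inter_lists chunks of near-equal size, and each pod then
# picks the chunk matching its index.
def _split_list_equally_sketch(list_to_split: list, num_inter_lists: int) -> list:
    chunks = [[] for _ in range(num_inter_lists)]
    for i, item in enumerate(list_to_split):
        chunks[i % num_inter_lists].append(item)
    return chunks


# e.g. 7 scenes over 3 workers -> [[s0, s3, s6], [s1, s4], [s2, s5]];
# a worker with idx >= 3 finds no chunk and exits early, as handled above.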
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = not (not exists or overwrite)
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")
        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = not (not exists or overwrite)
            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")

    # Aggressively heavy handed, but we get memory leaks otherwise
    client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    client = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=client,
    )
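
# Hedged usage sketch for create_mosaic. The product names, bands and bucket
# below are placeholders for illustration, not values from this repository.
def _example_create_mosaic():
    from datacube import Datacube

    dc = Datacube(app="mosaic-example")
    create_mosaic(
        dc=dc,
        product="example_input_product",       # placeholder input product
        out_product="example_output_product",  # placeholder output product
        time=("2020-01-01", "2020-12-31"),
        time_str="2020",
        bands=("red", "green", "blue"),         # placeholder band names
        s3_output_root="s3://example-bucket/mosaics",  # placeholder bucket
        split_bands=True,
        resolution=120,
        overwrite=False,
    )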
def download_gls(year: str, s3_dst: str, workdir: Path, overwrite: bool = False):
    log = setup_logging()
    assets = {}
    out_stac = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    # Download the files
    for name, file in FILES.items():
        # Create a temporary directory to work with
        # (prefix must be a str, so convert the Path)
        with TemporaryDirectory(prefix=str(workdir)) as tmpdir:
            log.info(f"Working on {file}")
            url = URL(
                BASE_URL.format(
                    record_id=YEARS[year][1], year_key=YEARS[year][0], file=file
                )
            )

            dest_url = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}_{name}.tif"

            if s3_head_object(str(dest_url)) is None or overwrite:
                log.info(f"Downloading {url}")

                try:
                    local_file = Path(tmpdir) / str(url.name)
                    # Download the file
                    download_file(url, local_file)

                    log.info(f"Downloaded file to {local_file}")
                    local_file_small = translate_file_deafrica_extent(local_file)
                    log.info(f"Clipped Africa out and saved to {local_file_small}")
                    resampling = "nearest" if name in DO_NEAREST else "bilinear"

                    # Create a COG in memory and upload to S3
                    with MemoryFile() as mem_dst:
                        # Creating the COG, with a memory cache and no download. Shiny.
                        cog_translate(
                            local_file_small,
                            mem_dst.name,
                            cog_profiles.get("deflate"),
                            in_memory=True,
                            nodata=255,
                            overview_resampling=resampling,
                        )
                        mem_dst.seek(0)
                        s3_dump(mem_dst, str(dest_url), ACL="bucket-owner-full-control")
                        log.info(f"File written to {dest_url}")
                except Exception:
                    log.exception(f"Failed to process {url}")
                    exit(1)
            else:
                log.info(f"{dest_url} exists, skipping")

            assets[name] = pystac.Asset(
                href=str(dest_url), roles=["data"], media_type=pystac.MediaType.COG
            )

    # Write STAC document from the last-written file
    source_doc = f"https://zenodo.org/record/{YEARS[year][1]}"
    item = create_stac_item(
        str(dest_url),
        id=str(odc_uuid("Copernicus Global Land Cover", "3.0.1", [source_doc])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
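
# Hedged usage sketch for download_gls; the destination bucket and work
# directory are placeholders, and the year is assumed to be a key of the
# YEARS mapping the function relies on.
def _example_download_gls():
    download_gls(
        year="2019",                                   # assumed to exist in YEARS
        s3_dst="s3://example-bucket/cgls_landcover",   # placeholder destination
        workdir=Path("/tmp/cgls"),
        overwrite=False,
    )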
import boto3
import click
from datetime import datetime
from datetime import timedelta

from deafrica.utils import setup_logging
from kubernetes import config, client

# Set log level to info
log = setup_logging()


def delete_volumes(cluster_name, dryrun):
    """
    Clean up unused sandbox volumes (EBS volumes, k8s PVs and PVCs) by looking
    at CloudTrail "AttachVolume" events over the past 90 days
    """
    # configure kubernetes API client
    try:
        config.load_incluster_config()
    except config.ConfigException:
        try:
            config.load_kube_config()
        except config.ConfigException:
            log.exception("Could not configure kubernetes python client")

    k8s_api = client.CoreV1Api()
    k8s_namespace = "sandbox"

    # configure boto3 client
    ec2_resource = boto3.resource("ec2")
    ct_client = boto3.client("cloudtrail")
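
# The docstring above describes scanning CloudTrail "AttachVolume" events over
# the past 90 days, while the excerpt stops after client setup. A minimal
# sketch of how those events could be collected with the boto3 CloudTrail API
# (an assumption for illustration, not the repository's implementation):
def _example_recent_attach_events(ct_client, days: int = 90) -> list:
    events = []
    paginator = ct_client.get_paginator("lookup_events")
    pages = paginator.paginate(
        LookupAttributes=[
            {"AttributeKey": "EventName", "AttributeValue": "AttachVolume"}
        ],
        StartTime=datetime.now() - timedelta(days=days),
        EndTime=datetime.now(),
    )
    for page in pages:
        events.extend(page["Events"])
    return events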
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences
    A report containing missing keys will be written to s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is
    :param update_stac: (bool) Define if the report will contain all scenes from the source for an update
    :param notification_url: (str) optional Slack URL in case you want to send a Slack notification
    """
    log = setup_logging()

    log.info("Task started")

    # defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys)
        orphaned_keys = set()
    else:
        destination_keys = set(
            ns.Key
            for ns in list_inventory(
                manifest=f"{SENTINEL_2_INVENTORY_PATH}",
                prefix=BASE_FOLDER_NAME,
                contains=".json",
                n_threads=200,
            )
        )

        # Keys that are missing: they are in the source but not in the bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Keys that are lost: they are in the bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (
            f"{date_string}_gap_report.json"
            if not update_stac
            else URL(f"{date_string}_gap_report_update.json")
        )

        log.info(f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps(
            {"orphan": list(orphaned_keys), "missing": list(missing_scenes)}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/status-report/{output_filename}"
    message = dedent(
        f"*SENTINEL 2 GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_keys)}\n"
        f"Report: {report_http_link}\n"
    )

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
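
# A toy illustration (not repository code) of the missing/orphan logic above:
# "missing" keys are in the source listing but not yet in our bucket, while
# "orphaned" keys are in our bucket but no longer present in the source.
def _example_missing_and_orphaned():
    source_keys = {"tiles/a.json", "tiles/b.json", "tiles/c.json"}
    destination_keys = {"tiles/b.json", "tiles/c.json", "tiles/z.json"}

    missing = {key for key in source_keys if key not in destination_keys}
    orphaned = destination_keys.difference(source_keys)

    assert missing == {"tiles/a.json"}
    assert orphaned == {"tiles/z.json"}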
def download_cci_lc(year: str, s3_dst: str, workdir: str, overwrite: bool = False):
    log = setup_logging()
    assets = {}

    cci_lc_version = get_version_from_year(year)
    name = f"{PRODUCT_NAME}_{year}_{cci_lc_version}"

    out_cog = URL(s3_dst) / year / f"{name}.tif"
    out_stac = URL(s3_dst) / year / f"{name}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    workdir = Path(workdir)
    if not workdir.exists():
        workdir.mkdir(parents=True, exist_ok=True)

    # Create a temporary directory to work with
    tmpdir = mkdtemp(prefix=str(f"{workdir}/"))
    log.info(f"Working on {year} in the path {tmpdir}")

    if s3_head_object(str(out_cog)) is None or overwrite:
        log.info(f"Downloading {year}")
        try:
            local_file = Path(tmpdir) / f"{name}.zip"
            if not local_file.exists():
                # Download the file
                c = cdsapi.Client()
                # We could also retrieve the object metadata from the CDS.
                # e.g. f = c.retrieve("series",{params}) | f.location = URL to download
                c.retrieve(
                    "satellite-land-cover",
                    {
                        "format": "zip",
                        "variable": "all",
                        "version": cci_lc_version,
                        "year": str(year),
                    },
                    local_file,
                )
                log.info(f"Downloaded file to {local_file}")
            else:
                log.info(f"File {local_file} exists, continuing without downloading")

            # Unzip the file
            log.info(f"Unzipping {local_file}")
            unzipped = None
            with zipfile.ZipFile(local_file, "r") as zip_ref:
                unzipped = local_file.parent / zip_ref.namelist()[0]
                zip_ref.extractall(tmpdir)

            # Process data
            ds = xr.open_dataset(unzipped)
            # Subset to Africa
            ulx, uly, lrx, lry = AFRICA_BBOX
            # Note: lats are upside down!
            ds_small = ds.sel(lat=slice(uly, lry), lon=slice(ulx, lrx))
            ds_small = assign_crs(ds_small, crs="epsg:4326")

            # Create cog (in memory - :mem: returns bytes object)
            mem_dst = write_cog(
                ds_small.lccs_class,
                ":mem:",
                nodata=0,
                overview_resampling="nearest",
            )

            # Write to s3
            s3_dump(mem_dst, str(out_cog), ACL="bucket-owner-full-control")
            log.info(f"File written to {out_cog}")

        except Exception:
            log.exception(f"Failed to process {name}")
            exit(1)
    else:
        log.info(f"{out_cog} exists, skipping")

    assets["classification"] = pystac.Asset(
        href=str(out_cog), roles=["data"], media_type=pystac.MediaType.COG
    )

    # Write STAC document
    source_doc = (
        "https://cds.climate.copernicus.eu/cdsapp#!/dataset/satellite-land-cover"
    )
    item = create_stac_item(
        str(out_cog),
        id=str(odc_uuid("Copernicus Land Cover", cci_lc_version, [source_doc, name])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
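
# Hedged usage sketch for download_cci_lc; the destination and work directory
# are placeholders. A configured CDS API key (~/.cdsapirc) is assumed to be
# available for the cdsapi client used above.
def _example_download_cci_lc():
    download_cci_lc(
        year="2019",
        s3_dst="s3://example-bucket/cci_landcover",  # placeholder destination
        workdir="/tmp/cci_lc",
        overwrite=False,
    )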
def fill_the_gap(
    landsat: str,
    sync_queue_name: str,
    scenes_limit: Optional[int] = None,
    notification_url: str = None,
) -> None:
    """
    Function to retrieve the latest gap report and create messages for the filter queue process.

    :param landsat: (str) satellite name
    :param sync_queue_name: (str) queue name
    :param scenes_limit: (int) limit of how many scenes will be filled
    :param notification_url: (str) Slack notification URL
    :return: (None)
    """
    log = setup_logging()

    log.info(f"Satellite: {landsat}")
    log.info(f"Queue: {sync_queue_name}")
    log.info(f"Limit: {scenes_limit if scenes_limit else 'No limit'}")
    log.info(f"Notification URL: {notification_url}")

    environment = "DEV" if "dev" in sync_queue_name else "PDS"

    latest_report = find_latest_report(
        report_folder_path=S3_BUCKET_PATH, contains=landsat
    )

    if not latest_report:
        raise RuntimeError("Report not found!")

    update_stac = False
    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")
        update_stac = True

    log.info(f"Reading missing scenes from the report {latest_report}")

    missing_scene_paths = read_report_missing_scenes(
        report_path=latest_report, limit=scenes_limit
    )

    log.info(f"Number of scenes found {len(missing_scene_paths)}")
    log.info(f"Example scenes: {missing_scene_paths[0:10]}")

    returned = build_messages(
        missing_scene_paths=missing_scene_paths, update_stac=update_stac
    )

    messages_to_send = returned["message_list"]

    log.info("Publishing messages")
    result = post_messages(message_list=messages_to_send, queue_name=sync_queue_name)

    error_flag = (
        ":red_circle:" if result["failed"] > 0 or len(returned["failed"]) > 0 else ""
    )
    extra_issues = "\n".join(returned["failed"])

    message = dedent(
        f"{error_flag}*Landsat GAP Filler - {environment}*\n"
        f"Sent Messages: {result['sent']}\n"
        f"Failed Messages: {int(result['failed']) + len(returned['failed'])}\n"
        f"Failed sending: {int(result['failed'])}\n"
        f"Other issues presented: {extra_issues}"
    )

    log.info(message)

    if notification_url is not None and result["sent"] > 0:
        send_slack_notification(notification_url, "Landsat Gap Filler", message)

    if (int(result["failed"]) + len(returned["failed"])) > 0:
        sys.exit(1)
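
# Hedged usage sketch for fill_the_gap; the satellite label, queue name and
# limit below are placeholders for illustration only.
def _example_fill_the_gap():
    fill_the_gap(
        landsat="ls8",                                 # placeholder satellite label
        sync_queue_name="example-landsat-sync-queue",  # placeholder queue name
        scenes_limit=100,
        notification_url=None,                         # or a Slack webhook URL
    )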
def generate_buckets_diff(
    bucket_name: str,
    satellites: str,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare USGS bulk files and the Africa inventory bucket, detecting differences
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH
    """
    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/"
    )
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL), file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []
    else:
        # Collect missing scenes
        # missing scenes = keys that are in the bulk file but missing in the PDS sync bucket and/or in the source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # Collect orphan scenes
        # orphan scenes = keys that are in the PDS sync bucket but missing in the bulk file and/or in the source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

    log.info(f"Found {len(missing_scenes)} missing scenes")
    log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
    log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
    log.info(f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            (
                f"{title}_{date_string}_gap_report.json"
                if not update_stac
                else URL(f"{date_string}_gap_report_update.json")
            )
            .replace(" ", "_")
            .replace("_&", "")
        )

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps(
            {"orphan": orphaned_scenes, "missing": missing_scenes}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (
        str(landsat_status_report_url / output_filename)
        if len(missing_scenes) > 0 or len(orphaned_scenes) > 0
        else output_filename
    )

    message = dedent(
        f"*{title} GAP REPORT - {environment}*\n "
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_scenes)}\n"
        f"Report: {report_output}\n"
    )

    log.info(message)

    log.info(f"File {file_name} processed and sent in {time_process(start=start_timer)}")

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(
                notification_url, f"{satellites} Gap Report", message
            )
        raise Exception(f"More than 200 scenes were found \n {message}")
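
# Small illustration (an assumption for clarity, not repository code) of how
# the report title and filename are derived above from a satellites list such
# as ["ls8", "ls9"]:
def _example_report_title():
    satellites = ["ls8", "ls9"]
    title = " & ".join(satellites).replace("ls", "Landsat ")
    # -> "Landsat 8 & Landsat 9"
    filename = f"{title}_2024-01-01_gap_report.json".replace(" ", "_").replace("_&", "")
    # -> "Landsat_8_Landsat_9_2024-01-01_gap_report.json"
    return title, filename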