def cli(config, inventory_manifest, queue_url, bucket, from_date, s3_keys=None):
    """
    Send messages (YAML S3 keys) to stac_queue
    """

    with open(config, "r") as cfg_file:
        cfg = YAML.load(cfg_file)

    if not s3_keys:
        s3_client = make_s3_client()
        inventory_items = list_inventory(inventory_manifest, s3=s3_client)

        if from_date:
            inventory_items = (
                item
                for item in inventory_items
                if dateutil.parser.parse(item.LastModifiedDate) > from_date
            )

        s3_keys = yamls_in_inventory_list(inventory_items, cfg)
    else:
        # Filter out non-YAML keys
        s3_keys = [item for item in s3_keys if item.endswith(".yaml")]

    LOG.info("Sending %s update messages", len(s3_keys))

    messages_to_sqs(s3_keys, bucket, queue_url)

    LOG.info("Done")
Example #2
def cli(inventory, prefix, regex, glob, aws_profile):
    """List S3 inventory entries.

        prefix can be combined with a regex or a glob pattern, but supplying
        both regex and glob at the same time is not supported.

    \b
    Example:
       s3-inventory s3://my-inventory-bucket/path-to-inventory/ '*yaml'

    """
    def entry_to_url(entry):
        return 's3://{e.Bucket}/{e.Key}'.format(e=entry)

    flush_freq = 100
    s3 = make_s3_client(profile=aws_profile)

    if glob == '':
        glob = None

    if glob is not None and regex is not None:
        click.echo("Can not mix regex and shell patterns")
        sys.exit(1)

    if inventory is None:
        # TODO: read from config file
        inventory = 's3://dea-public-data-inventory/dea-public-data/dea-public-data-csv-inventory/'

    predicate = build_predicate(glob=glob, regex=regex, prefix=prefix)

    to_str = entry_to_url

    for i, entry in enumerate(list_inventory(inventory, s3=s3)):
        if predicate(entry):
            print(to_str(entry), flush=(i % flush_freq) == 0)
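# build_predicate is not defined in this snippet. A minimal sketch, assuming it
# combines an optional key prefix with either a glob or a regex match
# (hypothetical; the real helper may differ):
import re
from fnmatch import fnmatch

def build_predicate(glob=None, regex=None, prefix=None):
    rx = re.compile(regex) if regex is not None else None

    def predicate(entry):
        # entry is an inventory record with a .Key attribute
        if prefix and not entry.Key.startswith(prefix):
            return False
        if glob is not None:
            return fnmatch(entry.Key, glob)
        if rx is not None:
            return bool(rx.search(entry.Key))
        return True

    return predicate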
def update_parent_catalogs(
    bucket,
    cfg,
    from_date,
    inventory_manifest,
    contents_file,
    s3_keys=None,
    dry_run=False,
):
    if contents_file is not None:
        with open(contents_file) as fin:
            s3_keys = [line.strip() for line in fin]

    elif not s3_keys:
        s3_client = make_s3_client()
        inventory_items = list_inventory(inventory_manifest, s3=s3_client)
        if from_date:
            inventory_items = (
                item for item in inventory_items
                if dateutil.parser.parse(item.LastModifiedDate) > from_date)
        s3_keys = yamls_in_inventory_list(inventory_items, cfg)

    cu = StacCollections(cfg, dry_run)
    cu.add_items(s3_keys)
    cu.persist_all_catalogs(bucket, dry_run=dry_run)
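# yamls_in_inventory_list is referenced in several snippets but not shown. A
# plausible sketch, assuming cfg carries a list of product prefixes under a
# "products" key (hypothetical field name):
def yamls_in_inventory_list(inventory_items, cfg):
    prefixes = tuple(product["prefix"] for product in cfg.get("products", []))
    return [
        item.Key
        for item in inventory_items
        if item.Key.endswith(".yaml")
        and (not prefixes or item.Key.startswith(prefixes))
    ]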
Example #4
def get_and_filter_cogs_keys():
    """
    Retrieve the key list from an inventory bucket and filter it
    :return: set of filtered S3 keys
    """

    s3 = s3_client(region_name=SOURCE_REGION)
    source_keys = list_inventory(
        manifest=f"{SOURCE_INVENTORY_PATH}",
        s3=s3,
        prefix=BASE_FOLDER_NAME,
        contains=".json",
        n_threads=200,
    )

    africa_tile_ids = set(
        pd.read_csv(
            "https://raw.githubusercontent.com/digitalearthafrica/deafrica-extent/master/deafrica-mgrs-tiles.csv.gz",
            header=None,
        ).values.ravel())

    return set(
        key.Key for key in source_keys
        if (key.Key.split("/")[-2].split("_")[1] in africa_tile_ids
            # ignore old-format data, whose keys have the year directly under the top-level prefix
            and re.match(r"sentinel-s2-l2a-cogs/\d{4}/", key.Key) is None))
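# Worked example of the key parsing above (illustrative key layout, based on the
# public sentinel-cogs naming convention): for a key such as
#   sentinel-s2-l2a-cogs/37/M/BN/2021/1/S2A_37MBN_20210101_0_L2A/S2A_37MBN_20210101_0_L2A.json
# the expression key.Key.split("/")[-2].split("_")[1] yields "37MBN", the MGRS
# tile id, which is then checked against the Africa tile list.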
Example #5
def cli(
    inventory,
    prefix,
    regex,
    glob,
    aws_profile,
    no_sign_request=None,
    request_payer=False,
):
    """List S3 inventory entries.

        prefix can be combined with a regex or a glob pattern, but supplying
        both regex and glob at the same time is not supported.

    \b
    Example:
       s3-inventory s3://my-inventory-bucket/path-to-inventory/ '*yaml'

    """

    def entry_to_url(entry):
        return "s3://{e.Bucket}/{e.Key}".format(e=entry)

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    flush_freq = 100
    s3 = s3_client(profile=aws_profile, aws_unsigned=no_sign_request)

    if glob == "":
        glob = None

    if glob is not None and regex is not None:
        click.echo("Can not mix regex and shell patterns")
        sys.exit(1)

    if inventory is None:
        # TODO: read from config file
        inventory = "s3://dea-public-data-inventory/dea-public-data/dea-public-data-csv-inventory/"

    predicate = build_predicate(glob=glob, regex=regex, prefix=prefix)

    to_str = entry_to_url

    for i, entry in enumerate(list_inventory(inventory, s3=s3, **opts)):
        if predicate(entry):
            print(to_str(entry), flush=(i % flush_freq) == 0)
def delete_stac_catalog_parents(aws_product_prefix, bucket, inventory_bucket):
    s3_client = boto3.client("s3")
    delete_files = dict(Objects=[])
    for item in list_inventory(
            f"s3://{inventory_bucket}/{bucket}/{bucket}-csv-inventory/",
            s3=make_s3_client()):
        s3_key_file = PurePosixPath(item.Key)

        # add to delete list
        if s3_key_file.name == "catalog.json" and aws_product_prefix in item.Key:
            print(item.Key)
            delete_files["Objects"].append(dict(Key=item.Key))

        # flush the delete list once the AWS per-request limit is reached
        if len(delete_files["Objects"]) >= AWS_DELETE_LIMIT:
            s3_client.delete_objects(Bucket=bucket, Delete=delete_files)
            delete_files = dict(Objects=[])

    # flush out the remaining
    if len(delete_files["Objects"]) >= AWS_DELETE_LIMIT:
        s3_client.delete_objects(Bucket=bucket, Delete=delete_files)
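# Hedged usage example: the bucket/inventory-bucket pair below matches the
# inventory layout seen elsewhere in these snippets; the product prefix is
# illustrative only. AWS_DELETE_LIMIT is typically 1000, the per-request cap
# for S3 delete_objects.
# delete_stac_catalog_parents(
#     aws_product_prefix="ga_ls8c_ard_3",
#     bucket="dea-public-data",
#     inventory_bucket="dea-public-data-inventory",
# )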
INVENTORY_BUCKET = "deafrica-sentinel-2-inventory"
PREFIX = "deafrica-sentinel-2/deafrica-sentinel-2-inventory/"

DO_FIX = False

if DO_FIX:
    client = s3_client(region_name="af-south-1")
else:
    client = s3_client(aws_unsigned=True, region_name="af-south-1")

manifest = find_latest_manifest(
    f"s3://{INVENTORY_BUCKET}/{PREFIX}",
    client,
)

inventory = list_inventory(manifest, s3=client)

report_every = 10000
count = 0

json_docs = 0
to_fix = 0

for obj in inventory:
    count += 1
    if count % report_every == 0:
        print(f"Processing {count}")
    if obj.Key.endswith(".json"):
        json_docs += 1
        o_dict = s3_head_object(f"s3://{obj.Bucket}/{obj.Key}", s3=client)
        if o_dict["ContentType"] != "application/json":
Example #8
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare the Sentinel-2 buckets in the US and Africa and detect differences.
    A report containing the missing keys is written to s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is saved
    :param update_stac: (bool) If True, the report will contain all scenes from the source for an update
    :param notification_url: (str) Optional Slack webhook URL for sending a notification
    """

    log = setup_logging()

    log.info("Task started")

    # defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}"
                             for key in source_keys)
        orphaned_keys = set()

    else:

        destination_keys = set(ns.Key for ns in list_inventory(
            manifest=f"{SENTINEL_2_INVENTORY_PATH}",
            prefix=BASE_FOLDER_NAME,
            contains=".json",
            n_threads=200,
        ))

        # Missing keys: present in the source but not in the destination bucket
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys
                             if key not in destination_keys)

        # Orphaned keys: present in the destination bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (f"{date_string}_gap_report.json" if not update_stac
                           else URL(f"{date_string}_gap_report_update.json"))

        log.info(
            f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps({
            "orphan": list(orphaned_keys),
            "missing": list(missing_scenes)
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/status-report/{output_filename}"
    message = dedent(f"*SENTINEL 2 GAP REPORT - {environment}*\n"
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_keys)}\n"
                     f"Report: {report_http_link}\n")

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")