Example #1
def lambda_handler(payload, context={}):
    logger.debug('Payload: %s' % json.dumps(payload))

    # if this is batch, output to stdout
    if not hasattr(context, "invoked_function_arn"):
        logger.addHandler(logging.StreamHandler())

    collections = payload.get('collections')
    index = payload.get('index', 'input_state')
    state = payload.get('state', 'FAILED')
    since = payload.get('since', None)
    limit = payload.get('limit', None)
    batch = payload.get('batch', False)
    catids = payload.get('catids', [])

    # if this is a lambda and batch is set
    if batch and hasattr(context, "invoked_function_arn"):
        submit_batch_job(payload, context.invoked_function_arn, name='rerun')
        return

    if len(catids) > 0:
        catalogs = Catalogs.from_catids(catids)
        logger.debug(f"Rerunning {len(catalogs)} catalogs")
        catids = catalogs.process(replace=True)
        logger.info(f"{len(catids)} catalogs rerun")
        return catids

    catalogs = Catalogs.from_statedb(collections, state, since, index, limit=limit)

    logger.info(f"Fetched {len(catalogs.catalogs)} catalogs")
    catids = catalogs.process(replace=True)
    logger.info(f"{len(catids)} catalogs rerun")

    return catids
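
A minimal sketch of a payload this rerun handler could be invoked with; the collection name is hypothetical, and the keys correspond to those read at the top of the handler:

payload = {
    'collections': 'my-collection',  # hypothetical collection name
    'state': 'FAILED',               # rerun catalogs that previously failed
    'index': 'input_state',
    'limit': 100,
    'batch': False,                  # run inline instead of submitting an AWS Batch job
}
lambda_handler(payload)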
Example #2
def handler(event, context={}):
    # if this is batch, output to stdout
    if not hasattr(context, "invoked_function_arn"):
        logger.addHandler(logging.StreamHandler())

    logger.debug('Event: %s' % json.dumps(event))

    # parse input
    url = event.get('url')
    batch = event.get('batch', False)
    process = event['process']

    if batch and hasattr(context, "invoked_function_arn"):
        submit_batch_job(event,
                         context.invoked_function_arn,
                         definition='lambda-as-batch',
                         name='feed-stac-crawl')
        return

    cat = Catalog.from_file(url)

    for item in cat.get_all_items():
        payload = {
            'type': 'FeatureCollection',
            'features': [item.to_dict()],
            'process': process
        }
        SNS_CLIENT.publish(TopicArn=SNS_TOPIC, Message=json.dumps(payload))
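
All of these handlers hand large workloads off through submit_batch_job. A rough, hypothetical sketch of how such a helper could wrap the boto3 Batch API is shown below; the job queue environment variable and the way the payload reaches the container are assumptions, not the library's actual implementation:

import json
import os

import boto3

BATCH_CLIENT = boto3.client('batch')

def submit_batch_job(payload, arn, definition='lambda-as-batch', name='batch-job'):
    # Hypothetical sketch: run the same handler code as an AWS Batch job,
    # passing the original payload and the invoking Lambda ARN on the command line
    response = BATCH_CLIENT.submit_job(
        jobName=name,
        jobQueue=os.getenv('BATCH_JOB_QUEUE', 'default-queue'),  # assumed env var
        jobDefinition=definition,
        containerOverrides={
            'command': ['run.py', arn, json.dumps(payload)],  # assumed container entrypoint
        },
    )
    return response['jobId']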
Example #3
def handler(payload, context={}):
    logger.debug('Payload: %s' % json.dumps(payload))

    collections = payload.get('collections')
    index = payload.get('index', 'input_state')
    state = payload.get('state', 'FAILED')
    since = payload.get('since', None)
    limit = payload.get('limit', None)
    batch = payload.get('batch', False)
    process_update = payload.get('process_update', None)
    catid_batch = 5

    # if this is a lambda and batch is set
    if batch and hasattr(context, "invoked_function_arn"):
        submit_batch_job(payload, context.invoked_function_arn, name='rerun')
        return

    items = statedb.get_items(collections, state, since, index, limit=limit)

    nitems = len(items)
    logger.debug(f"Rerunning {nitems} catalogs")

    catids = []
    for i, item in enumerate(items):
        catids.append(item['catid'])
        if (i % catid_batch) == 0:
            submit(catids, process_update=process_update)
            catids = []
        if (i % 1000) == 0:
            logger.debug(f"Queued {i} catalogs")
    if len(catids) > 0:
        submit(catids, process_update=process_update)

    return {"found": nitems}
Example #4
def handler(event, context={}):
    logger.debug('Event: %s' % json.dumps(event))

    url = event.get('url')
    params = event.get('search', {})
    max_items_batch = event.get('max_items_batch', 15000)
    sleep = event.get('sleep', None)
    process = event.get('process', None)

    # search API
    search = Search(url=url, **params)
    logger.debug(f"Searching {url}")

    found = search.found()
    logger.debug(f"Total items found: {found}")

    if found <= MAX_ITEMS_REQUEST:
        return run(params, url, sleep=sleep, process=process)
    elif hasattr(context, "invoked_function_arn"):
        nbatches = int(found / max_items_batch) + 1
        if nbatches == 1:
            submit_batch_job(event,
                             context.invoked_function_arn,
                             definition='lambda-as-batch')
        else:
            for request in split_request(params, nbatches):
                event['search'] = request
                submit_batch_job(event,
                                 context.invoked_function_arn,
                                 definition='lambda-as-batch')
        logger.info(f"Submitted {nbatches} batches")
        return
    else:
        run(params, url, sleep=sleep, process=process)
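
A hedged example of an event for this search handler; the STAC API URL, the search parameters, and the process block are placeholders (only input_collections is taken from the examples on this page):

event = {
    'url': 'https://example.com/stac',  # hypothetical STAC API endpoint
    'search': {
        'collections': ['my-collection'],
        'datetime': '2020-01-01/2020-06-30',
    },
    'max_items_batch': 15000,
    'sleep': None,
    'process': {'input_collections': ['my-collection']},  # placeholder process definition
}
handler(event)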
Example #5
def lambda_handler(event, context={}):
    logger.debug('Event: %s' % json.dumps(event))

    # if this is batch, output to stdout
    if not hasattr(context, "invoked_function_arn"):
        logger.addHandler(logging.StreamHandler())

    # parse input
    #s3urls = event['s3urls']
    #suffix = event.get('suffix', 'json')
    #credentials = event.get('credentials', {})
    #requester_pays = credentials.pop('requester_pays', False)

    ######
    url = event.get('url')
    params = event.get('search', {})
    max_items_batch = event.get('max_items_batch', 15000)
    sleep = event.get('sleep', None)

    # search API
    search = Search(api_url=url, **params)
    logger.debug(f"Searching {url}")

    found = search.found()
    logger.debug(f"Total items found: {found}")

    if found <= MAX_ITEMS_REQUEST:
        return run(params, url, sleep=sleep)
    elif hasattr(context, "invoked_function_arn"):
        nbatches = int(found / max_items_batch) + 1
        if nbatches == 1:
            submit_batch_job(event,
                             context.invoked_function_arn,
                             definition='lambda-as-batch')
        else:
            for request in split_request(params, nbatches):
                event['search'] = request
                submit_batch_job(event,
                                 context.invoked_function_arn,
                                 definition='lambda-as-batch')
        logger.info(f"Submitted {nbatches} batches")
        return
    else:
        run(params, url, sleep=sleep)
Example #6
def submit_inventory_batch_jobs(inventory_url,
                                lambda_arn,
                                batch_size: int = 10,
                                max_batches: int = -1):
    urls = []
    n = 0
    for url in s3().latest_inventory_files(inventory_url):
        urls.append(url)
        if (len(urls) % batch_size) == 0:
            submit_batch_job({'inventory_files': urls}, lambda_arn)
            urls = []
            n += 1
            if max_batches > 0 and n >= max_batches:
                break
    if len(urls) > 0:
        submit_batch_job({'inventory_files': urls}, lambda_arn)
        n += 1
    logger.info(f"Submitted {n} jobs")
    return n
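
A short, hypothetical usage sketch for the helper above; the inventory URL and Lambda ARN are placeholders:

njobs = submit_inventory_batch_jobs(
    's3://my-bucket/inventory-prefix',                        # hypothetical inventory location
    'arn:aws:lambda:us-east-1:123456789012:function:feeder',  # hypothetical Lambda ARN
    batch_size=10,
    max_batches=2,  # cap the number of submitted jobs, e.g. for testing
)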
Example #7
def handler(payload, context={}):
    logger.info('Payload: %s' % json.dumps(payload))

    # get payload variables
    inventory_url = payload.pop('inventory_url', None)
    batch_size = payload.pop('batch_size', 10)
    max_batches = payload.pop('max_batches', -1)
    # required payload variable
    process = payload.pop('process')

    s3session = s3()

    # get latest inventory manifest and spawn batches (this currently assumes being run as Lambda!)
    if inventory_url is not None:
        inventory_bucket = s3session.urlparse(inventory_url)['bucket']
        # get manifest and schema
        manifest = s3session.latest_inventory_manifest(inventory_url)
        schema = manifest['fileSchema']
        if schema.startswith('struct'):
            keys = [
                str(key).strip().split(':')[0]
                for key in schema[7:-1].split(',')
            ]
        else:
            keys = [str(key).strip() for key in schema.split(',')]

        # get list of inventory files
        files = manifest.get('files')
        logger.info('Getting latest inventory (%s files) from %s' %
                    (len(files), inventory_url))

        submitted_urls = []
        njobs = 0
        for f in files:
            url = f"s3://{inventory_bucket}/{f['key']}"
            submitted_urls.append(url)
            if (len(submitted_urls) % batch_size) == 0:
                batch_payload = {
                    'inventory_files': submitted_urls,
                    'keys': keys,
                    'process': process
                }
                batch_payload.update(payload)
                submit_batch_job(batch_payload,
                                 context.invoked_function_arn,
                                 definition='lambda-as-batch',
                                 name='feed-s3-inventory')
                submitted_urls = []
                njobs += 1
                # stop if max batches reached (used for testing)
                if max_batches > 0 and njobs >= max_batches:
                    break
        if len(submitted_urls) > 0:
            batch_payload = {
                'inventory_files': submitted_urls,
                'keys': keys,
                'process': process
            }
            batch_payload.update(payload)
            submit_batch_job(batch_payload,
                             context.invoked_function_arn,
                             definition='lambda-as-batch',
                             name='feed-s3-inventory')
            njobs += 1
        logger.info(f"Submitted {njobs} batch jobs")
        return njobs

    # process inventory files (assumes this is batch!)
    inventory_files = payload.pop('inventory_files', None)
    keys = payload.pop('keys', None)
    base_url = payload.pop('base_url', None)

    # these are all required
    catids = []
    if inventory_files and keys and process:
        # filter filenames
        logger.info(f"Parsing {len(inventory_files)} inventory files")
        for f in inventory_files:
            for url in read_inventory_file(f, keys, **payload):
                parts = s3session.urlparse(url)
                id = '-'.join(op.dirname(parts['key']).split('/'))

                # use extension without . for asset key
                ext = op.splitext(parts['key'])[-1].lstrip('.')

                if base_url is not None and url.startswith('s3://'):
                    url = f"{base_url}/{parts['bucket']}/{parts['key']}"

                # TODO - determine input collection from url
                item = {
                    'type': 'Feature',
                    'id': id,
                    'collection': process['input_collections'][0],
                    'properties': {},
                    'assets': {
                        ext: {
                            'href': url
                        }
                    }
                }
                catalog = {
                    'type': 'FeatureCollection',
                    'features': [item],
                    'process': process
                }

                # feed to cirrus through SNS topic
                SNS_CLIENT.publish(TopicArn=SNS_TOPIC,
                                   Message=json.dumps(catalog))
                if (len(catids) % 1000) == 0:
                    logger.debug(
                        f"Published {len(catids)} catalogs to {SNS_TOPIC}: {json.dumps(catalog)}"
                    )

                catids.append(item['id'])

        logger.info(
            f"Published {len(catids)} catalogs from {len(inventory_files)} inventory files"
        )
        return catids