Example No. 1
async def __write_chunks_and_send_messages(batch_id, records,
                                           dynamodb_resource, sqs_client):
    async with trace("Writing/sending chunks for batch {}", batch_id):
        async with await batch_tasks_table.new_batch_writer(
                dynamodb_resource) as batch_writer:
            for index, record in enumerate(records):
                pending_task = {
                    "batchId": batch_id,
                    "index": index,
                    "request": record
                }
                validate_pending_task(pending_task)
                await batch_tasks_table.put_pending_batch_task(
                    pending_task, batch_writer)

        async with process_queue.new_batch_sender(sqs_client) as batch_sender:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                async with trace("Sending chunk {} of tasks for batch_id={}",
                                 chunk_index, batch_id):
                    chunk = {
                        "batchId": batch_id,
                        "index": chunk_index,
                        "records": [{
                            "index": chunk_index * CHUNK_SIZE + record_index
                        } for record_index, record in enumerate(chunk)]
                    }
                    await batch_sender.send_message(
                        message={
                            "Id": str(uuid4()),
                            "MessageBody": json.dumps(chunk)
                        })
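Several of the examples in this listing iterate with an enumchunks helper that is not shown here. As a point of reference, a minimal sketch of what it presumably does, assuming it yields (chunk_index, chunk) pairs of at most chunk_size items from any sliceable sequence:

def enumchunks(sequence, chunk_size):
    # Yield (chunk_index, chunk) pairs, each chunk holding at most
    # chunk_size items; works for any sliceable sequence (list, range, ...).
    for chunk_index, start in enumerate(range(0, len(sequence), chunk_size)):
        yield chunk_index, sequence[start:start + chunk_size]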
Example No. 2
async def __process(record, s3_resource, batch_writer):
    async with trace("Processing {}", json.dumps(record)):
        validate_pending_task(record)
        index = record["index"]
        batch_id = record["batchId"]
        request = record["request"]
        item_no = request["itemNo"]
        await items_table.put_item(
            {
                "itemNo": str(item_no),
                "updateTimestamp": now_epoch_millis()
            }, batch_writer)
        processed_task = {
            "batchId": batch_id,
            "index": index,
            "request": request,
            "response": {
                "success": True,
                "message": "Ok"
            }
        }
        validate_processed_task(processed_task)
        await work_bucket.write_task_result(batch_id, index, processed_task,
                                            s3_resource)
        await work_bucket.delete_pending_task(batch_id, index, s3_resource)
Example No. 3
async def put_batch_status(batch_id, record_count, dynamodb_resource):
    async with trace("Put batch status for batch_id={}", batch_id):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        await table.put_item(Item={
            "batchId": batch_id,
            "taskCount": record_count
        })
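Every example wraps its work in trace(...), used both with "with" and with "async with", but the helper itself is not part of this listing. A minimal logging/timing sketch that supports both protocols could look like the following; the str.format-style message matches the call sites, while the timing output is an assumption:

import logging
import time

logger = logging.getLogger(__name__)


class trace:
    # Sketch of the trace helper: log the formatted message on entry and the
    # elapsed time on exit; usable with both "with" and "async with".

    def __init__(self, message, *args):
        self._message = message.format(*args)

    def __enter__(self):
        logger.info(self._message)
        self._started = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        elapsed_ms = (time.perf_counter() - self._started) * 1000
        logger.info("%s finished in %.1f ms", self._message, elapsed_ms)
        return False

    async def __aenter__(self):
        return self.__enter__()

    async def __aexit__(self, exc_type, exc, tb):
        return self.__exit__(exc_type, exc, tb)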
Example No. 4
async def __gather(record, s3_resource, s3_client):
    batch_id = record["batchId"]
    record_gather_started(batch_id)
    async with trace("Gathering results for batch batch_id={}", batch_id):
        status = await work_bucket.read_batch_status(batch_id, s3_resource)
        chunk_count = ceil(status["taskCount"] / status["chunkSize"])

        bufstream = BytesBufferIO()
        jsonstream = JsonStream(
            fp=io.TextIOWrapper(gzip.GzipFile(fileobj=bufstream, mode='wb'),
                                write_through=True,
                                encoding='utf-8'))
        jsonstream.start_object()
        for key, value in status.items():
            jsonstream.write_property(key, value)
        jsonstream.start_property("records")
        jsonstream.start_array()
        for _, chunks in enumchunks(range(chunk_count), 10):
            chunk_results = await asyncio.gather(*[
                work_bucket.read_chunk_result(batch_id, index, s3_resource)
                for index in chunks
            ])
            for chunk_result in chunk_results:
                for record in chunk_result["records"]:
                    jsonstream.write_value(record)
        jsonstream.end_array()
        jsonstream.end_object()
        jsonstream.close()
        json_bytes = bufstream.getvalue()
        await output_bucket.upload_batch_output(
            batch_id, gzip.GzipFile(fileobj=io.BytesIO(json_bytes), mode='rb'),
            s3_client)

        # await work_bucket.delete_batch_status(batch_id)
        record_batch_finished(batch_id)
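Example No. 4 streams the gathered records through two helpers, BytesBufferIO and JsonStream, that are not included in this listing. Under the assumption that BytesBufferIO is a BytesIO whose contents stay readable after the wrapping streams are closed, and that JsonStream writes JSON incrementally so the whole result document never has to exist as one Python object, a rough sketch:

import io
import json


class BytesBufferIO(io.BytesIO):
    # Assumed behaviour: keep the buffer contents available via getvalue()
    # even after close(), so wrapping streams may be closed safely.
    def close(self):
        self._final_value = self.getvalue()
        super().close()

    def getvalue(self):
        return self._final_value if self.closed else super().getvalue()


class JsonStream:
    # Assumed behaviour: incremental JSON writer over a text file object.
    def __init__(self, fp):
        self._fp = fp
        self._needs_comma = []

    def _separate(self):
        if self._needs_comma:
            if self._needs_comma[-1]:
                self._fp.write(",")
            self._needs_comma[-1] = True

    def start_object(self):
        self._separate()
        self._fp.write("{")
        self._needs_comma.append(False)

    def end_object(self):
        self._needs_comma.pop()
        self._fp.write("}")

    def start_array(self):
        self._fp.write("[")
        self._needs_comma.append(False)

    def end_array(self):
        self._needs_comma.pop()
        self._fp.write("]")

    def start_property(self, name):
        self._separate()
        self._fp.write(json.dumps(name) + ":")

    def write_property(self, name, value):
        self._separate()
        self._fp.write(json.dumps(name) + ":" + json.dumps(value))

    def write_value(self, value):
        self._separate()
        self._fp.write(json.dumps(value))

    def close(self):
        self._fp.close()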
Example No. 5
async def exists_pending_chunk(batch_id, s3_resource):
    async with trace("Checking if batch batch_id={} is complete", batch_id):
        s3_bucket = await s3_resource.Bucket(name=WORK_BUCKET)
        prefix = "{}/pending/".format(batch_id)
        async for _ in s3_bucket.objects.filter(Prefix=prefix, MaxKeys=1):
            return True
        return False
Example No. 6
def write_pending_task(batch_id, index, request):
    object_key = "{}/pending/{}.json".format(batch_id, index)
    with trace("Write pending task {}/{} to s3", WORK_BUCKET, object_key):
        s3_resource.Object(WORK_BUCKET, object_key) \
            .put(ACL='private', Body=json.dumps({"batchId": batch_id,
                                                 "index": index,
                                                 "request": request}))
Example No. 7
def read_task_result(batch_id, index):
    object_key = "{}/done/{}.json".format(batch_id, index)
    with trace("Reading task result {}/{} to s3", WORK_BUCKET, object_key):
        s3_object = s3_resource.Object(WORK_BUCKET, object_key)
        data = s3_object.get()['Body'].read()
        json_doc = json.loads(data)
        return json_doc
Example No. 8
def read_batch_status(batch_id):
    with trace("Reading status for batch_id={}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_object = s3_resource.Object(WORK_BUCKET, object_key)
        data = s3_object.get()['Body'].read()
        json_doc = json.loads(data)
        return json_doc
Example No. 9
async def __write_tasks_and_send_messages(batch_id, records, s3_resource,
                                          sqs_client):
    async with trace("Writing/sending {} tasks for batch {}", len(records),
                     batch_id):
        async with S3BatchWriter(s3_resource=s3_resource,
                                 flush_amount=100) as batch_writer:
            for index, record in enumerate(records, start=0):
                pending_task = {
                    "batchId": batch_id,
                    "index": index,
                    "request": record
                }
                validate_pending_task(pending_task)
                await work_bucket.write_pending_task(batch_id, index,
                                                     pending_task,
                                                     batch_writer)

        async with process_queue.new_batch_sender(sqs_client) as batch_sender:
            for index, record in enumerate(records, start=0):
                pending_task = {
                    "batchId": batch_id,
                    "index": index,
                    "request": record
                }
                validate_pending_task(pending_task)
                await batch_sender.send_message(
                    message={
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps(pending_task)
                    })
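Examples No. 9, 13, 15 and 29 rely on an S3BatchWriter that is not part of this listing. Assuming it simply buffers put() calls and flushes them concurrently once flush_amount is reached, and mirroring the aioboto3-style resource calls used elsewhere in these examples, a sketch could look like this:

import asyncio


class S3BatchWriter:
    # Sketch under assumed semantics: collect put() keyword arguments and
    # flush them concurrently in groups of flush_amount.
    def __init__(self, s3_resource, flush_amount=100):
        self._s3_resource = s3_resource
        self._flush_amount = flush_amount
        self._pending = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        if exc_type is None:
            await self._flush()
        return False

    async def put(self, **kwargs):
        self._pending.append(kwargs)
        if len(self._pending) >= self._flush_amount:
            await self._flush()

    async def _flush(self):
        pending, self._pending = self._pending, []
        await asyncio.gather(*[self._put_object(item) for item in pending])

    async def _put_object(self, item):
        # Same aioboto3 resource style as Example No. 22 in this listing.
        s3_object = await self._s3_resource.Object(item.pop("Bucket"),
                                                   item.pop("Key"))
        await s3_object.put(**item)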
Example No. 10
async def delete_batch_input(bucket_name, object_key, s3_resource):
    if bucket_name != INPUT_BUCKET:
        raise ValueError("Expected bucket {}, but was {}.".format(
            INPUT_BUCKET, bucket_name))
    async with trace("Deleting {}/{} from s3", bucket_name, object_key):
        s3_object = await s3_resource.Object(bucket_name, object_key)
        await s3_object.delete()
Example No. 11
async def read_pending_chunk(s3_bucket, s3_object_key, s3_resource):
    async with trace("Reading pending chunk bucket={}/key={}", s3_bucket, s3_object_key):
        s3_object = await s3_resource.Object(s3_bucket, s3_object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
            json_doc = json.loads(data)
            return json_doc
Example No. 12
def send_batch_complete_message(batch_id):
    with trace("Sending complete message for {}", batch_id):
        queue_url = sqs_client.get_queue_url(
            QueueName=GATHER_QUEUE)["QueueUrl"]
        sqs_client.send_message(QueueUrl=queue_url,
                                MessageGroupId=batch_id,
                                MessageDeduplicationId=batch_id,
                                MessageBody=json.dumps({"batchId": batch_id}))
Example No. 13
async def write_pending_task(batch_id, index, pending_task, batch_writer):
    object_key = "{}/pending/{}.json".format(batch_id, index)
    async with trace("Write pending task {}/{} to s3", WORK_BUCKET,
                     object_key):
        await batch_writer.put(Bucket=WORK_BUCKET,
                               Key=object_key,
                               ACL='private',
                               Body=json.dumps(pending_task))
Example No. 14
async def send_batch_complete_message(batch_id, sqs_client):
    async with trace("Sending complete message for {}", batch_id):
        response = await sqs_client.get_queue_url(QueueName=GATHER_QUEUE)
        queue_url = response["QueueUrl"]
        await sqs_client.send_message(QueueUrl=queue_url,
                                      MessageGroupId=batch_id,
                                      MessageDeduplicationId=batch_id,
                                      MessageBody=json.dumps({"batchId": batch_id}))
Example No. 15
async def __write_chunks(batch_id, records, s3_resource, sqs_client):
    async with trace("Writing chunks for batch {}", batch_id):
        async with S3BatchWriter(s3_resource=s3_resource,
                                 flush_amount=CHUNK_SIZE) as batch_writer:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                chunk = {
                    "batchId": batch_id,
                    "index": chunk_index,
                    "records": [{
                        "request": record,
                        "index": chunk_index * CHUNK_SIZE + record_index
                    } for record_index, record in enumerate(chunk)]
                }
                validate_pending_chunk_of_tasks(chunk)
                await work_bucket.write_pending_chunk(batch_id, chunk_index,
                                                      chunk, batch_writer)
Example No. 16
async def read_chunk_result(batch_id, index, s3_resource):
    async with trace("Reading chunk result batch_id={}/index={}", batch_id, index):
        object_key = "{}/done/{}.done.json".format(batch_id, index)
        s3_object = await s3_resource.Object(WORK_BUCKET, object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
            json_doc = json.loads(data)
            return json_doc
Example No. 17
async def read_batch_status(batch_id, s3_resource):
    async with trace("Reading status for batch_id={}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_object = await s3_resource.Object(WORK_BUCKET, object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
            json_doc = json.loads(data)
            return json_doc
Example No. 18
async def exist_pending_batch_tasks(batch_id, dynamodb_resource):
    async with trace("Checking if batch batch_id={} is complete", batch_id):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        response = await table.query(
            IndexName=_PENDING_INDEX_NAME,
            Limit=1,
            ConsistentRead=True,
            KeyConditionExpression=Key("batchId").eq(batch_id))
        return len(response.get("Items", [])) > 0
Example No. 19
def write_batch_status(batch_id, record_count):
    with trace("Writing status for {}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_resource.Object(WORK_BUCKET, object_key).put(ACL='private', Body=json.dumps({
            "variant": "s3-sqs-lambda-sync",
            "batchId": batch_id,
            "taskCount": record_count,
            "startTime": now()
        }))
Example No. 20
def read_batch_input(bucket_name, object_key) -> dict:
    with trace("Reading input {}/{} from s3", bucket_name, object_key):
        if bucket_name != INPUT_BUCKET:
            raise ValueError("Expected bucket {}, but was {}.".format(
                INPUT_BUCKET, bucket_name))

        s3_object = s3_resource.Object(bucket_name, object_key)
        json_data = s3_object.get()['Body'].read()
        json_doc = json.loads(json_data)
        return json_doc
Example No. 21
async def read_batch_input(bucket_name, object_key, s3_resource) -> dict:
    async with trace("Reading input {}/{} from s3", bucket_name, object_key):
        if bucket_name != INPUT_BUCKET:
            raise ValueError("Expected bucket {}, but was {}.".format(INPUT_BUCKET, bucket_name))

        s3_object = await s3_resource.Object(bucket_name, object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
            json_doc = json.loads(data)
            return json_doc
Example No. 22
async def write_batch_status(batch_id, record_count, chunk_size, s3_resource):
    async with trace("Writing status for {}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_object = await s3_resource.Object(WORK_BUCKET, object_key)
        await s3_object.put(ACL='private', Body=json.dumps({
            "variant": "s3-notification-sqs-lambda",
            "batchId": batch_id,
            "chunkSize": chunk_size,
            "taskCount": record_count,
            "startTime": now()
        }))
Example No. 23
async def get_batch_task(batch_id, index, dynamodb_resource):
    async with trace("Get pending batch task for batch_id={},index={}",
                     batch_id, index):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        response = await table.get_item(Key={
            "batchId": batch_id,
            "index": index
        })
        if not "Item" in response:
            raise ValueError(
                "Batch task batchId={}/index={} not found.".format(
                    batch_id, index))
        return dynamodb.clean(response["Item"])
Example No. 24
async def put_processed_batch_task(batch_id, index, request, response,
                                   dynamodb_resource):
    async with trace("Put processed batch task for batch_id={},index={}",
                     batch_id, index):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        await table.put_item(
            Item={
                "batchId": batch_id,
                "index": index,
                "request": request,
                "response": response
                # intentionally no "isPending" attribute, so the task drops out of the pending index
            })
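The pending counterpart used in Example No. 1, put_pending_batch_task, is not included in this listing. The comment above and the pending-index query in Example No. 18 suggest a sparse index keyed on an attribute such as isPending; under that assumption (the attribute name and batch-writer interface are guesses), the write might look like:

async def put_pending_batch_task(pending_task, batch_writer):
    # Hypothetical sketch: pending tasks carry an "isPending" marker so they
    # appear in the sparse pending index; processed tasks (above) omit it.
    await batch_writer.put_item(
        Item={
            "batchId": pending_task["batchId"],
            "index": pending_task["index"],
            "request": pending_task["request"],
            "isPending": "true"
        })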
Example No. 25
async def __process(message, s3_resource, batch_writer):
    async with trace("Processing {}", json.dumps(message)):
        batch_id = message["batchId"]
        index = message["index"]
        chunk = await work_bucket.read_pending_chunk(batch_id, index, s3_resource)
        for record in chunk["records"]:
            request = record["request"]
            item_no = request["itemNo"]
            # write an item for every record in the chunk, not only the last one
            await items_table.put_item({"itemNo": str(item_no),
                                        "updateTimestamp": now_epoch_millis()},
                                       batch_writer)
            record["response"] = {"success": True, "message": "Ok"}
        await work_bucket.write_chunk_result(batch_id, index, chunk, s3_resource)
        await work_bucket.delete_pending_chunk(batch_id, index, s3_resource)
Example No. 26
def handle_event(event, lambda_context):
    logger.info("Event: {}".format(json.dumps(event, indent=2)))
    s3_object = __get_s3_object_from(event)
    if s3_object is None:
        return
    batch_id = __extract_batch_id(s3_object[1])
    record_batch_started(batch_id)
    with trace("Scattering {}", batch_id):
        batch_doc = input_bucket.read_batch_input(s3_object[0], s3_object[1])
        validate_input(batch_doc)
        records = batch_doc.get("records", [])
        work_bucket.write_batch_status(batch_id, len(records))
        __write_tasks_and_send_messages(batch_id, records)

    input_bucket.delete_batch_input(s3_object[0], s3_object[1])
    record_scatter_finished(batch_id, len(records))
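Both handle_event variants (this one and Example No. 30) call __get_s3_object_from and __extract_batch_id, which are not shown. A plausible sketch, assuming the Lambda receives standard S3 notification events and that the batch id is the input object's file name without its extension:

import os
import urllib.parse


def __get_s3_object_from(event):
    # Return (bucket_name, object_key) from an S3 notification event, or None
    # for the s3:TestEvent that S3 sends when the notification is configured.
    records = event.get("Records", [])
    if not records or "s3" not in records[0]:
        return None
    s3 = records[0]["s3"]
    return (s3["bucket"]["name"],
            urllib.parse.unquote_plus(s3["object"]["key"]))


def __extract_batch_id(object_key):
    # Assumed convention: "<prefix>/<batch-id>.json" -> "<batch-id>".
    return os.path.splitext(os.path.basename(object_key))[0]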
Example No. 27
def __write_tasks_and_send_messages(batch_id, records):
    with trace("Writing/sending {} tasks for batch {}", len(records),
               batch_id):
        with process_queue.new_batch_sender() as batch_sender:
            for index, record in enumerate(records, start=0):
                work_bucket.write_pending_task(batch_id, index, record)
                batch_sender.send_message(
                    message={
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps({
                            "batchId": batch_id,
                            "index": index,
                            "request": record
                        })
                    })
Example No. 28
async def __gather(record, s3_resource):
    batch_id = record["batchId"]
    record_gather_started(batch_id)
    async with trace("Gathering results for batch batch_id={}", batch_id):
        status = await work_bucket.read_batch_status(batch_id, s3_resource)

        results = await __read_task_results(batch_id, status["taskCount"],
                                            s3_resource)

        status["endTime"] = now()
        status["results"] = results
        batch_output = {"records": results}
        await output_bucket.write_batch_output(batch_id, batch_output,
                                               s3_resource)

    # await work_bucket.delete_batch_status(batch_id)
    record_batch_finished(batch_id)
Example No. 29
async def __write_chunks_and_send_messages(batch_id, records, s3_resource,
                                           sqs_client):
    async with trace("Writing/sending chunks for batch {}", batch_id):
        async with S3BatchWriter(s3_resource=s3_resource,
                                 flush_amount=CHUNK_SIZE) as batch_writer:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                chunk = {
                    "batchId": batch_id,
                    "index": chunk_index,
                    "records": [{
                        "request": record,
                        "index": chunk_index * CHUNK_SIZE + record_index
                    } for record_index, record in enumerate(chunk)]
                }
                validate_pending_chunk_of_tasks(chunk)
                await work_bucket.write_pending_chunk(batch_id, chunk_index,
                                                      chunk, batch_writer)

        async with process_queue.new_batch_sender(sqs_client) as batch_sender:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                chunk = {
                    "batchId": batch_id,
                    "index": chunk_index,
                    "records": [{
                        "request": record,
                        "index": chunk_index * CHUNK_SIZE + record_index
                    } for record_index, record in enumerate(chunk)]
                }
                validate_pending_chunk_of_tasks(chunk)
                await batch_sender.send_message(
                    message={
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps({
                            "batchId": batch_id,
                            "index": chunk_index
                        })
                    })
Example No. 30
async def handle_event(event, lambda_context):
    logger.info("Event: {}".format(json.dumps(event, indent=2)))

    s3_object = __get_s3_object_from(event)
    if s3_object is None:
        logger.info("Is s3 test event. Skipping.")
        return

    batch_id = __extract_batch_id(s3_object[1])
    async with trace("Scattering {}", batch_id):
        async with aioaws.resource("s3") as s3_resource, aioaws.client("sqs") as sqs_client:
            batch_doc = await input_bucket.read_batch_input(s3_object[0], s3_object[1], s3_resource)
            validate_input(batch_doc)
            records = batch_doc.get("records", [])
            record_batch_started(batch_id)
            await work_bucket.write_batch_status(batch_id, len(records), CHUNK_SIZE, s3_resource)
            await __write_chunks(batch_id, records, s3_resource, sqs_client)
            await input_bucket.delete_batch_input(s3_object[0], s3_object[1], s3_resource)
    record_scatter_finished(batch_id, len(records))
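Example No. 30 obtains its clients through an aioaws module that is not part of this listing. Assuming it is a thin wrapper around aioboto3 (the module name and exact wiring are guesses), it could be as small as:

import aioboto3

_session = aioboto3.Session()


def resource(service_name, **kwargs):
    # aioboto3 resource factories are async context managers, which matches
    # the "async with aioaws.resource(...)" usage above.
    return _session.resource(service_name, **kwargs)


def client(service_name, **kwargs):
    return _session.client(service_name, **kwargs)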