async def __write_chunks_and_send_messages(batch_id, records, dynamodb_resource, sqs_client):
    async with trace("Writing/sending chunks for batch {}", batch_id):
        async with await batch_tasks_table.new_batch_writer(dynamodb_resource) as batch_writer:
            for index, record in enumerate(records):
                pending_task = {
                    "batchId": batch_id,
                    "index": index,
                    "request": record
                }
                validate_pending_task(pending_task)
                await batch_tasks_table.put_pending_batch_task(pending_task, batch_writer)
        async with process_queue.new_batch_sender(sqs_client) as batch_sender:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                async with trace("Sending chunk {} of tasks for batch_id={}", chunk_index, batch_id):
                    chunk = {
                        "batchId": batch_id,
                        "index": chunk_index,
                        "records": [{
                            "index": chunk_index * CHUNK_SIZE + record_index
                        } for record_index, record in enumerate(chunk)]
                    }
                    await batch_sender.send_message(
                        message={
                            "Id": str(uuid4()),
                            "MessageBody": json.dumps(chunk)
                        })
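# The enumchunks() helper used above (and in several functions below) is not shown
# in this listing. The following is a minimal sketch of what the call sites imply:
# it yields (chunk_index, chunk) pairs, splitting a sequence into chunks of at most
# `size` items. Name and signature are taken from usage; the body is an assumption,
# not the original implementation.
def enumchunks(items, size):
    items = list(items)
    for chunk_index, start in enumerate(range(0, len(items), size)):
        yield chunk_index, items[start:start + size]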
async def __process(record, s3_resource, batch_writer):
    async with trace("Processing {}", json.dumps(record)):
        validate_pending_task(record)
        index = record["index"]
        batch_id = record["batchId"]
        request = record["request"]
        item_no = request["itemNo"]
        await items_table.put_item(
            {
                "itemNo": str(item_no),
                "updateTimestamp": now_epoch_millis()
            }, batch_writer)
        processed_task = {
            "batchId": batch_id,
            "index": index,
            "request": request,
            "response": {
                "success": True,
                "message": "Ok"
            }
        }
        validate_processed_task(processed_task)
        await work_bucket.write_task_result(batch_id, index, processed_task, s3_resource)
        await work_bucket.delete_pending_task(batch_id, index, s3_resource)
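# trace() is used throughout as both a sync and an async context manager but is not
# defined in this listing. Below is a minimal sketch under the assumption that it
# only logs the formatted message and the elapsed time; the real helper may also
# emit metrics or structured trace records.
import logging
import time

logger = logging.getLogger(__name__)

class trace:
    """Sketch of a context manager usable with both `with` and `async with`."""

    def __init__(self, message, *args):
        self.message = message.format(*args)

    def __enter__(self):
        self.started = time.perf_counter()
        logger.info(self.message)
        return self

    def __exit__(self, exc_type, exc, tb):
        elapsed_ms = (time.perf_counter() - self.started) * 1000
        logger.info("%s took %.1f ms", self.message, elapsed_ms)

    async def __aenter__(self):
        return self.__enter__()

    async def __aexit__(self, exc_type, exc, tb):
        return self.__exit__(exc_type, exc, tb)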
async def put_batch_status(batch_id, record_count, dynamodb_resource):
    async with trace("Put batch status for batch_id={}", batch_id):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        await table.put_item(Item={
            "batchId": batch_id,
            "taskCount": record_count
        })
async def __gather(record, s3_resource, s3_client):
    batch_id = record["batchId"]
    record_gather_started(batch_id)
    async with trace("Gathering results for batch batch_id={}", batch_id):
        status = await work_bucket.read_batch_status(batch_id, s3_resource)
        chunk_count = ceil(status["taskCount"] / status["chunkSize"])
        bufstream = BytesBufferIO()
        jsonstream = JsonStream(
            fp=io.TextIOWrapper(gzip.GzipFile(fileobj=bufstream, mode='wb'),
                                write_through=True,
                                encoding='utf-8'))
        jsonstream.start_object()
        for key, value in status.items():
            jsonstream.write_property(key, value)
        jsonstream.start_property("records")
        jsonstream.start_array()
        for _, chunks in enumchunks(range(chunk_count), 10):
            chunk_results = await asyncio.gather(*[
                work_bucket.read_chunk_result(batch_id, index, s3_resource)
                for index in chunks
            ])
            for chunk_result in chunk_results:
                for record in chunk_result["records"]:
                    jsonstream.write_value(record)
        jsonstream.end_array()
        jsonstream.end_object()
        jsonstream.close()
        json_bytes = bufstream.getvalue()
        await output_bucket.upload_batch_output(
            batch_id,
            gzip.GzipFile(fileobj=io.BytesIO(json_bytes), mode='rb'),
            s3_client)
        # await work_bucket.delete_batch_status(batch_id)
        record_batch_finished(batch_id)
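# BytesBufferIO and JsonStream are helpers that __gather() relies on but that are
# not defined in this listing. The sketches below show the minimum behaviour the
# call sites imply: a BytesIO variant whose contents stay readable via getvalue()
# even if the wrapping streams close it, and an incremental JSON writer that emits
# one value at a time so the full result set never has to be held in memory.
# Method names come from usage; the bodies are assumptions.
import io
import json

class BytesBufferIO(io.BytesIO):
    def close(self):
        # Snapshot the buffer before closing so getvalue() keeps working afterwards.
        self._final = self.getvalue()
        super().close()

    def getvalue(self):
        return self._final if self.closed else super().getvalue()

class JsonStream:
    def __init__(self, fp):
        self.fp = fp
        self._needs_comma = False

    def _sep(self):
        if self._needs_comma:
            self.fp.write(",")
        self._needs_comma = True

    def start_object(self):
        self._sep(); self.fp.write("{"); self._needs_comma = False

    def end_object(self):
        self.fp.write("}"); self._needs_comma = True

    def start_array(self):
        self.fp.write("["); self._needs_comma = False

    def end_array(self):
        self.fp.write("]"); self._needs_comma = True

    def start_property(self, name):
        self._sep(); self.fp.write(json.dumps(name) + ":"); self._needs_comma = False

    def write_property(self, name, value):
        self._sep(); self.fp.write(json.dumps(name) + ":" + json.dumps(value))

    def write_value(self, value):
        self._sep(); self.fp.write(json.dumps(value))

    def close(self):
        self.fp.close()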
async def exists_pending_chunk(batch_id, s3_resource):
    async with trace("Checking if batch batch_id={} is complete", batch_id):
        s3_bucket = await s3_resource.Bucket(name=WORK_BUCKET)
        async for _ in s3_bucket.objects.filter(Prefix="{}/pending/".format(batch_id), MaxKeys=1):
            return True
        return False
def write_pending_task(batch_id, index, request):
    object_key = "{}/pending/{}.json".format(batch_id, index)
    with trace("Write pending task {}/{} to s3", WORK_BUCKET, object_key):
        s3_resource.Object(WORK_BUCKET, object_key) \
            .put(ACL='private',
                 Body=json.dumps({"batchId": batch_id, "index": index, "request": request}))
def read_task_result(batch_id, index):
    object_key = "{}/done/{}.json".format(batch_id, index)
    with trace("Reading task result {}/{} from s3", WORK_BUCKET, object_key):
        s3_object = s3_resource.Object(WORK_BUCKET, object_key)
        data = s3_object.get()['Body'].read()
        json_doc = json.loads(data)
        return json_doc
def read_batch_status(batch_id):
    with trace("Reading status for batch_id={}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_object = s3_resource.Object(WORK_BUCKET, object_key)
        data = s3_object.get()['Body'].read()
        json_doc = json.loads(data)
        return json_doc
async def __write_tasks_and_send_messages(batch_id, records, s3_resource, sqs_client):
    async with trace("Writing/sending {} tasks for batch {}", len(records), batch_id):
        async with S3BatchWriter(s3_resource=s3_resource, flush_amount=100) as batch_writer:
            for index, record in enumerate(records, start=0):
                pending_task = {
                    "batchId": batch_id,
                    "index": index,
                    "request": record
                }
                validate_pending_task(pending_task)
                await work_bucket.write_pending_task(batch_id, index, pending_task, batch_writer)
        async with process_queue.new_batch_sender(sqs_client) as batch_sender:
            for index, record in enumerate(records, start=0):
                pending_task = {
                    "batchId": batch_id,
                    "index": index,
                    "request": record
                }
                validate_pending_task(pending_task)
                await batch_sender.send_message(
                    message={
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps(pending_task)
                    })
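# S3BatchWriter is not part of boto3/aioboto3; it is a project helper that does not
# appear in this listing. The sketch below captures only what the call sites imply:
# put() calls are buffered and flushed as concurrent uploads every `flush_amount`
# objects, with a final flush on exit. This is an assumption about the
# implementation, not the original code.
import asyncio

class S3BatchWriter:
    def __init__(self, s3_resource, flush_amount=100):
        self.s3_resource = s3_resource
        self.flush_amount = flush_amount
        self._pending = []

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc, tb):
        if exc_type is None:
            await self._flush()

    async def put(self, Bucket, Key, **kwargs):
        self._pending.append((Bucket, Key, kwargs))
        if len(self._pending) >= self.flush_amount:
            await self._flush()

    async def _flush(self):
        pending, self._pending = self._pending, []
        await asyncio.gather(*[
            self._put_one(bucket, key, kwargs) for bucket, key, kwargs in pending
        ])

    async def _put_one(self, bucket, key, kwargs):
        s3_object = await self.s3_resource.Object(bucket, key)
        await s3_object.put(**kwargs)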
async def delete_batch_input(bucket_name, object_key, s3_resource):
    if bucket_name != INPUT_BUCKET:
        raise ValueError("Expected bucket {}, but was {}.".format(INPUT_BUCKET, bucket_name))
    async with trace("Deleting {}/{} from s3", bucket_name, object_key):
        s3_object = await s3_resource.Object(bucket_name, object_key)
        await s3_object.delete()
async def read_pending_chunk(s3_bucket, s3_object_key, s3_resource):
    async with trace("Reading pending chunk bucket={}/key={}", s3_bucket, s3_object_key):
        s3_object = await s3_resource.Object(s3_bucket, s3_object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
        json_doc = json.loads(data)
        return json_doc
def send_batch_complete_message(batch_id):
    with trace("Sending complete message for {}", batch_id):
        queue_url = sqs_client.get_queue_url(QueueName=GATHER_QUEUE)["QueueUrl"]
        sqs_client.send_message(QueueUrl=queue_url,
                                MessageGroupId=batch_id,
                                MessageDeduplicationId=batch_id,
                                MessageBody=json.dumps({"batchId": batch_id}))
async def write_pending_task(batch_id, index, pending_task, batch_writer):
    object_key = "{}/pending/{}.json".format(batch_id, index)
    async with trace("Write pending task {}/{} to s3", WORK_BUCKET, object_key):
        await batch_writer.put(Bucket=WORK_BUCKET,
                               Key=object_key,
                               ACL='private',
                               Body=json.dumps(pending_task))
async def send_batch_complete_message(batch_id, sqs_client):
    async with trace("Sending complete message for {}", batch_id):
        response = await sqs_client.get_queue_url(QueueName=GATHER_QUEUE)
        queue_url = response["QueueUrl"]
        await sqs_client.send_message(QueueUrl=queue_url,
                                      MessageGroupId=batch_id,
                                      MessageDeduplicationId=batch_id,
                                      MessageBody=json.dumps({"batchId": batch_id}))
async def __write_chunks(batch_id, records, s3_resource, sqs_client):
    async with trace("Writing chunks for batch {}", batch_id):
        async with S3BatchWriter(s3_resource=s3_resource, flush_amount=CHUNK_SIZE) as batch_writer:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                chunk = {
                    "batchId": batch_id,
                    "index": chunk_index,
                    "records": [{
                        "request": record,
                        # global record index within the batch
                        "index": chunk_index * CHUNK_SIZE + record_index
                    } for record_index, record in enumerate(chunk)]
                }
                validate_pending_chunk_of_tasks(chunk)
                await work_bucket.write_pending_chunk(batch_id, chunk_index, chunk, batch_writer)
async def read_chunk_result(batch_id, index, s3_resource):
    async with trace("Reading chunk result batch_id={}/index={}", batch_id, index):
        object_key = "{}/done/{}.done.json".format(batch_id, index)
        s3_object = await s3_resource.Object(WORK_BUCKET, object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
        json_doc = json.loads(data)
        return json_doc
async def read_batch_status(batch_id, s3_resource):
    async with trace("Reading status for batch_id={}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_object = await s3_resource.Object(WORK_BUCKET, object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
        json_doc = json.loads(data)
        return json_doc
async def exist_pending_batch_tasks(batch_id, dynamodb_resource):
    async with trace("Checking if batch batch_id={} is complete", batch_id):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        response = await table.query(
            IndexName=_PENDING_INDEX_NAME,
            Limit=1,
            ConsistentRead=True,
            KeyConditionExpression=Key("batchId").eq(batch_id))
        return len(response.get("Items", [])) > 0
def write_batch_status(batch_id, record_count):
    with trace("Writing status for {}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_resource.Object(WORK_BUCKET, object_key).put(
            ACL='private',
            Body=json.dumps({
                "variant": "s3-sqs-lambda-sync",
                "batchId": batch_id,
                "taskCount": record_count,
                "startTime": now()
            }))
def read_batch_input(bucket_name, object_key) -> dict:
    with trace("Reading input {}/{} from s3", bucket_name, object_key):
        if bucket_name != INPUT_BUCKET:
            raise ValueError("Expected bucket {}, but was {}.".format(INPUT_BUCKET, bucket_name))
        s3_object = s3_resource.Object(bucket_name, object_key)
        json_data = s3_object.get()['Body'].read()
        json_doc = json.loads(json_data)
        return json_doc
async def read_batch_input(bucket_name, object_key, s3_resource) -> dict:
    async with trace("Reading input {}/{} from s3", bucket_name, object_key):
        if bucket_name != INPUT_BUCKET:
            raise ValueError("Expected bucket {}, but was {}.".format(INPUT_BUCKET, bucket_name))
        s3_object = await s3_resource.Object(bucket_name, object_key)
        response = await s3_object.get()
        async with response["Body"] as stream:
            data = await stream.read()
        json_doc = json.loads(data)
        return json_doc
async def write_batch_status(batch_id, record_count, chunk_size, s3_resource):
    async with trace("Writing status for {}", batch_id):
        object_key = "{}/status.json".format(batch_id)
        s3_object = await s3_resource.Object(WORK_BUCKET, object_key)
        await s3_object.put(ACL='private',
                            Body=json.dumps({
                                "variant": "s3-notification-sqs-lambda",
                                "batchId": batch_id,
                                "chunkSize": chunk_size,
                                "taskCount": record_count,
                                "startTime": now()
                            }))
async def get_batch_task(batch_id, index, dynamodb_resource):
    async with trace("Get pending batch task for batch_id={},index={}", batch_id, index):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        response = await table.get_item(Key={
            "batchId": batch_id,
            "index": index
        })
        if "Item" not in response:
            raise ValueError("Batch task batchId={}/index={} not found.".format(batch_id, index))
        return dynamodb.clean(response["Item"])
async def put_processed_batch_task(batch_id, index, request, response, dynamodb_resource):
    async with trace("Put processed batch task for batch_id={},index={}", batch_id, index):
        table = await dynamodb_resource.Table(_TABLE_NAME)
        await table.put_item(
            Item={
                "batchId": batch_id,
                "index": index,
                "request": request,
                "response": response
                # Deliberately no "isPending" attribute, so the item no longer
                # shows up in the pending index.
            })
async def __process(message, s3_resource, batch_writer):
    async with trace("Processing {}", json.dumps(message)):
        batch_id = message["batchId"]
        index = message["index"]
        chunk = await work_bucket.read_pending_chunk(batch_id, index, s3_resource)
        for record in chunk["records"]:
            request = record["request"]
            item_no = request["itemNo"]
            record["response"] = {"success": True, "message": "Ok"}
            await items_table.put_item(
                {
                    "itemNo": str(item_no),
                    "updateTimestamp": now_epoch_millis()
                }, batch_writer)
        await work_bucket.write_chunk_result(batch_id, index, chunk, s3_resource)
        await work_bucket.delete_pending_chunk(batch_id, index, s3_resource)
def handle_event(event, lambda_context):
    logger.info("Event: {}".format(json.dumps(event, indent=2)))
    s3_object = __get_s3_object_from(event)
    if s3_object is None:
        return
    batch_id = __extract_batch_id(s3_object[1])
    record_batch_started(batch_id)
    with trace("Scattering {}", batch_id):
        batch_doc = input_bucket.read_batch_input(s3_object[0], s3_object[1])
        validate_input(batch_doc)
        records = batch_doc.get("records", [])
        work_bucket.write_batch_status(batch_id, len(records))
        __write_tasks_and_send_messages(batch_id, records)
        input_bucket.delete_batch_input(s3_object[0], s3_object[1])
        record_scatter_finished(batch_id, len(records))
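# __get_s3_object_from() and __extract_batch_id() are private helpers that do not
# appear in this listing. The sketch below shows one plausible reading of the call
# sites: the first returns a (bucket, key) tuple from an S3 notification event, or
# None when the event carries no S3 record (e.g. the s3:TestEvent ping), and the
# second derives the batch id from the object key, here assumed to be the file
# name without its extension. Both bodies are assumptions.
import os
from urllib.parse import unquote_plus

def __get_s3_object_from(event):
    records = event.get("Records", [])
    if not records or "s3" not in records[0]:
        return None
    s3 = records[0]["s3"]
    return s3["bucket"]["name"], unquote_plus(s3["object"]["key"])

def __extract_batch_id(object_key):
    return os.path.splitext(os.path.basename(object_key))[0]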
def __write_tasks_and_send_messages(batch_id, records):
    with trace("Writing/sending {} tasks for batch {}", len(records), batch_id):
        with process_queue.new_batch_sender() as batch_sender:
            for index, record in enumerate(records, start=0):
                work_bucket.write_pending_task(batch_id, index, record)
                batch_sender.send_message(
                    message={
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps({
                            "batchId": batch_id,
                            "index": index,
                            "request": record
                        })
                    })
async def __gather(record, s3_resource):
    batch_id = record["batchId"]
    record_gather_started(batch_id)
    async with trace("Gathering results for batch batch_id={}", batch_id):
        status = await work_bucket.read_batch_status(batch_id, s3_resource)
        results = await __read_task_results(batch_id, status["taskCount"], s3_resource)
        status["endTime"] = now()
        status["results"] = results
        batch_output = {"records": results}
        await output_bucket.write_batch_output(batch_id, batch_output, s3_resource)
        # await work_bucket.delete_batch_status(batch_id)
        record_batch_finished(batch_id)
async def __write_chunks_and_send_messages(batch_id, records, s3_resource, sqs_client):
    async with trace("Writing/sending chunks for batch {}", batch_id):
        async with S3BatchWriter(s3_resource=s3_resource, flush_amount=CHUNK_SIZE) as batch_writer:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                chunk = {
                    "batchId": batch_id,
                    "index": chunk_index,
                    "records": [{
                        "request": record,
                        # global record index within the batch
                        "index": chunk_index * CHUNK_SIZE + record_index
                    } for record_index, record in enumerate(chunk)]
                }
                validate_pending_chunk_of_tasks(chunk)
                await work_bucket.write_pending_chunk(batch_id, chunk_index, chunk, batch_writer)
        async with process_queue.new_batch_sender(sqs_client) as batch_sender:
            for chunk_index, chunk in enumchunks(records, CHUNK_SIZE):
                chunk = {
                    "batchId": batch_id,
                    "index": chunk_index,
                    "records": [{
                        "request": record,
                        "index": chunk_index * CHUNK_SIZE + record_index
                    } for record_index, record in enumerate(chunk)]
                }
                validate_pending_chunk_of_tasks(chunk)
                await batch_sender.send_message(
                    message={
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps({
                            "batchId": batch_id,
                            "index": chunk_index
                        })
                    })
async def handle_event(event, lambda_context):
    logger.info("Event: {}".format(json.dumps(event, indent=2)))
    s3_object = __get_s3_object_from(event)
    if s3_object is None:
        logger.info("Is s3 test event. Skipping.")
        return
    batch_id = __extract_batch_id(s3_object[1])
    async with trace("Scattering {}", batch_id):
        async with aioaws.resource("s3") as s3_resource, aioaws.client("sqs") as sqs_client:
            batch_doc = await input_bucket.read_batch_input(s3_object[0], s3_object[1], s3_resource)
            validate_input(batch_doc)
            records = batch_doc.get("records", [])
            record_batch_started(batch_id)
            await work_bucket.write_batch_status(batch_id, len(records), CHUNK_SIZE, s3_resource)
            await __write_chunks(batch_id, records, s3_resource, sqs_client)
            await input_bucket.delete_batch_input(s3_object[0], s3_object[1], s3_resource)
            record_scatter_finished(batch_id, len(records))
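# aioaws is a project-local module, not a public library call; it is assumed here
# to be a thin wrapper around aioboto3, whose resources and clients are themselves
# async context managers. A minimal sketch under that assumption:
import aioboto3

_session = aioboto3.Session()

def resource(service_name, **kwargs):
    # Usage: `async with aioaws.resource("s3") as s3_resource: ...`
    return _session.resource(service_name, **kwargs)

def client(service_name, **kwargs):
    # Usage: `async with aioaws.client("sqs") as sqs_client: ...`
    return _session.client(service_name, **kwargs)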