def __init__(self, request_id: str):
    Logging.set_correlation_id(logger, value=request_id)

    self.request_id = request_id
    self.request_tracker = RequestTracker(request_id)
    self.dynamo_handler = DynamoHandler()
    self.sqs_handler = SQSHandler()
    self.infra_config = MatrixInfraConfig()
    self.redshift_config = MatrixRedshiftConfig()

    # Queries are staged in the query bucket; results land in the results bucket.
    self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
    self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
def __init__(self, args):
    self.args = args
    self.format = args.format
    self.request_tracker = RequestTracker(args.request_id)
    self.query_results = {}

    # The local filename is the final path component of the S3 target path.
    self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
    self.target_path = args.target_path
    self.working_dir = args.working_dir
    self.FS = s3fs.S3FileSystem()

    Logging.set_correlation_id(LOGGER, value=args.request_id)
def run(self, max_loops=None):
    loops = 0
    while max_loops is None or loops < max_loops:
        loops += 1
        messages = self.sqs_handler.receive_messages_from_queue(self.query_job_q_url)
        if messages:
            message = messages[0]
            logger.info(f"Received {message} from {self.query_job_q_url}")
            payload = json.loads(message['Body'])
            request_id = payload['request_id']
            request_tracker = RequestTracker(request_id)
            Logging.set_correlation_id(logger, value=request_id)
            obj_key = payload['s3_obj_key']
            receipt_handle = message['ReceiptHandle']
            try:
                logger.info(f"Fetching query from {obj_key}")
                query = self.s3_handler.load_content_from_obj_key(obj_key)

                logger.info(f"Running query from {obj_key}")
                self.redshift_handler.transaction([query], read_only=True)
                logger.info(f"Finished running query from {obj_key}")

                logger.info(f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(self.query_job_q_url, receipt_handle)

                logger.info("Incrementing completed queries in state table")
                request_tracker.complete_subtask_execution(Subtask.QUERY)

                # Once all per-request queries have completed, kick off the
                # batch job that converts the results to the requested format.
                if request_tracker.is_request_ready_for_conversion():
                    logger.info("Scheduling batch conversion job")
                    batch_job_id = self.batch_handler.schedule_matrix_conversion(request_id,
                                                                                 request_tracker.format)
                    request_tracker.write_batch_job_id_to_db(batch_job_id)
            except Exception as e:
                logger.info(f"QueryRunner failed on {message} with error {e}")
                request_tracker.log_error(str(e))

                # Route the failed message to the dead-letter queue and remove
                # it from the work queue so it is not retried indefinitely.
                logger.info(f"Adding {message} to {self.query_job_deadletter_q_url}")
                self.sqs_handler.add_message_to_queue(self.query_job_deadletter_q_url, payload)

                logger.info(f"Deleting {message} from {self.query_job_q_url}")
                self.sqs_handler.delete_message_from_queue(self.query_job_q_url, receipt_handle)
        else:
            logger.info(f"No messages to read from {self.query_job_q_url}")
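# --- Usage sketch (assumption, not in the original source) -------------------
# The log text above names the enclosing class QueryRunner. Assuming its
# constructor takes no required arguments and reads queue URLs from config, a
# worker entrypoint might bound the loop in tests and poll forever in
# production:

def main():
    runner = QueryRunner()       # hypothetical no-arg construction
    runner.run(max_loops=None)   # poll the query job queue until interrupted

if __name__ == "__main__":
    main()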
def __init__(self, request_id: str):
    Logging.set_correlation_id(logger, request_id)

    self.request_id = request_id

    # Cached request attributes, populated on demand.
    self._request_hash = "N/A"
    self._data_version = None
    self._num_bundles = None
    self._format = None
    self._metadata_fields = None
    self._feature = None

    self.dynamo_handler = DynamoHandler()
    self.cloudwatch_handler = CloudwatchHandler()
    self.batch_handler = BatchHandler()
def schedule_matrix_conversion(self, request_id: str, format: str, s3_results_key: str):
    """
    Schedule a matrix conversion job within aws batch infra

    :param request_id: UUID identifying a matrix service request.
    :param format: User requested output file format of final expression matrix.
    :param s3_results_key: S3 key where the matrix results will be written to.
    """
    Logging.set_correlation_id(logger, value=request_id)
    job_name = "-".join(["conversion",
                         self.deployment_stage,
                         request_id,
                         format])

    # Manifests produced by the Redshift queries; inputs to the converter.
    source_expression_manifest = f"s3://{self.s3_query_results_bucket}/{request_id}/expression_manifest"
    source_cell_manifest = f"s3://{self.s3_query_results_bucket}/{request_id}/cell_metadata_manifest"
    source_gene_manifest = f"s3://{self.s3_query_results_bucket}/{request_id}/gene_metadata_manifest"
    target_path = f"s3://{self.s3_results_bucket}/{s3_results_key}"
    working_dir = f"/data/{request_id}"

    command = ['python3', '/matrix_converter.py',
               request_id,
               source_expression_manifest,
               source_cell_manifest,
               source_gene_manifest,
               target_path,
               format,
               working_dir]
    environment = {
        'DEPLOYMENT_STAGE': self.deployment_stage,
        'DYNAMO_DATA_VERSION_TABLE_NAME': DynamoTable.DATA_VERSION_TABLE.value,
        'DYNAMO_DEPLOYMENT_TABLE_NAME': DynamoTable.DEPLOYMENT_TABLE.value,
        'DYNAMO_REQUEST_TABLE_NAME': DynamoTable.REQUEST_TABLE.value
    }

    batch_job_id = self._enqueue_batch_job(job_name=job_name,
                                           job_queue_arn=self.job_queue_arn,
                                           job_def_arn=self.job_def_arn,
                                           command=command,
                                           environment=environment)
    self._cloudwatch_handler.put_metric_data(
        metric_name=MetricName.CONVERSION_REQUEST,
        metric_value=1
    )
    return batch_job_id
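# --- Usage sketch (assumption, not in the original source) -------------------
# Assuming this method lives on the BatchHandler class shown below and that
# DEPLOYMENT_STAGE, AWS_DEFAULT_REGION, and the bucket/ARN variables are set in
# the environment, a caller would schedule a conversion like this; the request
# id and results key are purely illustrative placeholders.

handler = BatchHandler()
batch_job_id = handler.schedule_matrix_conversion(
    request_id="2b8a1f0c-hypothetical-id",                   # hypothetical request UUID
    format="loom",                                           # a MatrixFormat value
    s3_results_key="2b8a1f0c-hypothetical-id/matrix.loom")   # hypothetical results key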
import os

import boto3
from tenacity import retry, stop_after_attempt, wait_fixed

from matrix.common.aws.dynamo_handler import DynamoTable
from matrix.common.aws.cloudwatch_handler import CloudwatchHandler, MetricName
from matrix.common.constants import MatrixFormat
from matrix.common.logging import Logging

logger = Logging.get_logger(__name__)


class BatchHandler:
    def __init__(self):
        self.deployment_stage = os.environ['DEPLOYMENT_STAGE']
        self.s3_results_bucket = os.environ.get('MATRIX_RESULTS_BUCKET')
        self.s3_query_results_bucket = os.environ.get('MATRIX_QUERY_RESULTS_BUCKET')
        self.job_queue_arn = os.environ.get('BATCH_CONVERTER_JOB_QUEUE_ARN')
        self.job_def_arn = os.environ.get('BATCH_CONVERTER_JOB_DEFINITION_ARN')
        self._cloudwatch_handler = CloudwatchHandler()
        self._client = boto3.client("batch", region_name=os.environ['AWS_DEFAULT_REGION'])

    # Retry transient AWS Batch submission failures up to 5 times, 2 seconds apart.
    @retry(reraise=True, wait=wait_fixed(2), stop=stop_after_attempt(5))
    def schedule_matrix_conversion(self, request_id: str, format: str):
        """
        Schedule a matrix conversion job within aws batch infra

        :param request_id: UUID identifying a matrix service request.
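# --- Retry-policy demonstration (illustrative, not in the original source) ---
# The @retry decorator above re-invokes schedule_matrix_conversion on any
# exception, waiting 2 seconds between attempts, giving up after 5 attempts,
# and (because reraise=True) re-raising the original exception rather than a
# tenacity RetryError. The same policy in isolation:

from tenacity import retry, stop_after_attempt, wait_fixed

@retry(reraise=True, wait=wait_fixed(2), stop=stop_after_attempt(5))
def flaky_call():
    raise RuntimeError("simulated transient AWS failure")

# flaky_call() raises RuntimeError after the fifth attempt (~8s of waiting).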
import os
import shutil
import sys
import zipfile

import loompy
import pandas
import s3fs

from matrix.common import constants
from matrix.common import date
from matrix.common.logging import Logging
from matrix.common.constants import MatrixFormat
from matrix.common.request.request_tracker import RequestTracker, Subtask

LOGGER = Logging.get_logger(__file__)
SUPPORTED_FORMATS = [item.value for item in MatrixFormat]


class MatrixConverter:

    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.expression_manifest = None
        self.cell_manifest = None
        self.gene_manifest = None

        self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
        self.target_path = args.target_path
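# --- Illustrative helper (hypothetical, not in the original source) ----------
# SUPPORTED_FORMATS above enumerates the MatrixFormat enum values; a converter
# entrypoint would typically fail fast on an unsupported request, e.g.:

def validate_requested_format(requested: str):
    # Hypothetical helper: reject formats outside the MatrixFormat enum.
    if requested not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unsupported format {requested!r}; expected one of {SUPPORTED_FORMATS}")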
import argparse
import os
import sys

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from matrix.common.exceptions import MatrixException
from matrix.common.aws.dynamo_handler import DynamoHandler, DynamoTable, DeploymentTableField
from matrix.common.logging import Logging

logger = Logging.get_logger(__file__)


def set_data_version(version):
    """
    Set a deployment's current data version in the Deployment table in DynamoDB.
    If the desired version does not exist in the Data Version table, the request will fail.
    """
    dynamo_handler = DynamoHandler()
    deployment_stage = os.environ['DEPLOYMENT_STAGE']
    try:
        # Verify the version exists before pointing the deployment at it.
        dynamo_handler.get_table_item(table=DynamoTable.DATA_VERSION_TABLE, key=version)
        dynamo_handler.set_table_field_with_value(table=DynamoTable.DEPLOYMENT_TABLE,
                                                  key=deployment_stage,
                                                  field_enum=DeploymentTableField.CURRENT_DATA_VERSION,
                                                  field_value=version)
    except MatrixException:
        # Assumed completion of the truncated snippet: surface a lookup failure
        # for a version that is not present in the Data Version table.
        logger.error(f"Data version {version} does not exist in {DynamoTable.DATA_VERSION_TABLE.value}.")
        raise
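# --- CLI sketch (assumption, not in the original source) ---------------------
# The snippet imports argparse but its entrypoint is not shown; a minimal
# command-line wrapper consistent with set_data_version would be:

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Set this deployment's current data version.")
    parser.add_argument("version",
                        help="A version present in the Data Version table")
    args = parser.parse_args()
    set_data_version(args.version)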