    def __init__(self, request_id: str):
        Logging.set_correlation_id(logger, request_id)

        self.request_id = request_id
        self._num_bundles = None
        self._format = None

        self.dynamo_handler = DynamoHandler()
        self.cloudwatch_handler = CloudwatchHandler()
        self.batch_handler = BatchHandler()
Example #2
    def __init__(self, request_id: str):
        Logging.set_correlation_id(logger, value=request_id)

        self.request_id = request_id
        self.request_tracker = RequestTracker(request_id)
        self.dynamo_handler = DynamoHandler()
        self.sqs_handler = SQSHandler()
        self.infra_config = MatrixInfraConfig()
        self.redshift_config = MatrixRedshiftConfig()
        self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
Example #3
    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.query_results = {}

        self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)
Example #4
    def run(self, max_loops=None):
        loops = 0
        while max_loops is None or loops < max_loops:
            loops += 1
            messages = self.sqs_handler.receive_messages_from_queue(
                self.query_job_q_url)
            if messages:
                message = messages[0]
                logger.info(f"Received {message} from {self.query_job_q_url}")
                payload = json.loads(message['Body'])
                request_id = payload['request_id']
                request_tracker = RequestTracker(request_id)
                Logging.set_correlation_id(logger, value=request_id)
                obj_key = payload['s3_obj_key']
                receipt_handle = message['ReceiptHandle']
                try:
                    logger.info(f"Fetching query from {obj_key}")
                    query = self.s3_handler.load_content_from_obj_key(obj_key)

                    logger.info(f"Running query from {obj_key}")
                    self.redshift_handler.transaction([query], read_only=True)
                    logger.info(f"Finished running query from {obj_key}")

                    logger.info(
                        f"Deleting {message} from {self.query_job_q_url}")
                    self.sqs_handler.delete_message_from_queue(
                        self.query_job_q_url, receipt_handle)

                    logger.info(
                        "Incrementing completed queries in state table")
                    request_tracker.complete_subtask_execution(Subtask.QUERY)

                    if request_tracker.is_request_ready_for_conversion():
                        logger.info("Scheduling batch conversion job")
                        batch_job_id = self.batch_handler.schedule_matrix_conversion(
                            request_id, request_tracker.format)
                        request_tracker.write_batch_job_id_to_db(batch_job_id)
                except Exception as e:
                    logger.info(
                        f"QueryRunner failed on {message} with error {e}")
                    request_tracker.log_error(str(e))
                    logger.info(
                        f"Adding {message} to {self.query_job_deadletter_q_url}"
                    )
                    self.sqs_handler.add_message_to_queue(
                        self.query_job_deadletter_q_url, payload)
                    logger.info(
                        f"Deleting {message} from {self.query_job_q_url}")
                    self.sqs_handler.delete_message_from_queue(
                        self.query_job_q_url, receipt_handle)
            else:
                logger.info(f"No messages to read from {self.query_job_q_url}")
Example #5

    def __init__(self, request_id: str):
        Logging.set_correlation_id(logger, request_id)

        self.request_id = request_id
        self._request_hash = "N/A"
        self._data_version = None
        self._num_bundles = None
        self._format = None
        self._metadata_fields = None
        self._feature = None

        self.dynamo_handler = DynamoHandler()
        self.cloudwatch_handler = CloudwatchHandler()
        self.batch_handler = BatchHandler()
Example #6
    def schedule_matrix_conversion(self, request_id: str, format: str, s3_results_key: str):
        """
        Schedule a matrix conversion job on AWS Batch infrastructure.

        :param request_id: UUID identifying a matrix service request.
        :param format: User requested output file format of final expression matrix.
        :param s3_results_key: S3 key where the matrix results will be written.
        """
        Logging.set_correlation_id(logger, value=request_id)
        job_name = "-".join(["conversion",
                             self.deployment_stage,
                             request_id,
                             format])

        source_expression_manifest = f"s3://{self.s3_query_results_bucket}/{request_id}/expression_manifest"
        source_cell_manifest = f"s3://{self.s3_query_results_bucket}/{request_id}/cell_metadata_manifest"
        source_gene_manifest = f"s3://{self.s3_query_results_bucket}/{request_id}/gene_metadata_manifest"
        target_path = f"s3://{self.s3_results_bucket}/{s3_results_key}"
        working_dir = f"/data/{request_id}"
        command = ['python3',
                   '/matrix_converter.py',
                   request_id,
                   source_expression_manifest,
                   source_cell_manifest,
                   source_gene_manifest,
                   target_path,
                   format,
                   working_dir]

        environment = {
            'DEPLOYMENT_STAGE': self.deployment_stage,
            'DYNAMO_DATA_VERSION_TABLE_NAME': DynamoTable.DATA_VERSION_TABLE.value,
            'DYNAMO_DEPLOYMENT_TABLE_NAME': DynamoTable.DEPLOYMENT_TABLE.value,
            'DYNAMO_REQUEST_TABLE_NAME': DynamoTable.REQUEST_TABLE.value
        }

        batch_job_id = self._enqueue_batch_job(job_name=job_name,
                                               job_queue_arn=self.job_queue_arn,
                                               job_def_arn=self.job_def_arn,
                                               command=command,
                                               environment=environment)
        self._cloudwatch_handler.put_metric_data(
            metric_name=MetricName.CONVERSION_REQUEST,
            metric_value=1
        )
        return batch_job_id
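A hypothetical call, assuming the environment variables read in BatchHandler.__init__ (DEPLOYMENT_STAGE, MATRIX_RESULTS_BUCKET, and so on) are set; the request id, format, and results key below are illustrative values only:

# Illustrative usage; "loom" is assumed to be a valid MatrixFormat value.
handler = BatchHandler()
job_id = handler.schedule_matrix_conversion(
    request_id="123e4567-e89b-12d3-a456-426614174000",
    format="loom",
    s3_results_key="123e4567-e89b-12d3-a456-426614174000/matrix.loom")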
Example #7
import os

import boto3
from tenacity import retry, stop_after_attempt, wait_fixed

from matrix.common.aws.dynamo_handler import DynamoTable
from matrix.common.aws.cloudwatch_handler import CloudwatchHandler, MetricName
from matrix.common.constants import MatrixFormat
from matrix.common.logging import Logging

logger = Logging.get_logger(__name__)


class BatchHandler:
    def __init__(self):
        self.deployment_stage = os.environ['DEPLOYMENT_STAGE']
        self.s3_results_bucket = os.environ.get('MATRIX_RESULTS_BUCKET')
        self.s3_query_results_bucket = os.environ.get(
            'MATRIX_QUERY_RESULTS_BUCKET')
        self.job_queue_arn = os.environ.get('BATCH_CONVERTER_JOB_QUEUE_ARN')
        self.job_def_arn = os.environ.get('BATCH_CONVERTER_JOB_DEFINITION_ARN')
        self._cloudwatch_handler = CloudwatchHandler()
        self._client = boto3.client(
            "batch", region_name=os.environ['AWS_DEFAULT_REGION'])

    @retry(reraise=True, wait=wait_fixed(2), stop=stop_after_attempt(5))
    def schedule_matrix_conversion(self, request_id: str, format: str):
        """
        Schedule a matrix conversion job on AWS Batch infrastructure.

        :param request_id: UUID identifying a matrix service request.
        :param format: User requested output file format of final expression matrix.
        """
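The tenacity decorator above gives the Batch submission up to five attempts, two seconds apart; reraise=True re-raises the original exception on final failure instead of tenacity's RetryError. The same policy in a standalone illustration:

import random

from tenacity import retry, stop_after_attempt, wait_fixed

# Demo of the retry policy used above: retry on any exception,
# wait 2 seconds between attempts, give up after 5 attempts.
@retry(reraise=True, wait=wait_fixed(2), stop=stop_after_attempt(5))
def flaky_call():
    if random.random() < 0.5:
        raise ConnectionError("transient failure")
    return "ok"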
Example #8
import os
import shutil
import sys
import zipfile

import loompy
import pandas
import s3fs

from matrix.common import constants
from matrix.common import date
from matrix.common.logging import Logging
from matrix.common.constants import MatrixFormat
from matrix.common.request.request_tracker import RequestTracker, Subtask

LOGGER = Logging.get_logger(__file__)
SUPPORTED_FORMATS = [item.value for item in MatrixFormat]


class MatrixConverter:
    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.expression_manifest = None
        self.cell_manifest = None
        self.gene_manifest = None

        self.local_output_filename = os.path.basename(
            os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)
Example #9

import argparse
import os
import sys

pkg_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..",
                                        ".."))  # noqa
sys.path.insert(0, pkg_root)  # noqa

from matrix.common.exceptions import MatrixException
from matrix.common.aws.dynamo_handler import DynamoHandler, DynamoTable, DeploymentTableField
from matrix.common.logging import Logging

logger = Logging.get_logger(__file__)


def set_data_version(version):
    """
    Set a deployment's current data version in the Deployment table in DynamoDb.
    If the desired version does not exist in the Data Version table, the request will fail.
    """
    dynamo_handler = DynamoHandler()
    deployment_stage = os.environ['DEPLOYMENT_STAGE']

    try:
        dynamo_handler.get_table_item(table=DynamoTable.DATA_VERSION_TABLE,
                                      key=version)
        dynamo_handler.set_table_field_with_value(
            table=DynamoTable.DEPLOYMENT_TABLE,
            key=deployment_stage,
            field_enum=DeploymentTableField.CURRENT_DATA_VERSION,
            field_value=version)
    except MatrixException:
        # Assumed completion: MatrixException is imported above and, per the
        # docstring, the request fails when the version is not present in the
        # Data Version table.
        logger.error(f"Data version {version} does not exist in the Data Version table.")
        raise
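argparse is imported at the top of this script but its use is cut off by the snippet; a plausible entrypoint, in which the positional argument name is an assumption:

# Hypothetical CLI wiring for set_data_version; not part of the original snippet.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Set a deployment's current data version.")
    parser.add_argument("version", help="a version present in the Data Version table")
    args = parser.parse_args()
    set_data_version(args.version)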