Example #1
    def __init__(self, request_id: str):
        Logging.set_correlation_id(logger, value=request_id)

        self.request_id = request_id
        self.request_tracker = RequestTracker(request_id)
        self.dynamo_handler = DynamoHandler()
        self.sqs_handler = SQSHandler()
        self.infra_config = MatrixInfraConfig()
        self.redshift_config = MatrixRedshiftConfig()
        self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
Example #2
    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.query_results = {}

        self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)
Example #3
def invalidate_cache_entries(request_ids: list = None,
                             request_hashes: list = None):
    """
    Invalidates a list of request IDs and/or request hashes.
    Invalidation marks the request as invalid in DynamoDB and deletes the
    associated matrix in S3.

    Invalidated requests will return an `ERROR` state and explanation
    to the user via the GET endpoint.

    Request hashes are resolved to a list of associated request IDs.
    :param request_ids: list of request IDs to invalidate
    :param request_hashes: list of request hashes to invalidate
    """
    print(f"Invalidating request IDs: {request_ids}")
    print(f"Invalidating request hashes: {request_hashes}")
    deployment_stage = os.environ['DEPLOYMENT_STAGE']
    dynamo_handler = DynamoHandler()
    data_version = dynamo_handler.get_table_item(
        table=DynamoTable.DEPLOYMENT_TABLE,
        key=deployment_stage)[DeploymentTableField.CURRENT_DATA_VERSION.value]
    for request_hash in request_hashes:
        items = dynamo_handler.filter_table_items(
            table=DynamoTable.REQUEST_TABLE,
            attrs={
                RequestTableField.REQUEST_HASH.value: request_hash,
                RequestTableField.DATA_VERSION.value: data_version
            })
        for item in items:
            request_ids.append(item[RequestTableField.REQUEST_ID.value])

    s3_keys_to_delete = []
    for request_id in request_ids:
        print(f"Writing deletion error to {request_id} in DynamoDB.")
        request_tracker = RequestTracker(request_id=request_id)
        request_tracker.log_error(
            "This request has been deleted and is no longer available for download. "
            "Please generate a new matrix at POST /v1/matrix.")
        s3_keys_to_delete.append(request_tracker.s3_results_key)

    print(f"Deleting matrices at the following S3 keys: {s3_keys_to_delete}")
    s3_results_bucket_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])
    deleted_objects = s3_results_bucket_handler.delete_objects(
        s3_keys_to_delete)

    deleted_keys = [
        deleted_object['Key'] for deleted_object in deleted_objects
    ]

    print(
        f"Successfully deleted the following matrices {deleted_keys}. ({len(deleted_keys)}/{len(s3_keys_to_delete)})"
    )
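A hedged usage sketch for the cache-invalidation helper above; the request ID and hash values are illustrative placeholders, and the DEPLOYMENT_STAGE and MATRIX_RESULTS_BUCKET environment variables must already point at the target deployment.

if __name__ == "__main__":
    # Resolve one stale request hash to its request IDs and also invalidate
    # one explicitly known request ID in the same pass.
    invalidate_cache_entries(
        request_ids=["example-request-id"],
        request_hashes=["example-request-hash"])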
Example #4
    def setUp(self, mock_get_datetime_now):
        super(TestRequestTracker, self).setUp()
        self.stub_date = '2019-03-18T180907.136216Z'
        mock_get_datetime_now.return_value = self.stub_date

        self.request_id = str(uuid.uuid4())
        self.request_tracker = RequestTracker(self.request_id)
        self.dynamo_handler = DynamoHandler()

        self.create_test_request_table()

        self.dynamo_handler.create_request_table_entry(self.request_id,
                                                       "test_format")
Example #5
    def run(self, max_loops=None):
        loops = 0
        while max_loops is None or loops < max_loops:
            loops += 1
            messages = self.sqs_handler.receive_messages_from_queue(
                self.query_job_q_url)
            if messages:
                message = messages[0]
                logger.info(f"Received {message} from {self.query_job_q_url}")
                payload = json.loads(message['Body'])
                request_id = payload['request_id']
                request_tracker = RequestTracker(request_id)
                Logging.set_correlation_id(logger, value=request_id)
                obj_key = payload['s3_obj_key']
                receipt_handle = message['ReceiptHandle']
                try:
                    logger.info(f"Fetching query from {obj_key}")
                    query = self.s3_handler.load_content_from_obj_key(obj_key)

                    logger.info(f"Running query from {obj_key}")
                    self.redshift_handler.transaction([query], read_only=True)
                    logger.info(f"Finished running query from {obj_key}")

                    logger.info(
                        f"Deleting {message} from {self.query_job_q_url}")
                    self.sqs_handler.delete_message_from_queue(
                        self.query_job_q_url, receipt_handle)

                    logger.info(
                        "Incrementing completed queries in state table")
                    request_tracker.complete_subtask_execution(Subtask.QUERY)

                    if request_tracker.is_request_ready_for_conversion():
                        logger.info("Scheduling batch conversion job")
                        batch_job_id = self.batch_handler.schedule_matrix_conversion(
                            request_id, request_tracker.format)
                        request_tracker.write_batch_job_id_to_db(batch_job_id)
                except Exception as e:
                    logger.info(
                        f"QueryRunner failed on {message} with error {e}")
                    request_tracker.log_error(str(e))
                    logger.info(
                        f"Adding {message} to {self.query_job_deadletter_q_url}"
                    )
                    self.sqs_handler.add_message_to_queue(
                        self.query_job_deadletter_q_url, payload)
                    logger.info(
                        f"Deleting {message} from {self.query_job_q_url}")
                    self.sqs_handler.delete_message_from_queue(
                        self.query_job_q_url, receipt_handle)
            else:
                logger.info(f"No messages to read from {self.query_job_q_url}")
Example #6
    def setUp(self, mock_get_datetime_now):
        super(TestRequestTracker, self).setUp()
        self.stub_date = '2019-03-18T180907.136216Z'
        mock_get_datetime_now.return_value = self.stub_date

        self.request_id = str(uuid.uuid4())
        self.request_tracker = RequestTracker(self.request_id)
        self.dynamo_handler = DynamoHandler()

        self.create_test_data_version_table()
        self.create_test_deployment_table()
        self.create_test_request_table()
        self.create_s3_results_bucket()

        self.init_test_data_version_table()
        self.init_test_deployment_table()

        self.dynamo_handler.create_request_table_entry(
            self.request_id, "test_format", ["test_field_1", "test_field_2"],
            "test_feature")
Example #7
def post_matrix(body: dict):

    feature = body.get("feature", constants.DEFAULT_FEATURE)
    fields = body.get("fields", constants.DEFAULT_FIELDS)
    format_ = body['format'] if 'format' in body else MatrixFormat.LOOM.value
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if format_ not in expected_formats:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a valid `format`. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)
    if "filter" not in body:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a filter. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    if len(json.dumps(body["filter"])) > 128000:
        return ({
            'message':
            "The filter specification is too large. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.request_entity_too_large)

    request_id = str(uuid.uuid4())
    RequestTracker(request_id).initialize_request(format_, fields, feature)

    driver_payload = {
        'request_id': request_id,
        'filter': body["filter"],
        'fields': fields,
        'feature': feature
    }
    lambda_handler.invoke(LambdaName.DRIVER_V1, driver_payload)

    return ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': "Job started."
    }, requests.codes.accepted)
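A hedged sketch of a request body that passes the validation above; the filter contents are illustrative, since post_matrix only checks that a `filter` key is present, that its JSON serialization stays under 128,000 characters, and that `format` is a recognized MatrixFormat value. Note the call also initializes request state in DynamoDB and invokes the driver Lambda.

example_body = {
    "filter": {"op": "=", "field": "example.metadata.field", "value": "example-value"},
    "format": "loom",
    # "fields" and "feature" fall back to constants.DEFAULT_FIELDS and
    # constants.DEFAULT_FEATURE when omitted.
}
response_body, status_code = post_matrix(example_body)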
Example #8
    def test_is_initialized(self):
        self.assertTrue(self.request_tracker.is_initialized)

        new_request_tracker = RequestTracker("test_uuid")
        self.assertFalse(new_request_tracker.is_initialized)
Example #9
class TestRequestTracker(MatrixTestCaseUsingMockAWS):
    @mock.patch("matrix.common.date.get_datetime_now")
    def setUp(self, mock_get_datetime_now):
        super(TestRequestTracker, self).setUp()
        self.stub_date = '2019-03-18T180907.136216Z'
        mock_get_datetime_now.return_value = self.stub_date

        self.request_id = str(uuid.uuid4())
        self.request_tracker = RequestTracker(self.request_id)
        self.dynamo_handler = DynamoHandler()

        self.create_test_data_version_table()
        self.create_test_deployment_table()
        self.create_test_request_table()
        self.create_s3_results_bucket()

        self.init_test_data_version_table()
        self.init_test_deployment_table()

        self.dynamo_handler.create_request_table_entry(
            self.request_id, "test_format", ["test_field_1", "test_field_2"],
            "test_feature")

    def test_is_initialized(self):
        self.assertTrue(self.request_tracker.is_initialized)

        new_request_tracker = RequestTracker("test_uuid")
        self.assertFalse(new_request_tracker.is_initialized)

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.generate_request_hash"
    )
    def test_request_hash(self, mock_generate_request_hash):
        with self.subTest("Test skip generation in API deployments:"):
            os.environ['MATRIX_VERSION'] = "test_version"
            self.assertEqual(self.request_tracker.request_hash, "N/A")
            mock_generate_request_hash.assert_not_called()

            stored_request_hash = self.dynamo_handler.get_table_item(
                DynamoTable.REQUEST_TABLE,
                key=self.request_id)[RequestTableField.REQUEST_HASH.value]

            self.assertEqual(self.request_tracker._request_hash, "N/A")
            self.assertEqual(stored_request_hash, "N/A")

            del os.environ['MATRIX_VERSION']

        with self.subTest(
                "Test generation and storage in Dynamo on first access"):
            mock_generate_request_hash.return_value = "test_hash"
            self.assertEqual(self.request_tracker.request_hash, "test_hash")
            mock_generate_request_hash.assert_called_once()

            stored_request_hash = self.dynamo_handler.get_table_item(
                DynamoTable.REQUEST_TABLE,
                key=self.request_id)[RequestTableField.REQUEST_HASH.value]

            self.assertEqual(self.request_tracker._request_hash, "test_hash")
            self.assertEqual(stored_request_hash, "test_hash")

        with self.subTest("Test immediate retrieval on future accesses"):
            self.assertEqual(self.request_tracker.request_hash, "test_hash")
            mock_generate_request_hash.assert_called_once()

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.request_hash",
        new_callable=mock.PropertyMock)
    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.data_version",
        new_callable=mock.PropertyMock)
    def test_s3_results_prefix(self, mock_data_version, mock_request_hash):
        mock_data_version.return_value = "test_data_version"
        mock_request_hash.return_value = "test_request_hash"

        self.assertEqual(self.request_tracker.s3_results_prefix,
                         "test_data_version/test_request_hash")

    @mock.patch("matrix.common.request.request_tracker.RequestTracker.format",
                new_callable=mock.PropertyMock)
    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.request_hash",
        new_callable=mock.PropertyMock)
    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.data_version",
        new_callable=mock.PropertyMock)
    def test_s3_results_key(self, mock_data_version, mock_request_hash,
                            mock_format):
        mock_data_version.return_value = "test_data_version"
        mock_request_hash.return_value = "test_request_hash"
        mock_format.return_value = "loom"

        self.assertEqual(
            self.request_tracker.s3_results_key,
            f"test_data_version/test_request_hash/{self.request_id}.loom")

        mock_format.return_value = "csv"
        self.assertEqual(
            self.request_tracker.s3_results_key,
            f"test_data_version/test_request_hash/{self.request_id}.csv.zip")

        mock_format.return_value = "mtx"
        self.assertEqual(
            self.request_tracker.s3_results_key,
            f"test_data_version/test_request_hash/{self.request_id}.mtx.zip")

    @mock.patch("matrix.common.aws.dynamo_handler.DynamoHandler.get_table_item"
                )
    def test_data_version(self, mock_get_table_item):
        mock_get_table_item.return_value = {
            RequestTableField.DATA_VERSION.value: 0
        }

        with self.subTest("Test Dynamo read on first access"):
            self.assertEqual(self.request_tracker.data_version, 0)
            mock_get_table_item.assert_called_once()

        with self.subTest("Test cached access on successive reads"):
            self.assertEqual(self.request_tracker.data_version, 0)
            mock_get_table_item.assert_called_once()

    def test_format(self):
        self.assertEqual(self.request_tracker.format, "test_format")

    def test_metadata_fields(self):
        self.assertEqual(self.request_tracker.metadata_fields,
                         ["test_field_1", "test_field_2"])

    def test_feature(self):
        self.assertEqual(self.request_tracker.feature, "test_feature")

    def test_batch_job_id(self):
        self.assertEqual(self.request_tracker.batch_job_id, None)

        field_enum = RequestTableField.BATCH_JOB_ID
        self.dynamo_handler.set_table_field_with_value(
            DynamoTable.REQUEST_TABLE, self.request_id, field_enum, "123-123")
        self.assertEqual(self.request_tracker.batch_job_id, "123-123")

    @mock.patch(
        "matrix.common.aws.batch_handler.BatchHandler.get_batch_job_status")
    def test_batch_job_status(self, mock_get_job_status):
        mock_get_job_status.return_value = "FAILED"
        field_enum = RequestTableField.BATCH_JOB_ID
        self.dynamo_handler.set_table_field_with_value(
            DynamoTable.REQUEST_TABLE, self.request_id, field_enum, "123-123")

        self.assertEqual(self.request_tracker.batch_job_status, "FAILED")

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.num_bundles",
        new_callable=mock.PropertyMock)
    def test_num_bundles_interval(self, mock_num_bundles):
        mock_num_bundles.return_value = 0
        self.assertEqual(self.request_tracker.num_bundles_interval, "0-499")

        mock_num_bundles.return_value = 1
        self.assertEqual(self.request_tracker.num_bundles_interval, "0-499")

        mock_num_bundles.return_value = 500
        self.assertEqual(self.request_tracker.num_bundles_interval, "500-999")

        mock_num_bundles.return_value = 1234
        self.assertEqual(self.request_tracker.num_bundles_interval,
                         "1000-1499")

    def test_creation_date(self):
        self.assertEqual(self.request_tracker.creation_date, self.stub_date)

    @mock.patch(
        "matrix.common.aws.cloudwatch_handler.CloudwatchHandler.put_metric_data"
    )
    def test_error(self, mock_cw_put):
        self.assertEqual(self.request_tracker.error, "")

        self.request_tracker.log_error("test error")
        self.assertEqual(self.request_tracker.error, "test error")
        mock_cw_put.assert_called_once_with(
            metric_name=MetricName.REQUEST_ERROR, metric_value=1)

    @mock.patch(
        "matrix.common.aws.cloudwatch_handler.CloudwatchHandler.put_metric_data"
    )
    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.create_request_table_entry"
    )
    def test_initialize_request(self, mock_create_request_table_entry,
                                mock_create_cw_metric):
        self.request_tracker.initialize_request("test_format")

        mock_create_request_table_entry.assert_called_once_with(
            self.request_id, "test_format", DEFAULT_FIELDS, DEFAULT_FEATURE)
        mock_create_cw_metric.assert_called_once()

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.metadata_fields",
        new_callable=mock.PropertyMock)
    @mock.patch(
        "matrix.common.query.cell_query_results_reader.CellQueryResultsReader.load_results"
    )
    @mock.patch(
        "matrix.common.query.query_results_reader.QueryResultsReader._parse_manifest"
    )
    def test_generate_request_hash(self, mock_parse_manifest,
                                   mock_load_results, mock_metadata_fields):
        mock_load_results.return_value = pandas.DataFrame(
            index=["test_cell_key_1", "test_cell_key_2"])
        mock_metadata_fields.return_value = ["test_field_1", "test_field_2"]

        h = hashlib.md5()
        h.update(self.request_tracker.feature.encode())
        h.update(self.request_tracker.format.encode())
        h.update("test_field_1".encode())
        h.update("test_field_2".encode())
        h.update("test_cell_key_1".encode())
        h.update("test_cell_key_2".encode())

        self.assertEqual(self.request_tracker.generate_request_hash(),
                         h.hexdigest())

    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.increment_table_field")
    def test_expect_subtask_execution(self, mock_increment_table_field):
        self.request_tracker.expect_subtask_execution(Subtask.DRIVER)

        mock_increment_table_field.assert_called_once_with(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.EXPECTED_DRIVER_EXECUTIONS, 1)

    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.increment_table_field")
    def test_complete_subtask_execution(self, mock_increment_table_field):
        self.request_tracker.complete_subtask_execution(Subtask.DRIVER)

        mock_increment_table_field.assert_called_once_with(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.COMPLETED_DRIVER_EXECUTIONS, 1)

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.s3_results_prefix",
        new_callable=mock.PropertyMock)
    def test_lookup_cached_result(self, mock_s3_results_prefix):
        mock_s3_results_prefix.return_value = "test_prefix"
        s3_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])

        with self.subTest("Do not match in S3 'directories'"):
            s3_handler.store_content_in_s3("test_prefix", "test_content")
            self.assertEqual(self.request_tracker.lookup_cached_result(), "")

        with self.subTest("Successfully retrieve a result key"):
            s3_handler.store_content_in_s3("test_prefix/test_result_1",
                                           "test_content")
            s3_handler.store_content_in_s3("test_prefix/test_result_2",
                                           "test_content")
            self.assertEqual(self.request_tracker.lookup_cached_result(),
                             "test_prefix/test_result_1")

    def test_is_request_complete(self):
        self.assertFalse(self.request_tracker.is_request_complete())

        s3_handler = S3Handler(os.environ['MATRIX_RESULTS_BUCKET'])

        s3_handler.store_content_in_s3(
            f"{self.request_tracker.s3_results_key}/{self.request_id}.{self.request_tracker.format}",
            "")

        self.assertTrue(self.request_tracker.is_request_complete())

    def test_is_request_ready_for_conversion(self):
        self.assertFalse(
            self.request_tracker.is_request_ready_for_conversion())
        self.dynamo_handler.increment_table_field(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.COMPLETED_QUERY_EXECUTIONS, 3)
        self.assertTrue(self.request_tracker.is_request_ready_for_conversion())

    @mock.patch(
        "matrix.common.aws.cloudwatch_handler.CloudwatchHandler.put_metric_data"
    )
    def test_complete_request(self, mock_cw_put):
        duration = 1

        self.request_tracker.complete_request(duration)

        expected_calls = [
            mock.call(metric_name=MetricName.CONVERSION_COMPLETION,
                      metric_value=1),
            mock.call(metric_name=MetricName.REQUEST_COMPLETION,
                      metric_value=1),
            mock.call(metric_name=MetricName.DURATION,
                      metric_value=duration,
                      metric_dimensions=[
                          {
                              'Name': "Number of Bundles",
                              'Value': mock.ANY
                          },
                          {
                              'Name': "Output Format",
                              'Value': mock.ANY
                          },
                      ]),
            mock.call(metric_name=MetricName.DURATION,
                      metric_value=duration,
                      metric_dimensions=[
                          {
                              'Name': "Number of Bundles",
                              'Value': mock.ANY
                          },
                      ])
        ]
        mock_cw_put.assert_has_calls(expected_calls)

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.log_error")
    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.creation_date",
        new_callable=mock.PropertyMock)
    @mock.patch("matrix.common.aws.s3_handler.S3Handler.exists")
    def test_is_expired(self, mock_exists, mock_creation_date, mock_log_error):
        with self.subTest("Expired"):
            mock_exists.return_value = False
            mock_creation_date.return_value = date.to_string(
                date.get_datetime_now() - timedelta(days=30, minutes=1))

            self.assertTrue(self.request_tracker.is_expired)
            mock_log_error.assert_called_once()
            mock_log_error.reset_mock()

        with self.subTest(
                "Not expired. Matrix DNE but not past expiration date"):
            mock_exists.return_value = False
            mock_creation_date.return_value = date.to_string(
                date.get_datetime_now() - timedelta(days=29))

            self.assertFalse(self.request_tracker.is_expired)
            mock_log_error.assert_not_called()

        with self.subTest("Not expired. Matrix exists"):
            mock_exists.return_value = True

            self.assertFalse(self.request_tracker.is_expired)
            mock_log_error.assert_not_called()

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.log_error")
    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.creation_date",
        new_callable=mock.PropertyMock)
    def test_timeout(self, mock_creation_date, mock_log_error):
        # no timeout
        mock_creation_date.return_value = date.to_string(
            date.get_datetime_now() - timedelta(hours=35, minutes=59))
        self.assertFalse(self.request_tracker.timeout)

        # timeout
        mock_creation_date.return_value = date.to_string(
            date.get_datetime_now() - timedelta(hours=36, minutes=1))
        self.assertTrue(self.request_tracker.timeout)
        mock_log_error.assert_called_once()

    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.set_table_field_with_value"
    )
    def test_write_batch_job_id_to_db(self, mock_set_table_field_with_value):
        self.request_tracker.write_batch_job_id_to_db("123-123")
        mock_set_table_field_with_value.assert_called_once_with(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.BATCH_JOB_ID, "123-123")
Example #10
class MatrixConverter:
    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.expression_manifest = None
        self.cell_manifest = None
        self.gene_manifest = None

        self.local_output_filename = os.path.basename(
            os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)

    def run(self):
        try:
            LOGGER.debug(
                f"Beginning matrix conversion run for {self.args.request_id}")
            self.expression_manifest = self._parse_manifest(
                self.args.expression_manifest_key)
            self.cell_manifest = self._parse_manifest(
                self.args.cell_metadata_manifest_key)
            self.gene_manifest = self._parse_manifest(
                self.args.gene_metadata_manifest_key)

            LOGGER.debug(f"Beginning conversion to {self.format}")
            local_converted_path = getattr(self, f"_to_{self.format}")()
            LOGGER.debug(f"Conversion to {self.format} completed")

            LOGGER.debug(f"Beginning upload to S3")
            self._upload_converted_matrix(local_converted_path,
                                          self.target_path)
            LOGGER.debug("Upload to S3 complete, job finished")

            os.remove(local_converted_path)

            self.request_tracker.complete_subtask_execution(Subtask.CONVERTER)
            self.request_tracker.complete_request(
                duration=(date.get_datetime_now() -
                          date.to_datetime(self.request_tracker.creation_date)
                          ).total_seconds())
        except Exception as e:
            LOGGER.info(
                f"Matrix Conversion failed on {self.args.request_id} with error {str(e)}"
            )
            self.request_tracker.log_error(str(e))
            raise e

    def _parse_manifest(self, manifest_key):
        """Parse a manifest file produced by a Redshift UNLOAD query.

        Args:
            manifest_key: S3 location of the manifest file.

        Returns:
            dict with three keys:
                "columns": the column headers for the tables
                "part_urls": full S3 urls for the files containing results from each
                    Redshift slice
                "record_count": total number of records returned by the query
        """
        manifest = json.load(self.FS.open(manifest_key))

        return {
            "columns": [e["name"] for e in manifest["schema"]["elements"]],
            "part_urls": [
                e["url"] for e in manifest["entries"]
                if e["meta"]["record_count"]
            ],
            "record_count":
            manifest["meta"]["record_count"]
        }
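
    # A hedged sketch of the manifest shape _parse_manifest expects, with the
    # key names taken from the accesses above and the concrete values purely
    # illustrative:
    #
    #   {
    #       "entries": [
    #           {"url": "s3://example-bucket/unload/0000_part_00",
    #            "meta": {"record_count": 1234}},
    #           ...
    #       ],
    #       "schema": {"elements": [{"name": "cellkey"}, {"name": "genes_detected"}]},
    #       "meta": {"record_count": 1234}
    #   }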

    def _n_slices(self):
        """Return the number of slices associated with this Redshift result.

        Redshift UNLOAD creates one object per "slice" of the cluster. We might want to
        iterate over those, so this gets the count of them.
        """
        return len(self.cell_manifest["part_urls"])

    def _load_cell_table_slice(self, slice_idx):
        """Load the cell metadata table from a particular result slice.

        Args:
            slice_idx: Index of the slice to get cell metadata for

        Returns:
            dataframe of cell metadata. Index is "cellkey" and other columns are metadata
            fields.
        """

        cell_table_columns = self._map_columns(self.cell_manifest["columns"])
        cell_table_dtype = {c: "category" for c in cell_table_columns}
        cell_table_dtype["genes_detected"] = "uint32"
        cell_table_dtype["cellkey"] = "object"

        part_url = self.cell_manifest["part_urls"][slice_idx]
        df = pandas.read_csv(part_url,
                             sep='|',
                             header=None,
                             names=cell_table_columns,
                             dtype=cell_table_dtype,
                             true_values=["t"],
                             false_values=["f"],
                             index_col="cellkey")

        return df

    def _load_gene_table(self):
        """Load the gene metadata table.

        Returns:
            dataframe of gene metadata. Index is "featurekey"
        """

        gene_table_columns = self._map_columns(self.gene_manifest["columns"])

        dfs = []
        for part_url in self.gene_manifest["part_urls"]:
            df = pandas.read_csv(part_url,
                                 sep='|',
                                 header=None,
                                 names=gene_table_columns,
                                 true_values=["t"],
                                 false_values=["f"],
                                 index_col="featurekey")

            dfs.append(df)
        return pandas.concat(dfs)

    def _load_expression_table_slice(self, slice_idx, chunksize=1000000):
        """Load expression data from a slice, yielding the data by a fixed number
        of rows.

        Args:
            slice_idx: Index of the slice to get data for
            chunksize: Number of rows to yield at once

        Yields:
            dataframe of expression data
        """

        part_url = self.expression_manifest["part_urls"][slice_idx]
        expression_table_columns = ["cellkey", "featurekey", "exprvalue"]
        expression_dtype = {
            "cellkey": "object",
            "featurekey": "object",
            "exprvalue": "float32"
        }

        # Iterate over chunks of the remote file. We have to set a fixed
        # number of rows to read, but we also want to make sure that all the
        # rows from a given cell are yielded with each chunk. So we are going
        # to keep track of the "remainder", rows from the end of a chunk for a
        # cell that spans a chunk boundary.
        remainder = None
        for chunk in pandas.read_csv(part_url,
                                     sep="|",
                                     names=expression_table_columns,
                                     dtype=expression_dtype,
                                     header=None,
                                     chunksize=chunksize):

            # If we have some rows from the previous chunk, prepend them to
            # this one
            if remainder is not None:
                adjusted_chunk = pandas.concat([remainder, chunk],
                                               axis=0,
                                               copy=False)
            else:
                adjusted_chunk = chunk

            # Now get the rows for the cell at the end of this chunk that spans
            # the boundary. Remove them from the chunk we yield, but keep them
            # in the remainder.
            last_cellkey = adjusted_chunk.tail(1).cellkey.values[0]
            remainder = adjusted_chunk.loc[adjusted_chunk['cellkey'] ==
                                           last_cellkey]
            adjusted_chunk = adjusted_chunk[
                adjusted_chunk.cellkey != last_cellkey]

            yield adjusted_chunk

        if remainder is not None:
            yield remainder
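
    # Worked illustration of the remainder handling above (values made up):
    # with chunksize=4, a part file whose cellkeys run
    #     C1 C1 C2 C2 | C2 C3 C3 C3
    # first yields only the C1 rows (the trailing C2 rows are held back as the
    # remainder), then yields all of the C2 rows once the second chunk arrives
    # (the C3 rows wait in the remainder), and finally yields the C3 rows after
    # the file is exhausted, so no cell is ever split across yielded chunks.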

    def _map_columns(self, cols):
        return [
            constants.TABLE_COLUMN_TO_METADATA_FIELD[col]
            if col in constants.TABLE_COLUMN_TO_METADATA_FIELD else col
            for col in cols
        ]

    def _to_mtx(self):
        """Write a zip file with an mtx and two metadata tsvs from Redshift query
        manifests.

        Returns:
           output_path: Path to the zip file.
        """
        # Add zip to the output filename and create the directory where we will
        # write output files.
        if not self.local_output_filename.endswith(".zip"):
            self.local_output_filename += ".zip"
        results_dir = os.path.join(
            self.working_dir,
            os.path.splitext(self.local_output_filename)[0])
        os.makedirs(results_dir)

        # Load the gene metadata and write it out to a tsv
        gene_df = self._load_gene_table()
        gene_df.to_csv(os.path.join(results_dir, "genes.tsv.gz"),
                       index_label="featurekey",
                       sep="\t",
                       compression="gzip")
        cell_df = pandas.concat(
            [self._load_cell_table_slice(s) for s in range(self._n_slices())],
            copy=False)

        # To follow 10x conventions, features are rows and cells are columns
        n_rows = gene_df.shape[0]
        n_cols = cell_df.shape[0]
        n_nonzero = self.expression_manifest["record_count"]

        cellkeys = []

        with gzip.open(os.path.join(results_dir, "matrix.mtx.gz"),
                       "w",
                       compresslevel=4) as exp_f:
            # Write the mtx header
            exp_f.write(
                "%%MatrixMarket matrix coordinate real general\n".encode())
            exp_f.write(f"{n_rows} {n_cols} {n_nonzero}\n".encode())

            cell_count = 0
            for slice_idx in range(self._n_slices()):
                for chunk in self._load_expression_table_slice(slice_idx):

                    grouped = chunk.groupby("cellkey")
                    for cell_group in grouped:
                        single_cell_df = cell_group[1]
                        single_cell_coo = single_cell_df.pivot(
                            index="featurekey",
                            columns="cellkey",
                            values="exprvalue").reindex(
                                index=gene_df.index).to_sparse().to_coo()

                        for row, col, value in zip(single_cell_coo.row,
                                                   single_cell_coo.col,
                                                   single_cell_coo.data):
                            exp_f.write(
                                f"{row+1} {col+cell_count+1} {value}\n".encode(
                                ))
                        cell_count += 1

                        cellkeys.append(cell_group[0])

        cell_df = cell_df.reindex(index=cellkeys)
        cell_df.to_csv(os.path.join(results_dir, "cells.tsv.gz"),
                       sep='\t',
                       index_label="cellkey",
                       compression="gzip")

        # Create a zip file out of the three written files.
        zipf = zipfile.ZipFile(
            os.path.join(self.working_dir, self.local_output_filename), 'w')
        zipf.write(os.path.join(results_dir, "genes.tsv.gz"),
                   arcname=os.path.join(os.path.basename(results_dir),
                                        "genes.tsv.gz"))
        zipf.write(os.path.join(results_dir, "matrix.mtx.gz"),
                   arcname=os.path.join(os.path.basename(results_dir),
                                        "matrix.mtx.gz"))
        zipf.write(os.path.join(results_dir, "cells.tsv.gz"),
                   arcname=os.path.join(os.path.basename(results_dir),
                                        "cells.tsv.gz"))
        zipf.write("mtx_readme.md")
        zipf.close()

        shutil.rmtree(results_dir)

        return os.path.join(self.working_dir, self.local_output_filename)

    def _to_loom(self):
        """Write a loom file from Redshift query manifests.

        Returns:
           output_path: Path to the new loom file.
        """

        # Put .zip on the output filename if it's not already there.
        if not self.local_output_filename.endswith(".zip"):
            self.local_output_filename += ".zip"

        # Derive the inner loom filename by dropping the ".zip" suffix
        # (slicing avoids str.rstrip's character-set semantics).
        loom_filename = self.local_output_filename[:-len(".zip")]

        # Read the row (gene) attributes and then set some conventional names
        gene_df = self._load_gene_table()
        gene_df["featurekey"] = gene_df.index
        row_attrs = gene_df.to_dict("series")
        # Not expected to be unique
        row_attrs["Gene"] = row_attrs.pop("featurename")
        row_attrs["Accession"] = row_attrs.pop("featurekey")
        for key, val in row_attrs.items():
            row_attrs[key] = val.values

        loom_parts = []
        loom_part_dir = os.path.join(self.working_dir, ".loom_parts")

        if os.path.exists(loom_part_dir):
            shutil.rmtree(loom_part_dir)

        os.makedirs(loom_part_dir)

        # Iterate over the "slices" produced by the redshift query
        for slice_idx in range(self._n_slices()):

            # Get the cell metadata for all the cells in this slice
            cell_df = self._load_cell_table_slice(slice_idx)

            # Iterate over fixed-size chunks of expression data from this
            # slice.
            chunk_idx = 0
            for chunk in self._load_expression_table_slice(slice_idx):
                print(f"Loading chunk {chunk_idx} from slice {slice_idx}")
                sparse_cell_dfs = []

                # Group the data by cellkey and iterate over each cell
                grouped = chunk.groupby("cellkey")
                for cell_group in grouped:
                    single_cell_df = cell_group[1]

                    # Reshape the dataframe so cellkey is a column and features
                    # are rows. Reindex so all dataframes have the same row
                    # order, and then sparsify because this is usually a very
                    # sparse dataset.
                    sparse_cell_dfs.append(
                        single_cell_df.pivot(
                            index="featurekey",
                            columns="cellkey",
                            values="exprvalue").reindex(
                                index=row_attrs["Accession"]).to_sparse())

                # Concatenate the cell dataframes together. This is what we'll
                # write to disk.
                if not sparse_cell_dfs:
                    continue
                sparse_expression_matrix = pandas.concat(sparse_cell_dfs,
                                                         axis=1,
                                                         copy=True)

                # Get the cell metadata dataframe for just the cells in this
                # chunk
                chunk_cell_df = cell_df.reindex(
                    index=sparse_expression_matrix.columns)
                chunk_cell_df["cellkey"] = chunk_cell_df.index
                for col in chunk_cell_df.columns:
                    if chunk_cell_df[col].dtype.name == "category":
                        chunk_cell_df[col] = chunk_cell_df[col].astype(
                            "object")
                col_attrs = chunk_cell_df.to_dict("series")
                col_attrs["CellID"] = col_attrs.pop("cellkey")

                # Unwrap each pandas Series into its underlying numpy array
                # before handing the attrs to loompy.create below.
                for key, val in col_attrs.items():
                    col_attrs[key] = val.values

                # Write the data from this chunk to its own file.
                loom_part_path = os.path.join(
                    loom_part_dir, f"matrix.{slice_idx}.{chunk_idx}.loom")
                print(f"Writing to {loom_part_path}")
                loompy.create(loom_part_path,
                              sparse_expression_matrix.to_coo(), row_attrs,
                              col_attrs)
                loom_parts.append(loom_part_path)
                chunk_idx += 1

        # Using the loompy method, combine all the chunks together into a
        # single file.
        print(f"Parts complete. Writing to {loom_filename}")
        loompy.combine(loom_parts,
                       key="Accession",
                       output_file=os.path.join(self.working_dir,
                                                loom_filename))
        shutil.rmtree(loom_part_dir)

        zipf = zipfile.ZipFile(
            os.path.join(self.working_dir, self.local_output_filename), 'w')
        zipf.write(os.path.join(self.working_dir, loom_filename),
                   arcname=loom_filename)
        zipf.write("loom_readme.md")
        zipf.close()

        return os.path.join(self.working_dir, self.local_output_filename)

    def _to_csv(self):
        """Write a zip file with csvs from Redshift query manifests and readme.

        Returns:
           output_path: Path to the new zip file.
        """

        if not self.local_output_filename.endswith(".zip"):
            self.local_output_filename += ".zip"

        results_dir = os.path.join(
            self.working_dir,
            os.path.splitext(self.local_output_filename)[0])
        os.makedirs(results_dir)

        gene_df = self._load_gene_table()
        gene_df.to_csv(os.path.join(results_dir, "genes.csv"),
                       index_label="featurekey")

        cellkeys = []
        with open(os.path.join(results_dir, "expression.csv"), "w") as exp_f:
            # Write the CSV's header
            gene_index_string_list = [str(x) for x in gene_df.index.tolist()]
            exp_f.write(','.join(["cellkey"] + gene_index_string_list))
            exp_f.write('\n')

            for slice_idx in range(self._n_slices()):
                for chunk in self._load_expression_table_slice(slice_idx):
                    # Group the data by cellkey and iterate over each cell
                    grouped = chunk.groupby("cellkey")
                    for cell_group in grouped:
                        single_cell_df = cell_group[1]
                        single_cell_df.pivot(index="cellkey",
                                             columns="featurekey",
                                             values="exprvalue").reindex(
                                                 columns=gene_df.index).to_csv(
                                                     exp_f,
                                                     header=False,
                                                     na_rep='0')
                        cellkeys.append(cell_group[0])

        cell_df = pandas.concat(
            [self._load_cell_table_slice(s) for s in range(self._n_slices())],
            copy=False)
        cell_df = cell_df.reindex(index=cellkeys)
        cell_df.to_csv(os.path.join(results_dir, "cells.csv"),
                       index_label="cellkey")

        zipf = zipfile.ZipFile(
            os.path.join(self.working_dir, self.local_output_filename), 'w',
            zipfile.ZIP_DEFLATED)
        zipf.write(os.path.join(results_dir, "genes.csv"),
                   arcname=os.path.join(os.path.basename(results_dir),
                                        "genes.csv"))
        zipf.write(os.path.join(results_dir, "expression.csv"),
                   arcname=os.path.join(os.path.basename(results_dir),
                                        "expression.csv"))
        zipf.write(os.path.join(results_dir, "cells.csv"),
                   arcname=os.path.join(os.path.basename(results_dir),
                                        "cells.csv"))
        zipf.write("csv_readme.md")
        zipf.close()

        shutil.rmtree(results_dir)

        return os.path.join(self.working_dir, self.local_output_filename)

    def _upload_converted_matrix(self, local_path, remote_path):
        """
        Upload the converted matrix to S3.
        Parameters
        ----------
        local_path : str
            Path to the new, converted matrix file
        remote_path : str
            S3 path where the converted matrix will be uploaded
        """
        self.FS.put(local_path, remote_path)
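A minimal sketch of how the converter above might be driven, based only on the attributes it reads from args; every name and S3 key below is an illustrative assumption rather than the service's real CLI.

from types import SimpleNamespace

args = SimpleNamespace(
    request_id="example-request-id",
    format="loom",  # selects _to_loom via getattr(self, f"_to_{self.format}")
    expression_manifest_key="example-bucket/unload/expression_manifest",
    cell_metadata_manifest_key="example-bucket/unload/cell_metadata_manifest",
    gene_metadata_manifest_key="example-bucket/unload/gene_metadata_manifest",
    target_path="example-results-bucket/example-request-id.loom.zip",
    working_dir="/tmp/matrix-conversion",
)
MatrixConverter(args).run()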
Example #11
class MatrixConverter:
    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.query_results = {}

        self.local_output_filename = os.path.basename(
            os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)

    def run(self):
        try:
            LOGGER.debug(
                f"Beginning matrix conversion run for {self.args.request_id}")
            self.query_results = {
                QueryType.CELL:
                CellQueryResultsReader(self.args.cell_metadata_manifest_key),
                QueryType.EXPRESSION:
                ExpressionQueryResultsReader(
                    self.args.expression_manifest_key),
                QueryType.FEATURE:
                FeatureQueryResultsReader(self.args.gene_metadata_manifest_key)
            }

            if self.query_results[QueryType.CELL].is_empty:
                LOGGER.debug(
                    f"Short-circuiting conversion because there are no cells.")
                pathlib.Path(self.local_output_filename).touch()
                local_converted_path = self.local_output_filename
            else:
                LOGGER.debug(f"Beginning conversion to {self.format}")
                local_converted_path = getattr(self, f"_to_{self.format}")()
                LOGGER.debug(f"Conversion to {self.format} completed")

            LOGGER.debug(f"Beginning upload to S3")
            self._upload_converted_matrix(local_converted_path,
                                          self.target_path)
            LOGGER.debug("Upload to S3 complete, job finished")

            os.remove(local_converted_path)

            self.request_tracker.complete_subtask_execution(Subtask.CONVERTER)
            self.request_tracker.complete_request(
                duration=(date.get_datetime_now() -
                          date.to_datetime(self.request_tracker.creation_date)
                          ).total_seconds())
        except Exception as e:
            LOGGER.info(
                f"Matrix Conversion failed on {self.args.request_id} with error {str(e)}"
            )
            self.request_tracker.log_error(str(e))
            raise e

    def _n_slices(self):
        """Return the number of slices associated with this Redshift result.

        Redshift UNLOAD creates one object per "slice" of the cluster. We might want to
        iterate over those, so this gets the count of them.
        """
        return len(self.query_results[QueryType.CELL].manifest["part_urls"])

    def _make_directory(self):
        if not self.local_output_filename.endswith(".zip"):
            self.local_output_filename += ".zip"
        results_dir = os.path.join(
            self.working_dir,
            os.path.splitext(self.local_output_filename)[0])
        os.makedirs(results_dir)
        return results_dir

    def _zip_up_matrix_output(self,
                              results_dir,
                              matrix_file_names,
                              compression=zipfile.ZIP_STORED):
        zipf = zipfile.ZipFile(
            os.path.join(self.working_dir, self.local_output_filename), 'w',
            compression)
        for filename in matrix_file_names:
            zipf.write(os.path.join(results_dir, filename),
                       arcname=os.path.join(os.path.basename(results_dir),
                                            filename))
        zipf.close()
        shutil.rmtree(results_dir)
        return os.path.join(self.working_dir, self.local_output_filename)

    def _write_out_gene_dataframe(self,
                                  results_dir,
                                  output_filename,
                                  compression=False):
        gene_df = self.query_results[QueryType.FEATURE].load_results()
        if compression:
            gene_df.to_csv(os.path.join(results_dir, output_filename),
                           index_label="featurekey",
                           sep="\t",
                           compression="gzip")
        else:
            gene_df.to_csv(os.path.join(results_dir, output_filename),
                           index_label="featurekey")
        return gene_df

    def _write_out_gene_dataframe_10x(self, results_dir, output_filename):
        gene_df = self.query_results[QueryType.FEATURE].load_results()

        # Insert 10x featuretype column according to 10x specifications
        # https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/output/matrices
        gene_df['featuretype_10x'] = [
            "Gene Expression" for i in range(gene_df.shape[0])
        ]
        # Set featuretype_10x as 3rd column (including index) per spec
        cols = gene_df.columns.tolist()
        cols = cols[:1] + cols[-1:] + cols[1:-1]
        gene_df = gene_df[cols]

        gene_df.to_csv(os.path.join(results_dir, output_filename),
                       index_label="featurekey",
                       header=False,
                       sep="\t",
                       compression="gzip")
        return gene_df

    def _write_out_cell_dataframe(self,
                                  results_dir,
                                  output_filename,
                                  cell_df,
                                  cellkeys,
                                  compression=False):
        cell_df = cell_df.reindex(index=cellkeys)
        if compression:
            cell_df.to_csv(os.path.join(results_dir, output_filename),
                           sep='\t',
                           index_label="cellkey",
                           compression="gzip")
        else:
            cell_df.to_csv(os.path.join(results_dir, output_filename),
                           index_label="cellkey")
        return cell_df

    def _write_out_barcode_dataframe(self, results_dir, output_filename,
                                     cell_df, cellkeys):
        cell_df = cell_df.reindex(index=cellkeys)
        barcode_df = pandas.DataFrame(columns=["barcode"],
                                      data=list(cell_df['barcode']))
        barcode_df.to_csv(os.path.join(results_dir, output_filename),
                          header=False,
                          index=False,
                          sep='\t',
                          compression="gzip")

        return barcode_df

    def _generate_expression_dfs(self, num_of_cells):
        """Create dataframes of expression data that is guaranteed to contain the complete set
        of expression data for each cell that appears in it.

        Args:
            num_of_cells (int): Data from at most this many cells will be included in the
                output dataframe.

        Yields:
            cells_df (pd.DataFrame): Dataframe of expression data. Columns are from the
                expression query, so cellkey, featurekey, exprvalue.
        """
        def _grouper(iterable, n):
            args = [iter(iterable)] * n
            return itertools.zip_longest(*args, fillvalue=None)
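        # _grouper pads the final group with None, e.g.
        #     _grouper("ABCDE", 2) -> ("A", "B"), ("C", "D"), ("E", None);
        # the `if c` filter in the concat below drops that padding before the
        # per-cell dataframes are stitched back together.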

        for slice_idx in range(self._n_slices()):
            for chunk in self.query_results[QueryType.EXPRESSION].load_slice(
                    slice_idx):
                grouped = chunk.groupby("cellkey")
                for cell_group in _grouper(grouped, num_of_cells):
                    cells_df = pandas.concat((c[1] for c in cell_group if c),
                                             axis=0,
                                             copy=False)
                    yield cells_df

    def _to_mtx(self):
        """Write a zip file with an mtx and two metadata tsvs from Redshift query
        manifests.

        Returns:
           output_path: Path to the zip file.
        """
        results_dir = self._make_directory()
        gene_df = self._write_out_gene_dataframe_10x(results_dir,
                                                     "features.tsv.gz")
        cell_df = self.query_results[QueryType.CELL].load_results()

        # To follow 10x conventions, features are rows and cells are columns
        n_rows = gene_df.shape[0]
        n_cols = cell_df.shape[0]
        n_nonzero = self.query_results[
            QueryType.EXPRESSION].manifest["record_count"]

        cellkeys = []
        with gzip.open(os.path.join(results_dir, "matrix.mtx.gz"),
                       "w",
                       compresslevel=4) as exp_f:
            # Write the mtx header
            exp_f.write(
                "%%MatrixMarket matrix coordinate real general\n".encode())
            exp_f.write(f"{n_rows} {n_cols} {n_nonzero}\n".encode())

            cell_count = 0

            # Iterate over groups of 50 cells in the query expression result
            for cells_df in self._generate_expression_dfs(50):
                # Reshape the result so cells are columns and genes are rows
                pivoted = cells_df.pivot(index="featurekey",
                                         columns="cellkey",
                                         values="exprvalue").reindex(
                                             index=gene_df.index).fillna(0.0)

                # Convert the result to a COO sparse matrix so we can simply
                # iterate over the non-zero values and write them to the mtx
                # file.
                coo = pivoted.astype(pandas.SparseDtype(
                    float, fill_value=0.0)).sparse.to_coo()

                lines = []
                for row, col, value in zip(coo.row, coo.col, coo.data):
                    lines.append(f"{row + 1} {col + cell_count + 1} {value}\n")
                exp_f.write(''.join(lines).encode())

                cell_count += pivoted.shape[1]
                cellkeys.extend(pivoted.columns.to_list())

        self._write_out_gene_dataframe(results_dir,
                                       "genes.tsv.gz",
                                       compression=True)
        self._write_out_cell_dataframe(results_dir,
                                       "cells.tsv.gz",
                                       cell_df,
                                       cellkeys,
                                       compression=True)
        self._write_out_barcode_dataframe(results_dir, "barcodes.tsv.gz",
                                          cell_df, cellkeys)

        file_names = [
            "features.tsv.gz", "genes.tsv.gz", "matrix.mtx.gz", "cells.tsv.gz",
            "barcodes.tsv.gz"
        ]
        zip_path = self._zip_up_matrix_output(results_dir, file_names)
        return zip_path

    def _loom_timestamp(self):
        """Return a timestamp of the current time in the format specified in the loom spec.

        Note that this is slightly different from the format used elsewhere in the matrix
        service.
        """
        return datetime.datetime.utcnow().strftime("%Y%m%dT%H%M%S.%fZ")
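
    # Example output of the strftime pattern above: "20190318T180907.136216Z".
    # The stub dates used by the tests earlier in these examples keep dashes in
    # the date portion instead, e.g. "2019-03-18T180907.136216Z".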

    def _to_loom(self):
        """Write a loom file from Redshift query manifests.

        Returns:
           output_path: Path to the new loom file.
        """

        # Put loom on the output filename if it's not already there.
        if not self.local_output_filename.endswith(".loom"):
            self.local_output_filename += ".loom"

        # Read the row (gene) attributes and then set some conventional names
        gene_df = self.query_results[QueryType.FEATURE].load_results()
        gene_df["featurekey"] = gene_df.index

        gene_count = gene_df.shape[0]
        cell_count = self.query_results[
            QueryType.CELL].manifest["record_count"]

        os.makedirs(self.working_dir, exist_ok=True)

        loom_path = os.path.join(self.working_dir, self.local_output_filename)
        loom_file = h5py.File(loom_path, mode="w")

        # Set some file attributes defined in the loom spec
        loom_file.attrs["CreationDate"] = self._loom_timestamp()
        loom_file.attrs["LOOM_SPEC_VERSION"] = "2.0.1"

        # Create the hdf5 dataset that will hold all the expression data
        matrix_dataset = loom_file.create_dataset("matrix",
                                                  shape=(gene_count,
                                                         cell_count),
                                                  dtype="float32",
                                                  compression="gzip",
                                                  compression_opts=2,
                                                  chunks=(gene_count, 1))

        cellkeys = []
        cell_counter = 0

        # Iterate through the cells. For each set of cells, reshape the
        # dataframe so genes are rows and cells are columns, then write that
        # data into the expression dataset.
        for cells_df in self._generate_expression_dfs(50):
            pivoted = cells_df.pivot(
                index="featurekey", columns="cellkey",
                values="exprvalue").reindex(index=gene_df.index).fillna(0.0)
            cellkeys.extend(pivoted.columns.to_list())
            matrix_dataset[:, cell_counter:cell_counter +
                           pivoted.shape[1]] = pivoted
            cell_counter += pivoted.shape[1]
        matrix_dataset.attrs["last_modified"] = self._loom_timestamp()

        # Now write the metadata into different datasets according to the loom
        # spec.
        cell_df = self.query_results[QueryType.CELL].load_results().reindex(
            index=cellkeys)
        col_attrs_group = loom_file.create_group("col_attrs")
        cell_id_dset = col_attrs_group.create_dataset(
            "CellID",
            data=loompy.normalize_attr_values(cell_df.index.to_numpy()),
            compression='gzip',
            compression_opts=2,
            chunks=(min(256, cell_count), ))
        cell_id_dset.attrs["last_modified"] = self._loom_timestamp()

        for cell_metadata_field in cell_df:
            cell_metadata = cell_df[cell_metadata_field]
            dset = col_attrs_group.create_dataset(
                cell_metadata_field,
                data=loompy.normalize_attr_values(cell_metadata.to_numpy()),
                compression='gzip',
                compression_opts=2,
                chunks=(min(256, cell_count), ))
            dset.attrs["last_modified"] = self._loom_timestamp()
        col_attrs_group.attrs["last_modified"] = self._loom_timestamp()

        row_attrs_group = loom_file.create_group("row_attrs")
        acc_dset = row_attrs_group.create_dataset(
            "Accession",
            data=loompy.normalize_attr_values(gene_df.index.to_numpy()),
            compression='gzip',
            compression_opts=2,
            chunks=(min(256, gene_count), ))
        acc_dset.attrs["last_modified"] = self._loom_timestamp()
        name_dset = row_attrs_group.create_dataset(
            "Gene",
            data=loompy.normalize_attr_values(
                gene_df["featurename"].to_numpy()),
            compression='gzip',
            compression_opts=2,
            chunks=(min(256, gene_count), ))
        name_dset.attrs["last_modified"] = self._loom_timestamp()

        for gene_metadata_field in gene_df:
            if gene_metadata_field == "featurename":
                continue
            gene_metadata = gene_df[gene_metadata_field]
            dset = row_attrs_group.create_dataset(
                gene_metadata_field,
                data=loompy.normalize_attr_values(gene_metadata.to_numpy()),
                compression='gzip',
                compression_opts=2,
                chunks=(min(256, gene_count), ))
            dset.attrs["last_modified"] = self._loom_timestamp()
        row_attrs_group.attrs["last_modified"] = self._loom_timestamp()

        # These groups are defined in the loom spec, but matrix service outputs
        # don't use them.
        loom_file.create_group("layers")
        loom_file.create_group("row_graphs")
        loom_file.create_group("col_graphs")

        loom_file.attrs["last_modified"] = self._loom_timestamp()

        return loom_path

    def _to_csv(self):
        """Write a zip file with csvs from Redshift query manifests and readme.

        Returns:
           output_path: Path to the new zip file.
        """

        results_dir = self._make_directory()
        gene_df = self._write_out_gene_dataframe(results_dir, "genes.csv")

        cellkeys = []
        with open(os.path.join(results_dir, "expression.csv"), "w") as exp_f:
            # Write the CSV's header
            gene_index_string_list = [str(x) for x in gene_df.index.tolist()]
            exp_f.write(','.join(["cellkey"] + gene_index_string_list))
            exp_f.write('\n')

            # Iterate over the cells, reshaping the expression data for each
            # group of cells so that genes are columns and cells are rows.
            for cells_df in self._generate_expression_dfs(50):
                pivoted = cells_df.pivot(
                    index="cellkey", columns="featurekey",
                    values="exprvalue").reindex(columns=gene_df.index)
                pivoted.to_csv(exp_f, header=False, na_rep='0', chunksize=50)
                cellkeys.extend(pivoted.index.to_list())

        cell_df = self.query_results[QueryType.CELL].load_results()
        self._write_out_cell_dataframe(results_dir, "cells.csv", cell_df,
                                       cellkeys)
        file_names = ["genes.csv", "expression.csv", "cells.csv"]
        zip_path = self._zip_up_matrix_output(results_dir, file_names,
                                              zipfile.ZIP_DEFLATED)
        return zip_path

    def _upload_converted_matrix(self, local_path, remote_path):
        """
        Upload the converted matrix to S3.
        Parameters
        ----------
        local_path : str
            Path to the new, converted matrix file
        remote_path : str
            S3 path where the converted matrix will be uploaded
        """
        self.FS.put(local_path, remote_path)
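# --- Hedged sketch (not part of the original source): how one long-format
# expression chunk becomes 1-based MatrixMarket coordinate lines, mirroring the
# pivot/COO loop in _to_mtx above. The toy cell and gene names are made up.
import pandas


def chunk_to_mtx_lines(chunk, gene_index, cell_offset=0):
    """Pivot a (cellkey, featurekey, exprvalue) chunk and emit mtx lines."""
    pivoted = (chunk.pivot(index="featurekey", columns="cellkey", values="exprvalue")
                    .reindex(index=gene_index)
                    .fillna(0.0))
    coo = pivoted.astype(pandas.SparseDtype(float, fill_value=0.0)).sparse.to_coo()
    # MatrixMarket indices are 1-based; columns shift by the cells already written.
    return [f"{row + 1} {col + cell_offset + 1} {value}\n"
            for row, col, value in zip(coo.row, coo.col, coo.data)]


toy_chunk = pandas.DataFrame({
    "cellkey": ["cell-a", "cell-a", "cell-b"],
    "featurekey": ["gene-1", "gene-3", "gene-2"],
    "exprvalue": [5.0, 1.0, 2.0],
})
toy_gene_index = pandas.Index(["gene-1", "gene-2", "gene-3"], name="featurekey")
print("".join(chunk_to_mtx_lines(toy_chunk, toy_gene_index)))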
예제 #12
0
def post_matrix(body: dict):
    has_ids = 'bundle_fqids' in body
    has_url = 'bundle_fqids_url' in body

    format = body['format'] if 'format' in body else MatrixFormat.LOOM.value
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if format not in expected_formats:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a valid `format`. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)
    if has_ids and has_url:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply either one of `bundle_fqids` or `bundle_fqids_url`. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    if not has_ids and not has_url:
        return ({
            'message':
            "Invalid parameters supplied. "
            "One of `bundle_fqids` or `bundle_fqids_url` must be supplied. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    if not has_url and len(json.dumps(body['bundle_fqids'])) > 128000:
        return ({
            'message':
            "List of bundle fqids is too large. "
            "Consider using bundle_fqids_url instead. "
            "Visit https://matrix.data.humancellatlas.org for more information."
        }, requests.codes.request_entity_too_large)

    if has_url:
        bundle_fqids_url = body['bundle_fqids_url']
        bundle_fqids = None
    else:
        bundle_fqids = body['bundle_fqids']
        bundle_fqids_url = None
        if len(bundle_fqids) == 0:
            return ({
                'message':
                "Invalid parameters supplied. "
                "Please supply non empty `bundle_fqids`. "
                "Visit https://matrix.data.humancellatlas.org for more information."
            }, requests.codes.bad_request)

    request_id = str(uuid.uuid4())
    RequestTracker(request_id).initialize_request(format)
    driver_payload = {
        'request_id': request_id,
        'bundle_fqids': bundle_fqids,
        'bundle_fqids_url': bundle_fqids_url,
        'format': format,
    }
    lambda_handler.invoke(LambdaName.DRIVER_V0, driver_payload)

    return ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': "Job started."
    }, requests.codes.accepted)
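# --- Hedged sketch (not part of the original source): the bundle-input rules
# post_matrix enforces above, pulled out as a standalone check for illustration.
# `validate_bundle_input` is a hypothetical helper, not part of the service.
import json


def validate_bundle_input(body: dict, max_inline_bytes: int = 128000):
    has_ids = 'bundle_fqids' in body
    has_url = 'bundle_fqids_url' in body
    if has_ids and has_url:
        return "supply either `bundle_fqids` or `bundle_fqids_url`, not both"
    if not has_ids and not has_url:
        return "one of `bundle_fqids` or `bundle_fqids_url` must be supplied"
    if has_ids and len(json.dumps(body['bundle_fqids'])) > max_inline_bytes:
        return "list of bundle fqids is too large; use `bundle_fqids_url` instead"
    if has_ids and len(body['bundle_fqids']) == 0:
        return "`bundle_fqids` must be non-empty"
    return None  # the request body is valid


assert validate_bundle_input({'bundle_fqids': ['uuid.version']}) is None
assert validate_bundle_input({}) is not None
assert validate_bundle_input({'bundle_fqids': [], 'bundle_fqids_url': "u"}) is not None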
예제 #13
0
class TestRequestTracker(MatrixTestCaseUsingMockAWS):
    @mock.patch("matrix.common.date.get_datetime_now")
    def setUp(self, mock_get_datetime_now):
        super(TestRequestTracker, self).setUp()
        self.stub_date = '2019-03-18T180907.136216Z'
        mock_get_datetime_now.return_value = self.stub_date

        self.request_id = str(uuid.uuid4())
        self.request_tracker = RequestTracker(self.request_id)
        self.dynamo_handler = DynamoHandler()

        self.create_test_request_table()

        self.dynamo_handler.create_request_table_entry(self.request_id,
                                                       "test_format")

    def test_format(self):
        self.assertEqual(self.request_tracker.format, "test_format")

    def test_batch_job_id(self):
        self.assertEqual(self.request_tracker.batch_job_id, None)

        field_enum = RequestTableField.BATCH_JOB_ID
        self.dynamo_handler.set_table_field_with_value(
            DynamoTable.REQUEST_TABLE, self.request_id, field_enum, "123-123")
        self.assertEqual(self.request_tracker.batch_job_id, "123-123")

    @mock.patch(
        "matrix.common.aws.batch_handler.BatchHandler.get_batch_job_status")
    def test_batch_job_status(self, mock_get_job_status):
        mock_get_job_status.return_value = "FAILED"
        field_enum = RequestTableField.BATCH_JOB_ID
        self.dynamo_handler.set_table_field_with_value(
            DynamoTable.REQUEST_TABLE, self.request_id, field_enum, "123-123")

        self.assertEqual(self.request_tracker.batch_job_status, "FAILED")

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.num_bundles",
        new_callable=mock.PropertyMock)
    def test_num_bundles_interval(self, mock_num_bundles):
        mock_num_bundles.return_value = 0
        self.assertEqual(self.request_tracker.num_bundles_interval, "0-499")

        mock_num_bundles.return_value = 1
        self.assertEqual(self.request_tracker.num_bundles_interval, "0-499")

        mock_num_bundles.return_value = 500
        self.assertEqual(self.request_tracker.num_bundles_interval, "500-999")

        mock_num_bundles.return_value = 1234
        self.assertEqual(self.request_tracker.num_bundles_interval,
                         "1000-1499")

    def test_creation_date(self):
        self.assertEqual(self.request_tracker.creation_date, self.stub_date)

    @mock.patch(
        "matrix.common.aws.cloudwatch_handler.CloudwatchHandler.put_metric_data"
    )
    def test_error(self, mock_cw_put):
        self.assertEqual(self.request_tracker.error, "")

        self.request_tracker.log_error("test error")
        self.assertEqual(self.request_tracker.error, "test error")
        mock_cw_put.assert_called_once_with(
            metric_name=MetricName.REQUEST_ERROR, metric_value=1)

    @mock.patch(
        "matrix.common.aws.cloudwatch_handler.CloudwatchHandler.put_metric_data"
    )
    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.create_request_table_entry"
    )
    def test_initialize_request(self, mock_create_request_table_entry,
                                mock_create_cw_metric):
        self.request_tracker.initialize_request("test_format")

        mock_create_request_table_entry.assert_called_once_with(
            self.request_id, "test_format")
        mock_create_cw_metric.assert_called_once()

    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.increment_table_field")
    def test_expect_subtask_execution(self, mock_increment_table_field):
        self.request_tracker.expect_subtask_execution(Subtask.DRIVER)

        mock_increment_table_field.assert_called_once_with(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.EXPECTED_DRIVER_EXECUTIONS, 1)

    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.increment_table_field")
    def test_complete_subtask_execution(self, mock_increment_table_field):
        self.request_tracker.complete_subtask_execution(Subtask.DRIVER)

        mock_increment_table_field.assert_called_once_with(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.COMPLETED_DRIVER_EXECUTIONS, 1)

    def test_is_request_complete(self):
        self.assertFalse(self.request_tracker.is_request_complete())

        self.dynamo_handler.increment_table_field(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.COMPLETED_CONVERTER_EXECUTIONS, 1)
        self.dynamo_handler.increment_table_field(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.COMPLETED_QUERY_EXECUTIONS, 3)
        self.assertTrue(self.request_tracker.is_request_complete())

    def test_is_request_ready_for_conversion(self):
        self.assertFalse(
            self.request_tracker.is_request_ready_for_conversion())
        self.dynamo_handler.increment_table_field(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.COMPLETED_QUERY_EXECUTIONS, 3)
        self.assertTrue(self.request_tracker.is_request_ready_for_conversion())

    @mock.patch(
        "matrix.common.aws.cloudwatch_handler.CloudwatchHandler.put_metric_data"
    )
    def test_complete_request(self, mock_cw_put):
        duration = 1

        self.request_tracker.complete_request(duration)

        expected_calls = [
            mock.call(metric_name=MetricName.CONVERSION_COMPLETION,
                      metric_value=1),
            mock.call(metric_name=MetricName.REQUEST_COMPLETION,
                      metric_value=1),
            mock.call(metric_name=MetricName.DURATION,
                      metric_value=duration,
                      metric_dimensions=[
                          {
                              'Name': "Number of Bundles",
                              'Value': mock.ANY
                          },
                          {
                              'Name': "Output Format",
                              'Value': mock.ANY
                          },
                      ]),
            mock.call(metric_name=MetricName.DURATION,
                      metric_value=duration,
                      metric_dimensions=[
                          {
                              'Name': "Number of Bundles",
                              'Value': mock.ANY
                          },
                      ])
        ]
        mock_cw_put.assert_has_calls(expected_calls)

    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.log_error")
    @mock.patch(
        "matrix.common.request.request_tracker.RequestTracker.creation_date",
        new_callable=mock.PropertyMock)
    def test_timeout(self, mock_creation_date, mock_log_error):
        # no timeout
        mock_creation_date.return_value = date.to_string(
            date.get_datetime_now() - timedelta(hours=11, minutes=59))
        self.assertFalse(self.request_tracker.timeout)

        # timeout
        mock_creation_date.return_value = date.to_string(
            date.get_datetime_now() - timedelta(hours=12, minutes=1))
        self.assertTrue(self.request_tracker.timeout)
        mock_log_error.assert_called_once()

    @mock.patch(
        "matrix.common.aws.dynamo_handler.DynamoHandler.set_table_field_with_value"
    )
    def test_write_batch_job_id_to_db(self, mock_set_table_field_with_value):
        self.request_tracker.write_batch_job_id_to_db("123-123")
        mock_set_table_field_with_value.assert_called_once_with(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.BATCH_JOB_ID, "123-123")
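# --- Hedged sketch (not part of the original source): the 500-wide bucketing
# that test_num_bundles_interval above expects. The real RequestTracker property
# may be implemented differently; this only reproduces the asserted values.
def num_bundles_interval(num_bundles: int, width: int = 500) -> str:
    lower = (num_bundles // width) * width
    return f"{lower}-{lower + width - 1}"


assert num_bundles_interval(0) == "0-499"
assert num_bundles_interval(1) == "0-499"
assert num_bundles_interval(500) == "500-999"
assert num_bundles_interval(1234) == "1000-1499"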
예제 #14
0
class Driver:
    """
    Formats Redshift queries, stores them in S3, and queues them in SQS for execution.
    """
    def __init__(self, request_id: str, bundles_per_worker: int = 100):
        Logging.set_correlation_id(logger, value=request_id)

        self.request_id = request_id
        self.bundles_per_worker = bundles_per_worker
        self.request_tracker = RequestTracker(request_id)
        self.dynamo_handler = DynamoHandler()
        self.sqs_handler = SQSHandler()
        self.infra_config = MatrixInfraConfig()
        self.redshift_config = MatrixRedshiftConfig()
        self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])
        self.redshift_handler = RedshiftHandler()

    @property
    def query_job_q_url(self):
        return self.infra_config.query_job_q_url

    @property
    def redshift_role_arn(self):
        return self.redshift_config.redshift_role_arn

    def run(self, bundle_fqids: typing.List[str], bundle_fqids_url: str,
            format: str):
        """
        Initialize a matrix service request and spawn redshift queries.

        :param bundle_fqids: List of bundle fqids to be queried on
        :param bundle_fqids_url: URL from which bundle_fqids can be retrieved
        :param format: MatrixFormat file format of output expression matrix
        """
        logger.debug(
            f"Driver running with parameters: bundle_fqids={bundle_fqids}, "
            f"bundle_fqids_url={bundle_fqids_url}, format={format}, "
            f"bundles_per_worker={self.bundles_per_worker}")

        if bundle_fqids_url:
            response = self._get_bundle_manifest(bundle_fqids_url)
            resolved_bundle_fqids = self._parse_download_manifest(
                response.text)
            if len(resolved_bundle_fqids) == 0:
                error_msg = "no bundles found in the supplied bundle manifest"
                logger.info(error_msg)
                self.request_tracker.log_error(error_msg)
                return
        else:
            resolved_bundle_fqids = bundle_fqids
        logger.debug(f"resolved bundles: {resolved_bundle_fqids}")

        self.dynamo_handler.set_table_field_with_value(
            DynamoTable.REQUEST_TABLE, self.request_id,
            RequestTableField.NUM_BUNDLES, len(resolved_bundle_fqids))
        s3_obj_keys = self._format_and_store_queries_in_s3(
            resolved_bundle_fqids)

        analysis_table_bundle_count = self._fetch_bundle_count_from_analysis_table(
            resolved_bundle_fqids)
        if analysis_table_bundle_count != len(resolved_bundle_fqids):
            error_msg = "resolved bundles in request do not match bundles available in matrix service"
            logger.info(error_msg)
            self.request_tracker.log_error(error_msg)
            return

        for key in s3_obj_keys:
            self._add_request_query_to_sqs(key, s3_obj_keys[key])
        self.request_tracker.complete_subtask_execution(Subtask.DRIVER)

    @retry(reraise=True, wait=wait_fixed(5), stop=stop_after_attempt(60))
    def _get_bundle_manifest(self, bundle_fqids_url):
        response = requests.get(bundle_fqids_url)
        return response

    @staticmethod
    def _parse_download_manifest(data: str) -> typing.List[str]:
        def _parse_line(line: str) -> str:
            tokens = line.split("\t")
            return f"{tokens[0]}.{tokens[1]}"

        lines = data.splitlines()[1:]
        return list(map(_parse_line, lines))

    def _format_and_store_queries_in_s3(self, resolved_bundle_fqids: list):
        feature_query = feature_query_template.format(
            self.query_results_bucket, self.request_id, self.redshift_role_arn)
        feature_query_obj_key = self.s3_handler.store_content_in_s3(
            f"{self.request_id}/feature", feature_query)

        exp_query = expression_query_template.format(
            self.query_results_bucket, self.request_id, self.redshift_role_arn,
            format_str_list(resolved_bundle_fqids))
        exp_query_obj_key = self.s3_handler.store_content_in_s3(
            f"{self.request_id}/expression", exp_query)

        cell_query = cell_query_template.format(
            self.query_results_bucket, self.request_id, self.redshift_role_arn,
            format_str_list(resolved_bundle_fqids))
        cell_query_obj_key = self.s3_handler.store_content_in_s3(
            f"{self.request_id}/cell", cell_query)

        return {
            QueryType.CELL: cell_query_obj_key,
            QueryType.EXPRESSION: exp_query_obj_key,
            QueryType.FEATURE: feature_query_obj_key
        }

    def _add_request_query_to_sqs(self, query_type: QueryType,
                                  s3_obj_key: str):
        queue_url = self.query_job_q_url
        payload = {
            'request_id': self.request_id,
            's3_obj_key': s3_obj_key,
            'type': query_type.value
        }
        logger.debug(f"Adding {payload} to sqs {queue_url}")
        self.sqs_handler.add_message_to_queue(queue_url, payload)

    def _fetch_bundle_count_from_analysis_table(self,
                                                resolved_bundle_fqids: list):
        analysis_table_bundle_count_query = analysis_bundle_count_query_template.format(
            format_str_list(resolved_bundle_fqids))
        analysis_table_bundle_count_query = analysis_table_bundle_count_query.strip(
        ).replace('\n', '')
        results = self.redshift_handler.transaction(
            [analysis_table_bundle_count_query],
            read_only=True,
            return_results=True)
        analysis_table_bundle_count = results[0][0]
        return analysis_table_bundle_count
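# --- Hedged sketch (not part of the original source): what _parse_download_manifest
# above does to a tab-separated download manifest. The toy manifest is made up;
# only the first two columns (uuid and version) are joined into a bundle fqid.
def parse_download_manifest(data: str):
    fqids = []
    for line in data.splitlines()[1:]:  # skip the header row
        tokens = line.split("\t")
        fqids.append(f"{tokens[0]}.{tokens[1]}")
    return fqids


toy_manifest = (
    "bundle_uuid\tbundle_version\tother_column\n"
    "aaaa-1111\t2019-01-01T000000.000000Z\tx\n"
    "bbbb-2222\t2019-02-02T000000.000000Z\ty\n"
)
assert parse_download_manifest(toy_manifest) == [
    "aaaa-1111.2019-01-01T000000.000000Z",
    "bbbb-2222.2019-02-02T000000.000000Z",
]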
예제 #15
0
def post_matrix(body: dict):

    feature = body.get("feature", constants.DEFAULT_FEATURE)
    fields = body.get("fields", constants.DEFAULT_FIELDS)
    format_ = body.get('format', MatrixFormat.LOOM.value)
    expected_formats = [mf.value for mf in MatrixFormat]

    # Validate input parameters
    if format_ not in expected_formats:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a valid `format`. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)
    if "filter" not in body:
        return ({
            'message':
            "Invalid parameters supplied. "
            "Please supply a filter. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.bad_request)

    if len(json.dumps(body["filter"])) > 128000:
        return ({
            'message':
            "The filter specification is too large. "
            "Visit https://matrix.dev.data.humancellatlas.org for more information."
        }, requests.codes.request_entity_too_large)

    if query_constructor.has_genus_species_term(body["filter"]):
        # If the user has mentioned something about species, then maybe
        # they're looking for non-human data. So we'll run queries for all
        # the species that we know about.
        genera_species = list(constants.GenusSpecies)
    else:
        # Otherwise, default to human-only
        genera_species = [constants.GenusSpecies.HUMAN]

    # Cell barcode is required in .mtx matrices by 10x specifications
    if format_ == MatrixFormat.MTX.value and "cell.barcode" not in fields and "barcode" not in fields:
        fields.append("cell.barcode")

    human_request_id = ""
    non_human_request_ids = {}
    for genus_species in genera_species:
        request_id = str(uuid.uuid4())
        RequestTracker(request_id).initialize_request(format_, fields, feature,
                                                      genus_species)

        driver_payload = {
            'request_id': request_id,
            'filter': body["filter"],
            'fields': fields,
            'feature': feature,
            'genus_species': genus_species.value
        }
        lambda_handler.invoke(LambdaName.DRIVER_V1, driver_payload)

        if genus_species == GenusSpecies.HUMAN:
            human_request_id = request_id
        else:
            non_human_request_ids[genus_species.value] = request_id

    return ({
        'request_id': human_request_id,
        'non_human_request_ids': non_human_request_ids,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'message': "Job started."
    }, requests.codes.accepted)
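# --- Hedged sketch (not part of the original source): the field-defaulting and
# the "mtx output needs a barcode column" rule applied in post_matrix above.
# DEFAULT_FIELDS and the literal "mtx" (standing in for MatrixFormat.MTX.value)
# are assumptions for illustration only.
DEFAULT_FIELDS = ["cell.cellkey"]  # assumed default, not the real constants value


def resolve_fields(body: dict, format_: str):
    fields = list(body.get("fields", DEFAULT_FIELDS))
    if format_ == "mtx" and "cell.barcode" not in fields and "barcode" not in fields:
        fields.append("cell.barcode")  # 10x .mtx outputs require a barcode per cell
    return fields


assert "cell.barcode" in resolve_fields({}, "mtx")
assert "cell.barcode" not in resolve_fields({}, "loom")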
예제 #16
0
class Driver:
    """
    Formats Redshift queries, stores them in S3, and queues them in SQS for execution.
    """
    def __init__(self, request_id: str):
        Logging.set_correlation_id(logger, value=request_id)

        self.request_id = request_id
        self.request_tracker = RequestTracker(request_id)
        self.dynamo_handler = DynamoHandler()
        self.sqs_handler = SQSHandler()
        self.infra_config = MatrixInfraConfig()
        self.redshift_config = MatrixRedshiftConfig()
        self.query_results_bucket = os.environ['MATRIX_QUERY_RESULTS_BUCKET']
        self.s3_handler = S3Handler(os.environ['MATRIX_QUERY_BUCKET'])

    @property
    def query_job_q_url(self):
        return self.infra_config.query_job_q_url

    @property
    def redshift_role_arn(self):
        return self.redshift_config.redshift_role_arn

    def run(self, filter_: typing.Dict[str, typing.Any], fields: typing.List[str], feature: str):
        """
        Initialize a matrix service request and spawn redshift queries.

        :param filter_: Filter dict describing which cells to get expression data for
        :param fields: Which metadata fields to return
        :param feature: Which feature (gene vs transcript) to include in output
        """
        logger.debug(f"Driver running with parameters: filter={filter_}, "
                     f"fields={fields}, feature={feature}")

        try:
            matrix_request_queries = query_constructor.create_matrix_request_queries(
                filter_, fields, feature)
        except (query_constructor.MalformedMatrixFilter, query_constructor.MalformedMatrixFeature) as exc:
            self.request_tracker.log_error(f"Query construction failed with error: {str(exc)}")
            raise

        s3_obj_keys = self._format_and_store_queries_in_s3(matrix_request_queries)
        for key in s3_obj_keys:
            self._add_request_query_to_sqs(key, s3_obj_keys[key])
        self.request_tracker.complete_subtask_execution(Subtask.DRIVER)

    def _format_and_store_queries_in_s3(self, queries: dict):
        feature_query = queries[QueryType.FEATURE].format(results_bucket=self.query_results_bucket,
                                                          request_id=self.request_id,
                                                          iam_role=self.redshift_role_arn)
        feature_query_obj_key = self.s3_handler.store_content_in_s3(f"{self.request_id}/{QueryType.FEATURE.value}",
                                                                    feature_query)

        exp_query = queries[QueryType.EXPRESSION].format(results_bucket=self.query_results_bucket,
                                                         request_id=self.request_id,
                                                         iam_role=self.redshift_role_arn)
        exp_query_obj_key = self.s3_handler.store_content_in_s3(f"{self.request_id}/{QueryType.EXPRESSION.value}",
                                                                exp_query)

        cell_query = queries[QueryType.CELL].format(results_bucket=self.query_results_bucket,
                                                    request_id=self.request_id,
                                                    iam_role=self.redshift_role_arn)
        cell_query_obj_key = self.s3_handler.store_content_in_s3(f"{self.request_id}/{QueryType.CELL.value}",
                                                                 cell_query)

        return {
            QueryType.CELL: cell_query_obj_key,
            QueryType.EXPRESSION: exp_query_obj_key,
            QueryType.FEATURE: feature_query_obj_key
        }

    def _add_request_query_to_sqs(self, query_type: QueryType, s3_obj_key: str):
        queue_url = self.query_job_q_url
        payload = {
            'request_id': self.request_id,
            's3_obj_key': s3_obj_key,
            'type': query_type.value
        }
        logger.debug(f"Adding {payload} to sqs {queue_url}")
        self.sqs_handler.add_message_to_queue(queue_url, payload)
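# --- Hedged sketch (not part of the original source): how the named-placeholder
# query templates are rendered in _format_and_store_queries_in_s3 above. The toy
# template, bucket name, and IAM role below are illustrative stand-ins; the real
# templates are full Redshift UNLOAD queries defined elsewhere in the service.
toy_cell_query_template = (
    "UNLOAD ('SELECT * FROM cell') "
    "TO 's3://{results_bucket}/{request_id}/cell_' "
    "IAM_ROLE '{iam_role}';"
)

rendered_query = toy_cell_query_template.format(
    results_bucket="example-query-results-bucket",
    request_id="1234-abcd",
    iam_role="arn:aws:iam::123456789012:role/example-redshift-unload",
)
print(rendered_query)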
예제 #17
0
def get_matrix(request_id: str):

    # There are a few cases to handle here. First, if the request_id is not in
    # the state table at all, then the request was never created and we should
    # return a 404.
    request_tracker = RequestTracker(request_id)
    if not request_tracker.is_initialized:
        return ({
            'message': f"Unable to find job with request ID {request_id}."
        }, requests.codes.not_found)

    in_progress_response = ({
        'request_id': request_id,
        'status': MatrixRequestStatus.IN_PROGRESS.value,
        'matrix_url': "",
        'eta': "",
        'message': f"Request {request_id} has been accepted and is currently being "
                   f"processed. Please try again later."
    }, requests.codes.ok)

    # If the request tracker is not able to retrieve the format, the driver has
    # not yet created the relevant entry in the output table.
    try:
        format = request_tracker.format
    except MatrixException:
        return in_progress_response

    # Failed case
    if request_tracker.error:
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.FAILED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error
        }, requests.codes.ok)
    # Check for failed batch conversion job
    elif request_tracker.batch_job_status == "FAILED":
        request_tracker.log_error(
            "The matrix conversion as a part of the request has failed. "
            "Please retry or contact an HCA admin for help.")
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.FAILED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error
        }, requests.codes.ok)

    # Complete case
    elif request_tracker.is_request_complete():
        matrix_results_bucket = os.environ['MATRIX_RESULTS_BUCKET']
        matrix_results_handler = S3Handler(matrix_results_bucket)

        matrix_key = ""
        if format == MatrixFormat.LOOM.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{format}"
        elif format == MatrixFormat.CSV.value or format == MatrixFormat.MTX.value:
            matrix_key = f"{request_tracker.s3_results_prefix}/{request_id}.{format}.zip"

        matrix_location = f"https://s3.amazonaws.com/{matrix_results_bucket}/{matrix_key}"

        is_empty = False
        if not matrix_results_handler.size(matrix_key):
            is_empty = True
            matrix_location = ""

        if not is_empty:
            message = (
                f"Request {request_id} has successfully completed. "
                f"The resultant expression matrix is available for download at "
                f"{matrix_location}.")
        else:
            message = (
                f"Request {request_id} has successfully completed. "
                f"However, no cells were associated with this request for "
                f"species {request_tracker.genus_species.value}.")

        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.COMPLETE.value,
            'matrix_url': matrix_location,
            'eta': "",
            'message': message
        }, requests.codes.ok)

    # Expired case
    elif request_tracker.is_expired:
        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.EXPIRED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error
        }, requests.codes.ok)

    # Timeout case
    elif request_tracker.timeout:

        return ({
            'request_id': request_id,
            'status': MatrixRequestStatus.FAILED.value,
            'matrix_url': "",
            'eta': "",
            'message': request_tracker.error
        }, requests.codes.ok)
    else:
        return in_progress_response
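# --- Hedged sketch (not part of the original source): how get_matrix above
# builds the download location for a completed request. The bucket name and
# results prefix are illustrative stand-ins.
def matrix_location(bucket: str, results_prefix: str, request_id: str, format_: str) -> str:
    if format_ == "loom":
        key = f"{results_prefix}/{request_id}.{format_}"
    else:  # csv and mtx results are zipped
        key = f"{results_prefix}/{request_id}.{format_}.zip"
    return f"https://s3.amazonaws.com/{bucket}/{key}"


assert matrix_location("results-bucket", "results/abcd", "abcd", "loom").endswith("abcd.loom")
assert matrix_location("results-bucket", "results/abcd", "abcd", "csv").endswith("abcd.csv.zip")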
예제 #18
0
class MatrixConverter:

    def __init__(self, args):
        self.args = args
        self.format = args.format
        self.request_tracker = RequestTracker(args.request_id)
        self.query_results = {}

        self.local_output_filename = os.path.basename(os.path.normpath(args.target_path))
        self.target_path = args.target_path
        self.working_dir = args.working_dir
        self.FS = s3fs.S3FileSystem()

        Logging.set_correlation_id(LOGGER, value=args.request_id)

    def run(self):
        try:
            LOGGER.debug(f"Beginning matrix conversion run for {self.args.request_id}")
            self.query_results = {
                QueryType.CELL: CellQueryResultsReader(self.args.cell_metadata_manifest_key),
                QueryType.EXPRESSION: ExpressionQueryResultsReader(self.args.expression_manifest_key),
                QueryType.FEATURE: FeatureQueryResultsReader(self.args.gene_metadata_manifest_key)
            }

            LOGGER.debug(f"Beginning conversion to {self.format}")
            local_converted_path = getattr(self, f"_to_{self.format}")()
            LOGGER.debug(f"Conversion to {self.format} completed")

            LOGGER.debug(f"Beginning upload to S3")
            self._upload_converted_matrix(local_converted_path, self.target_path)
            LOGGER.debug("Upload to S3 complete, job finished")

            os.remove(local_converted_path)

            self.request_tracker.complete_subtask_execution(Subtask.CONVERTER)
            self.request_tracker.complete_request(duration=(date.get_datetime_now()
                                                            - date.to_datetime(self.request_tracker.creation_date))
                                                  .total_seconds())
        except Exception as e:
            LOGGER.info(f"Matrix Conversion failed on {self.args.request_id} with error {str(e)}")
            self.request_tracker.log_error(str(e))
            raise e

    def _n_slices(self):
        """Return the number of slices associated with this Redshift result.

        Redshift UNLOAD creates one object per "slice" of the cluster. We might want to
        iterate over those objects, so this gets the count of them.
        return len(self.query_results[QueryType.CELL].manifest["part_urls"])

    def _make_directory(self):
        if not self.local_output_filename.endswith(".zip"):
            self.local_output_filename += ".zip"
        results_dir = os.path.join(self.working_dir,
                                   os.path.splitext(self.local_output_filename)[0])
        os.makedirs(results_dir)
        return results_dir

    def _zip_up_matrix_output(self, results_dir, matrix_file_names):
        zipf = zipfile.ZipFile(os.path.join(self.working_dir, self.local_output_filename), 'w')
        for filename in matrix_file_names:
            zipf.write(os.path.join(results_dir, filename),
                       arcname=os.path.join(os.path.basename(results_dir),
                                            filename))
        zipf.close()
        shutil.rmtree(results_dir)
        return os.path.join(self.working_dir, self.local_output_filename)

    def _write_out_gene_dataframe(self, results_dir, output_filename, compression=False):
        gene_df = self.query_results[QueryType.FEATURE].load_results()
        if compression:
            gene_df.to_csv(os.path.join(results_dir, output_filename),
                           index_label="featurekey",
                           sep="\t", compression="gzip")
        else:
            gene_df.to_csv(os.path.join(results_dir, output_filename), index_label="featurekey")
        return gene_df

    def _write_out_cell_dataframe(self, results_dir, output_filename, cell_df, cellkeys, compression=False):
        cell_df = cell_df.reindex(index=cellkeys)
        if compression:
            cell_df.to_csv(os.path.join(results_dir, output_filename),
                           sep='\t',
                           index_label="cellkey", compression="gzip")
        else:
            cell_df.to_csv(os.path.join(results_dir, output_filename), index_label="cellkey")
        return cell_df

    def _to_mtx(self):
        """Write a zip file with an mtx and two metadata tsvs from Redshift query
        manifests.

        Returns:
           output_path: Path to the zip file.
        """
        results_dir = self._make_directory()
        gene_df = self._write_out_gene_dataframe(results_dir, "genes.tsv.gz", compression=True)
        cell_df = self.query_results[QueryType.CELL].load_results()

        # To follow 10x conventions, features are rows and cells are columns
        n_rows = gene_df.shape[0]
        n_cols = cell_df.shape[0]
        n_nonzero = self.query_results[QueryType.EXPRESSION].manifest["record_count"]

        cellkeys = []

        with gzip.open(os.path.join(results_dir, "matrix.mtx.gz"), "w", compresslevel=4) as exp_f:
            # Write the mtx header
            exp_f.write("%%MatrixMarket matrix coordinate real general\n".encode())
            exp_f.write(f"{n_rows} {n_cols} {n_nonzero}\n".encode())

            cell_count = 0
            for slice_idx in range(self._n_slices()):
                for chunk in self.query_results[QueryType.EXPRESSION].load_slice(slice_idx):

                    grouped = chunk.groupby("cellkey")
                    for cell_group in grouped:
                        single_cell_df = cell_group[1]
                        single_cell_coo = single_cell_df.pivot(
                            index="featurekey", columns="cellkey", values="exprvalue").reindex(
                            index=gene_df.index).to_sparse().to_coo()

                        for row, col, value in zip(single_cell_coo.row, single_cell_coo.col, single_cell_coo.data):
                            exp_f.write(f"{row + 1} {col + cell_count + 1} {value}\n".encode())
                        cell_count += 1

                        cellkeys.append(cell_group[0])

        self._write_out_cell_dataframe(results_dir, "cells.tsv.gz", cell_df, cellkeys, compression=True)
        file_names = ["genes.tsv.gz", "matrix.mtx.gz", "cells.tsv.gz"]
        zip_path = self._zip_up_matrix_output(results_dir, file_names)
        return zip_path

    def _to_loom(self):
        """Write a loom file from Redshift query manifests.

        Returns:
           output_path: Path to the new loom file.
        """

        # Put loom on the output filename if it's not already there.
        if not self.local_output_filename.endswith(".loom"):
            self.local_output_filename += ".loom"

        # Read the row (gene) attributes and then set some conventional names
        gene_df = self.query_results[QueryType.FEATURE].load_results()
        gene_df["featurekey"] = gene_df.index
        row_attrs = gene_df.to_dict("series")
        # Not expected to be unique
        row_attrs["Gene"] = row_attrs.pop("featurename")
        row_attrs["Accession"] = row_attrs.pop("featurekey")
        for key, val in row_attrs.items():
            row_attrs[key] = val.values

        loom_parts = []
        loom_part_dir = os.path.join(self.working_dir, ".loom_parts")

        if os.path.exists(loom_part_dir):
            shutil.rmtree(loom_part_dir)

        os.makedirs(loom_part_dir)

        # Iterate over the "slices" produced by the redshift query
        for slice_idx in range(self._n_slices()):

            # Get the cell metadata for all the cells in this slice
            cell_df = self.query_results[QueryType.CELL].load_slice(slice_idx)

            # Iterate over fixed-size chunks of expression data from this
            # slice.
            chunk_idx = 0
            for chunk in self.query_results[QueryType.EXPRESSION].load_slice(slice_idx):
                print(f"Loading chunk {chunk_idx} from slice {slice_idx}")
                sparse_cell_dfs = []

                # Group the data by cellkey and iterate over each cell
                grouped = chunk.groupby("cellkey")
                for cell_group in grouped:
                    single_cell_df = cell_group[1]

                    # Reshape the dataframe so cellkey is a column and features
                    # are rows. Reindex so all dataframes have the same row
                    # order, and then sparsify because the expression data is
                    # usually very sparse.
                    sparse_cell_dfs.append(single_cell_df
                                           .pivot(index="featurekey", columns="cellkey", values="exprvalue")
                                           .reindex(index=row_attrs["Accession"]).to_sparse())

                # Concatenate the cell dataframes together. This is what we'll
                # write to disk.
                if not sparse_cell_dfs:
                    continue
                sparse_expression_matrix = pandas.concat(sparse_cell_dfs, axis=1, copy=True)

                # Get the cell metadata dataframe for just the cells in this
                # chunk.
                chunk_cell_df = cell_df.reindex(index=sparse_expression_matrix.columns)
                chunk_cell_df["cellkey"] = chunk_cell_df.index
                for col in chunk_cell_df.columns:
                    if chunk_cell_df[col].dtype.name == "category":
                        chunk_cell_df[col] = chunk_cell_df[col].astype("object")
                col_attrs = chunk_cell_df.to_dict("series")
                col_attrs["CellID"] = col_attrs.pop("cellkey")

                # loompy expects plain numpy arrays for attribute values, so
                # unwrap each pandas Series.
                for key, val in col_attrs.items():
                    col_attrs[key] = val.values

                # Write the data from this chunk to its own file.
                loom_part_path = os.path.join(loom_part_dir,
                                              f"matrix.{slice_idx}.{chunk_idx}.loom")
                print(f"Writing to {loom_part_path}")
                loompy.create(
                    loom_part_path, sparse_expression_matrix.to_coo(), row_attrs, col_attrs)
                loom_parts.append(loom_part_path)
                chunk_idx += 1

        # Using the loompy method, combine all the chunks together into a
        # single file.
        print(f"Parts complete. Writing to {self.local_output_filename}")
        loompy.combine(loom_parts,
                       key="Accession",
                       output_file=os.path.join(self.working_dir, self.local_output_filename))
        shutil.rmtree(loom_part_dir)

        return os.path.join(self.working_dir, self.local_output_filename)

    def _to_csv(self):
        """Write a zip file with csvs from Redshift query manifests and readme.

        Returns:
           output_path: Path to the new zip file.
        """

        results_dir = self._make_directory()
        gene_df = self._write_out_gene_dataframe(results_dir, "genes.csv")
        cell_df = self.query_results[QueryType.CELL].load_results()

        cellkeys = []
        with open(os.path.join(results_dir, "expression.csv"), "w") as exp_f:
            # Write the CSV's header
            gene_index_string_list = [str(x) for x in gene_df.index.tolist()]
            exp_f.write(','.join(["cellkey"] + gene_index_string_list))
            exp_f.write('\n')

            for slice_idx in range(self._n_slices()):
                for chunk in self.query_results[QueryType.EXPRESSION].load_slice(slice_idx):
                    # Group the data by cellkey and iterate over each cell
                    grouped = chunk.groupby("cellkey")
                    for cell_group in grouped:
                        single_cell_df = cell_group[1]
                        single_cell_df.pivot(
                            index="cellkey", columns="featurekey", values="exprvalue").reindex(
                            columns=gene_df.index).to_csv(exp_f, header=False, na_rep='0')
                        cellkeys.append(cell_group[0])

        self._write_out_cell_dataframe(results_dir, "cells.csv", cell_df, cellkeys)
        file_names = ["genes.csv", "expression.csv", "cells.csv"]
        zip_path = self._zip_up_matrix_output(results_dir, file_names)
        return zip_path

    def _upload_converted_matrix(self, local_path, remote_path):
        """
        Upload the converted matrix to S3.
        Parameters
        ----------
        local_path : str
            Path to the new, converted matrix file
        remote_path : str
            S3 path where the converted matrix will be uploaded
        """
        self.FS.put(local_path, remote_path)
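# --- Hedged sketch (not part of the original source): the format dispatch that
# MatrixConverter.run uses above, i.e. getattr(self, f"_to_{self.format}")(),
# shown on a tiny stand-in class so the wiring is easy to see in isolation.
class ToyConverter:
    def __init__(self, fmt: str):
        self.format = fmt

    def _to_loom(self):
        return "wrote a .loom file"

    def _to_csv(self):
        return "wrote a zip of CSVs"

    def _to_mtx(self):
        return "wrote a zip with matrix.mtx.gz"

    def run(self):
        # Dispatch to the matching _to_<format> converter method.
        return getattr(self, f"_to_{self.format}")()


assert ToyConverter("loom").run() == "wrote a .loom file"
assert ToyConverter("mtx").run() == "wrote a zip with matrix.mtx.gz"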