    def test_validating_status_file_validation(self, mock_format_and_send_notification):
        validation_id = str(uuid.uuid4())
        orig_val_id = str(uuid.uuid4())
        area_id = self._create_area()
        s3obj = self.mock_upload_file_to_s3(area_id, 'foo.json')
        upload_area = UploadArea(area_id)
        uploaded_file = UploadedFile(upload_area, s3object=s3obj)
        validation_event = ValidationEvent(file_ids=[uploaded_file.db_id],
                                           validation_id=validation_id,
                                           job_id='12345',
                                           status="SCHEDULED",
                                           docker_image="test_docker_image",
                                           original_validation_id=orig_val_id)
        validation_event.create_record()
        data = {
            "status": "VALIDATING",
            "job_id": validation_event.job_id,
            "payload": uploaded_file.info()
        }
        response = self.client.post(f"/v1/area/{area_id}/update_validation/{validation_id}",
                                    headers=self.authentication_header,
                                    data=json.dumps(data))
        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("validation", validation_id)
        self.assertEqual("test_docker_image", record["docker_image"])
        self.assertEqual(validation_id, record["id"])
        self.assertEqual(orig_val_id, record["original_validation_id"])
        self.assertEqual("VALIDATING", record["status"])
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
        self.assertEqual(None, record["validation_ended_at"])
        self.assertEqual(None, record.get("results"))
        response = self.client.get(f"/v1/area/{area_id}/foo.json/validate")
        validation_status = response.get_json()['validation_status']
        self.assertEqual(validation_status, "VALIDATING")
        mock_format_and_send_notification.assert_not_called()

    def test_schedule_validation__for_multiple_files__is_successful(self):
        area_id = self._create_area()
        self.mock_upload_file_to_s3(area_id, 'foo.json')
        self.mock_upload_file_to_s3(area_id, 'foo2.json')

        payload = {
            'validator_image': "humancellatlas/upload-validator-example",
            'files': ['foo.json', 'foo2.json']
        }
        response = self.client.put(
            f"/v1/area/{area_id}/validate",
            headers=self.authentication_header,
            json=payload
        )

        self.assertEqual(response.status_code, 200)
        validation_id = response.json['validation_id']
        validation_record = UploadDB().get_pg_record("validation", validation_id)
        self.assertEqual(validation_record['status'], "SCHEDULING_QUEUED")
        validation_files_records = UploadDB().get_pg_records("validation_files", validation_id, column='validation_id')
        file_one_record = UploadDB().get_pg_record("file", f"{area_id}/foo.json", "s3_key")
        file_two_record = UploadDB().get_pg_record("file", f"{area_id}/foo2.json", "s3_key")
        self.assertEqual(len(validation_files_records), 2)
        validation_file_db_ids = [record['file_id'] for record in validation_files_records]
        self.assertEqual(file_one_record['id'] in validation_file_db_ids, True)
        self.assertEqual(file_two_record['id'] in validation_file_db_ids, True)
    def test_update_event_with_validation_event(self, mock_format_and_send_notification):

        validation_id = str(uuid.uuid4())
        area_id = self._create_area()
        s3obj = self.mock_upload_file_to_s3(area_id, 'foo.json')
        upload_area = UploadArea(area_id)
        uploaded_file = UploadedFile(upload_area, s3object=s3obj)
        validation_event = ValidationEvent(file_ids=[uploaded_file.db_id],
                                           validation_id=validation_id,
                                           job_id='12345',
                                           status="SCHEDULED")
        validation_event.create_record()
        validation_event.status = "VALIDATING"
        response = update_event(validation_event, uploaded_file.info(), self.client)
        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("validation", validation_id)
        self.assertEqual("VALIDATING", record["status"])
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
        self.assertEqual(None, record["validation_ended_at"])
        self.assertEqual(None, record.get("results"))

        validation_event.status = "VALIDATED"
        response = update_event(validation_event, uploaded_file.info(), self.client)
        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("validation", validation_id)
        self.assertEqual("VALIDATED", record["status"])
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_ended_at"))))
        self.assertEqual(uploaded_file.info(), record.get("results"))
Example #6
    def test_add_to_validation_sqs__adds_correct_event_to_queue(self):
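        # Scheduling a validation via SQS should enqueue a message describing the job
        # and leave the validation record in SCHEDULING_QUEUED.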
        uploaded_file = UploadedFile.create(
            upload_area=self.upload_area,
            name="file2",
            content_type="application/octet-stream; dcp-type=data",
            data="file2_content")
        validation_scheduler = ValidationScheduler(self.upload_area_id,
                                                   [uploaded_file])

        validation_uuid = validation_scheduler.add_to_validation_sqs(
            ["filename123"], "test_docker_image", {"variable": "variable"},
            "123456")

        message = self.sqs.meta.client.receive_message(
            QueueUrl='test_validation_q_url')
        message_body = json.loads(message['Messages'][0]['Body'])
        record = UploadDB().get_pg_record("validation",
                                          validation_uuid,
                                          column='id')
        self.assertEqual(message_body["filenames"], ["filename123"])
        self.assertEqual(message_body["validation_id"], validation_uuid)
        self.assertEqual(message_body["validator_docker_image"],
                         "test_docker_image")
        self.assertEqual(message_body["environment"], {"variable": "variable"})
        self.assertEqual(message_body["orig_validation_id"], "123456")
        self.assertEqual(message_body["upload_area_uuid"],
                         uploaded_file.upload_area.uuid)
        self.assertEqual(record["status"], "SCHEDULING_QUEUED")

    def test_validated_status_file_validation(self, mock_format_and_send_notification):
        validation_id = str(uuid.uuid4())
        area_id = self._create_area()
        s3obj = self.mock_upload_file_to_s3(area_id, 'foo.json')
        upload_area = UploadArea(area_id)
        uploaded_file = UploadedFile(upload_area, s3object=s3obj)
        validation_event = ValidationEvent(file_ids=[uploaded_file.db_id],
                                           validation_id=validation_id,
                                           job_id='12345',
                                           status="SCHEDULED",
                                           docker_image="test_docker_image")
        validation_event.create_record()
        data = {
            "status": "VALIDATING",
            "job_id": validation_event.job_id,
            "payload": uploaded_file.info()
        }
        response = self.client.post(f"/v1/area/{area_id}/update_validation/{validation_id}",
                                    headers=self.authentication_header,
                                    data=json.dumps(data))
        data = {
            "status": "VALIDATED",
            "job_id": validation_event.job_id,
            "payload": uploaded_file.info()
        }
        response = self.client.post(f"/v1/area/{area_id}/update_validation/{validation_id}",
                                    headers=self.authentication_header,
                                    data=json.dumps(data))
        self.assertEqual(204, response.status_code)
        mock_format_and_send_notification.assert_called_once_with({
            'upload_area_id': area_id,
            'name': 'foo.json',
            'size': 3,
            'last_modified': s3obj.last_modified.isoformat(),
            'content_type': "application/json",
            'url': f"s3://{self.upload_config.bucket_name}/{area_id}/foo.json",
            'checksums': {'s3_etag': '1', 'sha1': '2', 'sha256': '3', 'crc32c': '4'}
        })
        record = UploadDB().get_pg_record("validation", validation_id)
        self.assertEqual("VALIDATED", record["status"])
        self.assertEqual("test_docker_image", record["docker_image"])
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_ended_at"))))
        self.assertEqual(uploaded_file.info(), record.get("results"))
        response = self.client.get(f"/v1/area/{area_id}/foo.json/validate")
        validation_status = response.get_json()['validation_status']
        self.assertEqual(validation_status, "VALIDATED")

    def test_update_event_with_checksum_event(self, mock_format_and_send_notification):

        checksum_id = str(uuid.uuid4())
        area_uuid = self._create_area()
        s3obj = self.mock_upload_file_to_s3(area_uuid, 'foo.json')
        upload_area = UploadArea(area_uuid)
        uploaded_file = UploadedFile(upload_area, s3object=s3obj)
        checksum_event = ChecksumEvent(file_id=uploaded_file.db_id,
                                       checksum_id=checksum_id,
                                       job_id='12345',
                                       status="SCHEDULED")
        checksum_event.create_record()

        checksum_event.status = "CHECKSUMMING"
        response = update_event(checksum_event, uploaded_file.info(), self.client)
        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("checksum", checksum_id)
        self.assertEqual("CHECKSUMMING", record["status"])
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("checksum_started_at"))))
        self.assertEqual(None, record["checksum_ended_at"])

        checksum_event.status = "CHECKSUMMED"
        response = update_event(checksum_event, uploaded_file.info(), self.client)
        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("checksum", checksum_id)
        self.assertEqual("CHECKSUMMED", record["status"])
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("checksum_started_at"))))
        self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("checksum_ended_at"))))
Example #9
def health():
    """
    This endpoint is invoked by the DCP-wide status monitoring system.
    It checks the health of the underlying API Gateway and database infrastructure:
    running a simple query confirms that the ECS PgBouncer is up and talking to RDS.
    """
    db_health_check_query = "SELECT count(*) from upload_area;"
    UploadDB().run_query(db_health_check_query)
    return requests.codes.ok
Example #10
class TestDatabase(UploadTestCaseUsingMockAWS):
    def setUp(self):
        super().setUp()
        self.area_uuid = str(uuid.uuid4())
        self.upload_area = UploadArea(self.area_uuid)
        self.db = UploadDB()

        self.db.create_pg_record(
            "upload_area", {
                "uuid": self.area_uuid,
                "status": "UNLOCKED",
                "bucket_name": self.upload_config.bucket_name
            })

    def test_get_pg_record(self):
        result = self.db.get_pg_record("upload_area",
                                       self.area_uuid,
                                       column='uuid')

        self.assertEqual(result["uuid"], self.area_uuid)
        self.assertEqual(result["bucket_name"], self.upload_config.bucket_name)
        self.assertEqual(result["status"], "UNLOCKED")

    def test_update_pg_record(self):
        before = self.db.get_pg_record("upload_area",
                                       self.area_uuid,
                                       column='uuid')
        self.assertEqual(before["status"], "UNLOCKED")

        self.db.update_pg_record("upload_area", {
            "uuid": self.area_uuid,
            "status": "LOCKED",
            "bucket_name": self.upload_config.bucket_name
        },
                                 column='uuid')

        after = self.db.get_pg_record("upload_area",
                                      self.area_uuid,
                                      column='uuid')
        self.assertEqual(after["uuid"], self.area_uuid)
        self.assertEqual(after["bucket_name"], self.upload_config.bucket_name)
        self.assertEqual(after["status"], "LOCKED")

    def test_get_pg_records(self):
        results = self.db.get_pg_records("upload_area",
                                         self.area_uuid,
                                         column='uuid')

        self.assertEqual(results[0]["uuid"], self.area_uuid)
        self.assertEqual(results[0]["bucket_name"],
                         self.upload_config.bucket_name)
        self.assertEqual(results[0]["status"], "UNLOCKED")
Example #11
    def test_add_upload_area_to_delete_sqs(self):
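        # Queueing an area for deletion should publish its uuid to the delete SQS queue
        # and mark the upload_area record DELETION_QUEUED.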
        area_uuid = self._create_area()

        UploadArea(area_uuid).add_upload_area_to_delete_sqs()
        message = self.sqs.meta.client.receive_message(
            QueueUrl='delete_sqs_url')

        message_body = json.loads(message['Messages'][0]['Body'])
        self.assertEqual(message_body['area_uuid'], area_uuid)
        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual(record['status'], "DELETION_QUEUED")
Example #12
    def test_upload_area_delete_over_timeout(self,
                                             mock_retrieve_lambda_timeout):
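        # With the remaining lambda timeout mocked to 0, delete() presumably cannot
        # finish inline and falls back to queueing the deletion instead.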
        area_uuid = self._create_area()
        obj = self.upload_bucket.Object(f'{area_uuid}/test_file')
        obj.put(Body="foo")
        mock_retrieve_lambda_timeout.return_value = 0

        area = UploadArea(area_uuid)
        area.delete()

        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual("DELETION_QUEUED", record["status"])
Example #13
    def test_delete_with_id_of_real_non_empty_upload_area(self):
        area_uuid = self._create_area()

        obj = self.upload_bucket.Object(f'{area_uuid}/test_file')
        obj.put(Body="foo")

        response = self.client.delete(f"/v1/area/{area_uuid}",
                                      headers=self.authentication_header)

        self.assertEqual(202, response.status_code)
        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual("DELETION_QUEUED", record["status"])
Example #14
    def test_locking_of_upload_area(self):
        area_uuid = self._create_area()
        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual("UNLOCKED", record["status"])

        response = self.client.post(f"/v1/area/{area_uuid}/lock",
                                    headers=self.authentication_header)

        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual("LOCKED", record["status"])

        response = self.client.delete(f"/v1/area/{area_uuid}/lock",
                                      headers=self.authentication_header)

        self.assertEqual(204, response.status_code)
        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual("UNLOCKED", record["status"])
Example #15
    def test_create_with_unused_upload_area_uuid(self):
        area_uuid = str(uuid.uuid4())

        response = self.client.post(f"/v1/area/{area_uuid}",
                                    headers=self.authentication_header)

        self.assertEqual(201, response.status_code)
        body = json.loads(response.data)
        self.assertEqual(
            {'uri': f"s3://{self.upload_config.bucket_name}/{area_uuid}/"},
            body)

        record = UploadDB().get_pg_record("upload_area",
                                          area_uuid,
                                          column='uuid')
        self.assertEqual(area_uuid, record["uuid"])
        self.assertEqual(self.upload_config.bucket_name, record["bucket_name"])
        self.assertEqual("UNLOCKED", record["status"])
    def test_schedule_validation__with_original_validation_id__retains_original_validation_id(self):
        area_id = self._create_area()
        self.mock_upload_file_to_s3(area_id, 'foo.json')
        self.mock_upload_file_to_s3(area_id, 'foo2.json')

        payload = {
            'validator_image': "humancellatlas/upload-validator-example",
            'files': ['foo.json', 'foo2.json'],
            'original_validation_id': '123456'
        }
        response = self.client.put(
            f"/v1/area/{area_id}/validate",
            headers=self.authentication_header,
            json=payload
        )

        self.assertEqual(200, response.status_code)
        validation_id = response.json['validation_id']
        validation_record = UploadDB().get_pg_record("validation", validation_id)
        self.assertEqual(validation_record['status'], "SCHEDULING_QUEUED")
        self.assertEqual(validation_record['original_validation_id'], "123456")
Example #17
    def test_format_and_send_notification(self, mock_send_notification):
        area_uuid = str(uuid.uuid4())
        upload_area = UploadArea(area_uuid)
        upload_area.update_or_create()
        upload_area._db_load()
        file = upload_area.store_file("test_file_name", "test_file_content",
                                      "application/json; dcp-type=data")
        ingest_notifier = IngestNotifier("file_uploaded", file_id=file.db_id)

        test_payload = {
            'names': "[test_file_name]",
            'upload_area_id': area_uuid
        }
        notification_id = ingest_notifier.format_and_send_notification(
            test_payload)

        record = UploadDB().get_pg_record("notification",
                                          notification_id,
                                          column="id")
        self.assertEqual(record['status'], "DELIVERED")
        self.assertEqual(record['file_id'], file.db_id)
        self.assertEqual(record['payload'], test_payload)
Example #19
class HealthCheck:
    def __init__(self):
        self.env = os.environ['DEPLOYMENT_STAGE']
        self.db = UploadDB()
        logger.debug(
            f"Running a health check for {self.env}. Results will be posted in #upload-service"
        )
        self.webhook = UploadConfig().slack_webhook

        self.stale_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
                                              "WHERE status='CHECKSUMMING' " \
                                              "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                              "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.stale_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
                                                "WHERE status='VALIDATING' " \
                                                "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.scheduled_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
                                                  "WHERE status='SCHEDULED' " \
                                                  "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                  "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.scheduled_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
                                                    "WHERE status='SCHEDULED' " \
                                                    "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                    "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.undeleted_areas_count_query = "SELECT COUNT(*) FROM upload_area " \
                                           "WHERE created_at > CURRENT_DATE - interval '4 weeks' " \
                                           "AND status != 'DELETED'"
        self.failed_checksum_count_query = "SELECT COUNT(*) FROM checksum " \
                                           "WHERE status='FAILED' " \
                                           "AND updated_at >= NOW() - '1 day'::INTERVAL"
        self.failed_validation_count_query = "SELECT COUNT(*) FROM validation " \
                                             "WHERE status='FAILED' " \
                                             "AND updated_at >= NOW() - '1 day'::INTERVAL"
        self.deadletter_metric_queries = [{
            'Id': 'visible_messages',
            'MetricStat': {
                'Metric': {
                    'Namespace':
                    'AWS/SQS',
                    'MetricName':
                    'ApproximateNumberOfMessagesVisible',
                    'Dimensions': [{
                        'Name':
                        'QueueName',
                        'Value':
                        f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Average'
            }
        }, {
            'Id': 'received_messages',
            'MetricStat': {
                'Metric': {
                    'Namespace':
                    'AWS/SQS',
                    'MetricName':
                    'NumberOfMessagesReceived',
                    'Dimensions': [{
                        'Name':
                        'QueueName',
                        'Value':
                        f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Average'
            }
        }]
        self.lambda_error_queries = [{
            'Id': 'upload_api_lambda_errors',
            'MetricStat': {
                'Metric': {
                    'Namespace':
                    'AWS/Lambda',
                    'MetricName':
                    'Errors',
                    'Dimensions': [{
                        'Name': 'FunctionName',
                        'Value': f'upload-api-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Sum'
            }
        }, {
            'Id': 'checksum_daemon_lambda_errors',
            'MetricStat': {
                'Metric': {
                    'Namespace':
                    'AWS/Lambda',
                    'MetricName':
                    'Errors',
                    'Dimensions': [{
                        'Name': 'FunctionName',
                        'Value': f'dcp-upload-csum-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Sum'
            }
        }]

    def run_upload_service_health_check(self):
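        # Aggregate the deadletter-queue, upload-area and lambda reports and post a
        # colored summary to the Slack webhook.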
        deadletter_queue_info = self.generate_deadletter_queue_status()
        upload_area_info = self.generate_upload_area_status()
        lambda_info = self.generate_lambda_error_status()

        if deadletter_queue_info == upload_area_info == lambda_info == 'GOOD\n':
            color = 'good'
            status_info = "It's 6 o'clock somewhere and all is well"
        else:
            color = 'bad'
            status_info = (f"DEADLETTER_QUEUE: {deadletter_queue_info}" +
                           f"UPLOAD_AREAS: {upload_area_info}" +
                           f"LAMBDAS: {lambda_info}")

        attachments = [{
            "title": f"Health Check Report for {self.env}:",
            "color": color,
            "text": status_info
        }]

        self.post_message_to_url(self.webhook, {"attachments": attachments})

    def generate_deadletter_queue_status(self):
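        # GOOD when no messages arrived on the pre-checksum deadletter queue in the past 24 hours.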
        deadletter_results = self._query_cloudwatch_metrics_for_past_day(
            self.deadletter_metric_queries)
        if deadletter_results['received_messages'] == 0:
            deadletter_queue_status = "GOOD\n"
        else:
            deadletter_queue_status = f"{deadletter_results['visible_messages']} in queue, " \
                                      f"{deadletter_results['received_messages']} added in past 24 hrs\n"
        return deadletter_queue_status

    def generate_lambda_error_status(self):
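        # GOOD when neither the Upload API lambda nor the checksum daemon lambda reported errors in the past day.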
        lambda_error_results = self._query_cloudwatch_metrics_for_past_day(
            self.lambda_error_queries)
        if lambda_error_results['upload_api_lambda_errors'] == 0 and \
                lambda_error_results['checksum_daemon_lambda_errors'] == 0:
            lambda_error_status = 'GOOD\n'
        else:
            lambda_error_status = f"{lambda_error_results['upload_api_lambda_errors']} errors for Upload API, " \
                                  f"{lambda_error_results['checksum_daemon_lambda_errors']} errors for csum daemon\n"
        return lambda_error_status

    def generate_upload_area_status(self):
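        # Summarize undeleted areas plus stuck, scheduled and failed checksum/validation jobs from the database.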
        undeleted_upload_area_count = self._query_db_and_return_first_row(
            self.undeleted_areas_count_query)
        stale_checksumming_areas = self._query_db_and_return_first_row(
            self.stale_checksum_job_count_query)
        stale_validating_areas = self._query_db_and_return_first_row(
            self.stale_validation_job_count_query)
        scheduled_checksum_areas = self._query_db_and_return_first_row(
            self.scheduled_checksum_job_count_query)
        scheduled_validation_areas = self._query_db_and_return_first_row(
            self.scheduled_validation_job_count_query)
        failed_checksum_count = self._query_db_and_return_first_row(
            self.failed_checksum_count_query)
        failed_validation_count = self._query_db_and_return_first_row(
            self.failed_validation_count_query)
        if (stale_checksumming_areas + stale_validating_areas +
                scheduled_checksum_areas + scheduled_validation_areas +
                failed_checksum_count + failed_validation_count) == 0:
            upload_area_status = 'GOOD\n'
        else:
            upload_area_status = f"{undeleted_upload_area_count} undeleted areas, {stale_checksumming_areas}" \
                                 f" stuck in checksumming, {stale_validating_areas} stuck in validation \n" \
                                 f"{scheduled_checksum_areas} files scheduled for checksumming, " \
                                 f"{scheduled_validation_areas} files scheduled for validation (for over 2 hours)\n" \
                                 f"{failed_checksum_count} files failed batch checksumming in last day\n" \
                                 f"{failed_validation_count} files failed batch validation in last day\n"
        return upload_area_status

    def post_message_to_url(self, url, message):
        body = json.dumps(message)
        headers = {'Content-Type': 'application/json'}
        requests.post(url=url, data=body, headers=headers)

    def _query_cloudwatch_metrics_for_past_day(self, metric_data_queries):
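        # NOTE: `client` is assumed to be a module-level boto3 CloudWatch client
        # (boto3.client('cloudwatch')) defined outside this snippet.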
        now = datetime.utcnow()
        yesterday = now - timedelta(hours=24)
        response = client.get_metric_data(
            MetricDataQueries=metric_data_queries,
            StartTime=yesterday,
            EndTime=now)
        results = {}
        for info in response['MetricDataResults']:
            if len(info['Values']) > 0:
                results[info['Id']] = int(info['Values'][0])
            else:
                results[info['Id']] = "no value returned"
        return results

    def _query_db_and_return_first_row(self, query):
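        # Return the first column of the first row (the COUNT(*) value), or None if the query produced no rows.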
        query_result = self.db.run_query(query)
        rows = query_result.fetchall()
        if len(rows) > 0:
            results = rows[0][0]
            return results
Example #20
class BatchWatcher:
    def __init__(self):
        self.api_key = os.environ["INGEST_API_KEY"]
        self.deployment_stage = os.environ["DEPLOYMENT_STAGE"]
        self.api_host = os.environ["API_HOST"]
        self.batch_client = boto3.client("batch")
        self.ec2_client = boto3.client('ec2')
        self.lambda_client = boto3.client('lambda')
        self.db = UploadDB()

    def run(self):
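        # If any incomplete Batch job has failed, kill the deployment's Batch EC2 instances
        # and reschedule the remaining incomplete work.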
        incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs(
        )
        logger.info(
            f"Found {len(incomplete_checksum_jobs)} incomplete checksum jobs utilizing batch"
        )
        logger.info(
            f"Found {len(incomplete_validation_jobs)} incomplete validation jobs utilizing batch"
        )
        incomplete_jobs = incomplete_checksum_jobs + incomplete_validation_jobs
        kill_instances = self.should_instances_be_killed(incomplete_jobs)
        if kill_instances:
            self.find_and_kill_deployment_batch_instances()
            # Re-fetch incomplete checksum and validation jobs after killing the instances
            # to catch any newly scheduled jobs
            incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs(
            )
            for row in incomplete_validation_jobs:
                self.schedule_job(row, "validation")
            for row in incomplete_checksum_jobs:
                self.schedule_job(row, "checksum")
            logger.info(
                f"Finished rescheduling {len(incomplete_validation_jobs)} validation jobs and \
                {len(incomplete_checksum_jobs)} checksum jobs")
        else:
            logger.info(
                "No new failed jobs detected in batch. Jobs will continue untouched."
            )

    def should_instances_be_killed(self, rows):
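        # Return True as soon as any of the given checksum/validation rows maps to a FAILED Batch job.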
        kill_instances = False
        for row in rows:
            db_id = row["id"]
            job_id = row["job_id"]
            file_id = row["file_id"]
            status = self._get_job_status(job_id)
            if status == "FAILED":
                logger.info(
                    f"database record id {db_id} for file {file_id} represents a failed batch job. \
                    Time to kill instances.")
                kill_instances = True
                break
        return kill_instances

    @retry_on_aws_too_many_requests
    def _get_job_status(self, job_id):
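        # Return the AWS Batch status string for the job (e.g. "FAILED"), or None if the job is unknown.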
        response = self.batch_client.describe_jobs(jobs=[job_id])
        jobs = response.get("jobs")
        if jobs and len(jobs):
            status = jobs[0]["status"]
            return status

    def find_incomplete_batch_jobs(self):
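        # Incomplete means validation rows still SCHEDULED/VALIDATING, and checksum rows
        # still SCHEDULED/CHECKSUMMING that have a Batch job id.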
        validation_results = self.db.run_query(
            "SELECT * from validation "
            "WHERE status = 'SCHEDULED' or status = 'VALIDATING';")
        validation_rows = validation_results.fetchall()
        checksum_results = self.db.run_query(
            "SELECT * from checksum "
            "WHERE(status='SCHEDULED' or status = 'CHECKSUMMING') "
            "and job_id is not null;")
        checksum_rows = checksum_results.fetchall()
        return checksum_rows, validation_rows

    def find_and_kill_deployment_batch_instances(self):
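        # Terminate every running EC2 instance launched with this deployment's hca-upload key pair and return their ids.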
        instance_ids = []
        key_name = f"hca-upload-{self.deployment_stage}"
        reservations = self.ec2_client.describe_instances(
            Filters=[{
                'Name': 'key-name',
                'Values': [key_name]
            }, {
                'Name': 'instance-state-name',
                'Values': ["running"]
            }])

        instance_groups = [
            x["Instances"] for x in reservations["Reservations"]
        ]
        for group in instance_groups:
            for instance in group:
                instance_ids.append(instance['InstanceId'])
        if len(instance_ids):
            logger.info(
                f"Killing instances associated with key {key_name} and ec2 ids {str(instance_ids)}"
            )
            self.ec2_client.terminate_instances(InstanceIds=instance_ids)
        return instance_ids

    def schedule_job(self, row, table_name):
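        # file_id is stored as "<upload_area_uuid>/<file_name>"; resubmit the work, then mark the old record FAILED.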
        db_id = row["id"]
        file_id = row["file_id"]
        file_id_split = file_id.split("/")
        upload_area_id = file_id_split[0]
        file_name = file_id_split[1]
        if table_name == "checksum":
            self.invoke_checksum_lambda(file_id)
        elif table_name == "validation":
            docker_image = row["docker_image"]
            # Multiple validation attempts on a file should point to the same original validation id
            original_validation_id = row["original_validation_id"]
            if not original_validation_id:
                # If there is no original_validation_id, use the db id of
                # this first validation attempt as the original_validation_id.
                original_validation_id = db_id
            self.schedule_validation_job(upload_area_id, file_name,
                                         docker_image, original_validation_id)
        logger.info(
            f"Marking {table_name} record id {db_id} for file {file_id} as failed."
        )
        self.db.run_query_with_params(
            f"UPDATE {table_name} SET status = 'FAILED' \
            WHERE id = %s;", (db_id,))

    def schedule_validation_job(self, upload_area_id, file_name, docker_image,
                                original_validation_id):
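        # Ask the Upload API to reschedule validation; this snippet PUTs to api_host as
        # given, which is assumed to point at the appropriate validate endpoint.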
        headers = {'Api-Key': self.api_key}
        message = {
            "validator_image": docker_image,
            "original_validation_id": original_validation_id
        }
        response = requests.put(self.api_host, headers=headers, json=message)
        if response.status_code == requests.codes.ok:
            logger.info(
                f"scheduled {upload_area_id}/{file_name} for validation")
        else:
            raise UploadException(
                f"Failed to schedule {upload_area_id}/{file_name} for validation"
            )

    def invoke_checksum_lambda(self, file_id):
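        # Re-drive checksumming asynchronously by replaying a synthetic S3 ObjectCreated:Put
        # event to the checksum daemon lambda.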
        payload = {
            'Records': [{
                'eventName': 'ObjectCreated:Put',
                "s3": {
                    "bucket": {
                        "name":
                        f"org-humancellatlas-upload-{self.deployment_stage}"
                    },
                    "object": {
                        "key": file_id
                    }
                }
            }]
        }
        self.lambda_client.invoke(
            FunctionName=f"dcp-upload-csum-{self.deployment_stage}",
            InvocationType='Event',
            Payload=json.dumps(payload).encode())
        logger.info(f"scheduled {file_id} for checksumming")