def test_validating_status_file_validation(self, mock_format_and_send_notification):
    validation_id = str(uuid.uuid4())
    orig_val_id = str(uuid.uuid4())
    area_id = self._create_area()
    s3obj = self.mock_upload_file_to_s3(area_id, 'foo.json')
    upload_area = UploadArea(area_id)
    uploaded_file = UploadedFile(upload_area, s3object=s3obj)
    validation_event = ValidationEvent(file_ids=[uploaded_file.db_id],
                                       validation_id=validation_id,
                                       job_id='12345',
                                       status="SCHEDULED",
                                       docker_image="test_docker_image",
                                       original_validation_id=orig_val_id)
    validation_event.create_record()
    data = {
        "status": "VALIDATING",
        "job_id": validation_event.job_id,
        "payload": uploaded_file.info()
    }
    response = self.client.post(f"/v1/area/{area_id}/update_validation/{validation_id}",
                                headers=self.authentication_header,
                                data=json.dumps(data))
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("validation", validation_id)
    self.assertEqual("test_docker_image", record["docker_image"])
    self.assertEqual(validation_id, record["id"])
    self.assertEqual(orig_val_id, record["original_validation_id"])
    self.assertEqual("VALIDATING", record["status"])
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
    self.assertEqual(None, record["validation_ended_at"])
    self.assertEqual(None, record.get("results"))
    response = self.client.get(f"/v1/area/{area_id}/foo.json/validate")
    validation_status = response.get_json()['validation_status']
    self.assertEqual(validation_status, "VALIDATING")
    mock_format_and_send_notification.assert_not_called()

def test_schedule_validation__for_multiple_files__is_successful(self):
    area_id = self._create_area()
    self.mock_upload_file_to_s3(area_id, 'foo.json')
    self.mock_upload_file_to_s3(area_id, 'foo2.json')
    payload = {
        'validator_image': "humancellatlas/upload-validator-example",
        'files': ['foo.json', 'foo2.json']
    }
    response = self.client.put(
        f"/v1/area/{area_id}/validate",
        headers=self.authentication_header,
        json=payload
    )
    self.assertEqual(response.status_code, 200)
    validation_id = response.json['validation_id']
    validation_record = UploadDB().get_pg_record("validation", validation_id)
    self.assertEqual(validation_record['status'], "SCHEDULING_QUEUED")
    validation_files_records = UploadDB().get_pg_records("validation_files", validation_id,
                                                         column='validation_id')
    file_one_record = UploadDB().get_pg_record("file", f"{area_id}/foo.json", "s3_key")
    file_two_record = UploadDB().get_pg_record("file", f"{area_id}/foo2.json", "s3_key")
    self.assertEqual(len(validation_files_records), 2)
    validation_file_db_ids = [record['file_id'] for record in validation_files_records]
    self.assertEqual(file_one_record['id'] in validation_file_db_ids, True)
    self.assertEqual(file_two_record['id'] in validation_file_db_ids, True)

def test_update_event_with_validation_event(self, mock_format_and_send_notification):
    validation_id = str(uuid.uuid4())
    area_id = self._create_area()
    s3obj = self.mock_upload_file_to_s3(area_id, 'foo.json')
    upload_area = UploadArea(area_id)
    uploaded_file = UploadedFile(upload_area, s3object=s3obj)
    validation_event = ValidationEvent(file_ids=[uploaded_file.db_id],
                                       validation_id=validation_id,
                                       job_id='12345',
                                       status="SCHEDULED")
    validation_event.create_record()

    validation_event.status = "VALIDATING"
    response = update_event(validation_event, uploaded_file.info(), self.client)
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("validation", validation_id)
    self.assertEqual("VALIDATING", record["status"])
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
    self.assertEqual(None, record["validation_ended_at"])
    self.assertEqual(None, record.get("results"))

    validation_event.status = "VALIDATED"
    response = update_event(validation_event, uploaded_file.info(), self.client)
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("validation", validation_id)
    self.assertEqual("VALIDATED", record["status"])
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_ended_at"))))
    self.assertEqual(uploaded_file.info(), record.get("results"))

def setUp(self):
    super().setUp()
    self.area_uuid = str(uuid.uuid4())
    self.upload_area = UploadArea(self.area_uuid)
    self.db = UploadDB()
    self.db.create_pg_record("upload_area", {
        "uuid": self.area_uuid,
        "status": "UNLOCKED",
        "bucket_name": self.upload_config.bucket_name
    })

def test_add_to_validation_sqs__adds_correct_event_to_queue(self):
    uploaded_file = UploadedFile.create(upload_area=self.upload_area,
                                        name="file2",
                                        content_type="application/octet-stream; dcp-type=data",
                                        data="file2_content")
    validation_scheduler = ValidationScheduler(self.upload_area_id, [uploaded_file])
    validation_uuid = validation_scheduler.add_to_validation_sqs(["filename123"],
                                                                 "test_docker_image",
                                                                 {"variable": "variable"},
                                                                 "123456")
    message = self.sqs.meta.client.receive_message(QueueUrl='test_validation_q_url')
    message_body = json.loads(message['Messages'][0]['Body'])
    record = UploadDB().get_pg_record("validation", validation_uuid, column='id')
    self.assertEqual(message_body["filenames"], ["filename123"])
    self.assertEqual(message_body["validation_id"], validation_uuid)
    self.assertEqual(message_body["validator_docker_image"], "test_docker_image")
    self.assertEqual(message_body["environment"], {"variable": "variable"})
    self.assertEqual(message_body["orig_validation_id"], "123456")
    self.assertEqual(message_body["upload_area_uuid"], uploaded_file.upload_area.uuid)
    self.assertEqual(record["status"], "SCHEDULING_QUEUED")

def test_validated_status_file_validation(self, mock_format_and_send_notification):
    validation_id = str(uuid.uuid4())
    area_id = self._create_area()
    s3obj = self.mock_upload_file_to_s3(area_id, 'foo.json')
    upload_area = UploadArea(area_id)
    uploaded_file = UploadedFile(upload_area, s3object=s3obj)
    validation_event = ValidationEvent(file_ids=[uploaded_file.db_id],
                                       validation_id=validation_id,
                                       job_id='12345',
                                       status="SCHEDULED",
                                       docker_image="test_docker_image")
    validation_event.create_record()
    data = {
        "status": "VALIDATING",
        "job_id": validation_event.job_id,
        "payload": uploaded_file.info()
    }
    response = self.client.post(f"/v1/area/{area_id}/update_validation/{validation_id}",
                                headers=self.authentication_header,
                                data=json.dumps(data))
    data = {
        "status": "VALIDATED",
        "job_id": validation_event.job_id,
        "payload": uploaded_file.info()
    }
    response = self.client.post(f"/v1/area/{area_id}/update_validation/{validation_id}",
                                headers=self.authentication_header,
                                data=json.dumps(data))
    self.assertEqual(204, response.status_code)
    mock_format_and_send_notification.assert_called_once_with({
        'upload_area_id': area_id,
        'name': 'foo.json',
        'size': 3,
        'last_modified': s3obj.last_modified.isoformat(),
        'content_type': "application/json",
        'url': f"s3://{self.upload_config.bucket_name}/{area_id}/foo.json",
        'checksums': {'s3_etag': '1', 'sha1': '2', 'sha256': '3', 'crc32c': '4'}
    })
    record = UploadDB().get_pg_record("validation", validation_id)
    self.assertEqual("VALIDATED", record["status"])
    self.assertEqual("test_docker_image", record["docker_image"])
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_started_at"))))
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("validation_ended_at"))))
    self.assertEqual(uploaded_file.info(), record.get("results"))
    response = self.client.get(f"/v1/area/{area_id}/foo.json/validate")
    validation_status = response.get_json()['validation_status']
    self.assertEqual(validation_status, "VALIDATED")

def test_update_event_with_checksum_event(self, mock_format_and_send_notification):
    checksum_id = str(uuid.uuid4())
    area_uuid = self._create_area()
    s3obj = self.mock_upload_file_to_s3(area_uuid, 'foo.json')
    upload_area = UploadArea(area_uuid)
    uploaded_file = UploadedFile(upload_area, s3object=s3obj)
    checksum_event = ChecksumEvent(file_id=uploaded_file.db_id,
                                   checksum_id=checksum_id,
                                   job_id='12345',
                                   status="SCHEDULED")
    checksum_event.create_record()

    checksum_event.status = "CHECKSUMMING"
    response = update_event(checksum_event, uploaded_file.info(), self.client)
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("checksum", checksum_id)
    self.assertEqual("CHECKSUMMING", record["status"])
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("checksum_started_at"))))
    self.assertEqual(None, record["checksum_ended_at"])

    checksum_event.status = "CHECKSUMMED"
    response = update_event(checksum_event, uploaded_file.info(), self.client)
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("checksum", checksum_id)
    self.assertEqual("CHECKSUMMED", record["status"])
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("checksum_started_at"))))
    self.assertEqual("<class 'datetime.datetime'>", str(type(record.get("checksum_ended_at"))))

def health():
    """
    This endpoint is invoked by the DCP-wide status monitoring system.

    It checks the health of the underlying API Gateway and database infrastructure:
    running a simple query confirms that the ECS pgbouncer is up and running and
    talking to RDS.
    """
    db_health_check_query = "SELECT count(*) from upload_area;"
    UploadDB().run_query(db_health_check_query)
    return requests.codes.ok

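# A minimal usage sketch (not part of the service code above): how an external
# monitor might poll this endpoint over HTTP. The "/v1/health" path, host handling
# and function name here are illustrative assumptions; the snippet above only
# shows the handler itself.
import requests

def poll_upload_service_health(api_host: str) -> bool:
    # Returns True when the health endpoint answers with HTTP 200.
    resp = requests.get(f"https://{api_host}/v1/health", timeout=10)
    return resp.status_code == requests.codes.ok
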
class TestDatabase(UploadTestCaseUsingMockAWS):

    def setUp(self):
        super().setUp()
        self.area_uuid = str(uuid.uuid4())
        self.upload_area = UploadArea(self.area_uuid)
        self.db = UploadDB()
        self.db.create_pg_record("upload_area", {
            "uuid": self.area_uuid,
            "status": "UNLOCKED",
            "bucket_name": self.upload_config.bucket_name
        })

    def test_get_pg_record(self):
        result = self.db.get_pg_record("upload_area", self.area_uuid, column='uuid')
        self.assertEqual(result["uuid"], self.area_uuid)
        self.assertEqual(result["bucket_name"], self.upload_config.bucket_name)
        self.assertEqual(result["status"], "UNLOCKED")

    def test_update_pg_record(self):
        before = self.db.get_pg_record("upload_area", self.area_uuid, column='uuid')
        self.assertEqual(before["status"], "UNLOCKED")
        self.db.update_pg_record("upload_area", {
            "uuid": self.area_uuid,
            "status": "LOCKED",
            "bucket_name": self.upload_config.bucket_name
        }, column='uuid')
        after = self.db.get_pg_record("upload_area", self.area_uuid, column='uuid')
        self.assertEqual(after["uuid"], self.area_uuid)
        self.assertEqual(after["bucket_name"], self.upload_config.bucket_name)
        self.assertEqual(after["status"], "LOCKED")

    def test_get_pg_records(self):
        results = self.db.get_pg_records("upload_area", self.area_uuid, column='uuid')
        self.assertEqual(results[0]["uuid"], self.area_uuid)
        self.assertEqual(results[0]["bucket_name"], self.upload_config.bucket_name)
        self.assertEqual(results[0]["status"], "UNLOCKED")

def test_add_upload_area_to_delete_sqs(self):
    area_uuid = self._create_area()
    UploadArea(area_uuid).add_upload_area_to_delete_sqs()
    message = self.sqs.meta.client.receive_message(QueueUrl='delete_sqs_url')
    message_body = json.loads(message['Messages'][0]['Body'])
    self.assertEqual(message_body['area_uuid'], area_uuid)
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual(record['status'], "DELETION_QUEUED")

def test_upload_area_delete_over_timeout(self, mock_retrieve_lambda_timeout):
    area_uuid = self._create_area()
    obj = self.upload_bucket.Object(f'{area_uuid}/test_file')
    obj.put(Body="foo")
    mock_retrieve_lambda_timeout.return_value = 0
    area = UploadArea(area_uuid)
    area.delete()
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual("DELETION_QUEUED", record["status"])

def test_delete_with_id_of_real_non_empty_upload_area(self):
    area_uuid = self._create_area()
    obj = self.upload_bucket.Object(f'{area_uuid}/test_file')
    obj.put(Body="foo")
    response = self.client.delete(f"/v1/area/{area_uuid}", headers=self.authentication_header)
    self.assertEqual(202, response.status_code)
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual("DELETION_QUEUED", record["status"])

def test_locking_of_upload_area(self):
    area_uuid = self._create_area()
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual("UNLOCKED", record["status"])

    response = self.client.post(f"/v1/area/{area_uuid}/lock", headers=self.authentication_header)
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual("LOCKED", record["status"])

    response = self.client.delete(f"/v1/area/{area_uuid}/lock", headers=self.authentication_header)
    self.assertEqual(204, response.status_code)
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual("UNLOCKED", record["status"])

def test_create_with_unused_upload_area_uuid(self):
    area_uuid = str(uuid.uuid4())
    response = self.client.post(f"/v1/area/{area_uuid}", headers=self.authentication_header)
    self.assertEqual(201, response.status_code)
    body = json.loads(response.data)
    self.assertEqual({'uri': f"s3://{self.upload_config.bucket_name}/{area_uuid}/"}, body)
    record = UploadDB().get_pg_record("upload_area", area_uuid, column='uuid')
    self.assertEqual(area_uuid, record["uuid"])
    self.assertEqual(self.upload_config.bucket_name, record["bucket_name"])
    self.assertEqual("UNLOCKED", record["status"])

def test_schedule_validation__with_original_validation_id__retains_original_validation_id(self):
    area_id = self._create_area()
    self.mock_upload_file_to_s3(area_id, 'foo.json')
    self.mock_upload_file_to_s3(area_id, 'foo2.json')
    payload = {
        'validator_image': "humancellatlas/upload-validator-example",
        'files': ['foo.json', 'foo2.json'],
        'original_validation_id': '123456'
    }
    response = self.client.put(
        f"/v1/area/{area_id}/validate",
        headers=self.authentication_header,
        json=payload
    )
    self.assertEqual(200, response.status_code)
    validation_id = response.json['validation_id']
    validation_record = UploadDB().get_pg_record("validation", validation_id)
    self.assertEqual(validation_record['status'], "SCHEDULING_QUEUED")
    self.assertEqual(validation_record['original_validation_id'], "123456")

def test_format_and_send_notification(self, mock_send_notification):
    area_uuid = str(uuid.uuid4())
    upload_area = UploadArea(area_uuid)
    upload_area.update_or_create()
    upload_area._db_load()
    file = upload_area.store_file("test_file_name", "test_file_content",
                                  "application/json; dcp-type=data")
    ingest_notifier = IngestNotifier("file_uploaded", file_id=file.db_id)
    test_payload = {
        'names': "[test_file_name]",
        'upload_area_id': area_uuid
    }
    notification_id = ingest_notifier.format_and_send_notification(test_payload)
    record = UploadDB().get_pg_record("notification", notification_id, column="id")
    self.assertEqual(record['status'], "DELIVERED")
    self.assertEqual(record['file_id'], file.db_id)
    self.assertEqual(record['payload'], test_payload)

class HealthCheck:

    def __init__(self):
        self.env = os.environ['DEPLOYMENT_STAGE']
        self.db = UploadDB()
        logger.debug(f"Running a health check for {self.env}. Results will be posted in #upload-service")
        self.webhook = UploadConfig().slack_webhook
        self.stale_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
                                              "WHERE status='CHECKSUMMING' " \
                                              "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                              "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.stale_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
                                                "WHERE status='VALIDATING' " \
                                                "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.scheduled_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
                                                  "WHERE status='SCHEDULED' " \
                                                  "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                  "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.scheduled_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
                                                    "WHERE status='SCHEDULED' " \
                                                    "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                    "AND updated_at > CURRENT_TIMESTAMP - interval '2 hours'"
        self.undeleted_areas_count_query = "SELECT COUNT(*) FROM upload_area " \
                                           "WHERE created_at > CURRENT_DATE - interval '4 weeks' " \
                                           "AND status != 'DELETED'"
        self.failed_checksum_count_query = "SELECT COUNT(*) FROM checksum " \
                                           "WHERE status='FAILED' " \
                                           "AND updated_at >= NOW() - '1 day'::INTERVAL"
        self.failed_validation_count_query = "SELECT COUNT(*) FROM validation " \
                                             "WHERE status='FAILED' " \
                                             "AND updated_at >= NOW() - '1 day'::INTERVAL"
        self.deadletter_metric_queries = [
            {
                'Id': 'visible_messages',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/SQS',
                        'MetricName': 'ApproximateNumberOfMessagesVisible',
                        'Dimensions': [{
                            'Name': 'QueueName',
                            'Value': f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Average'
                }
            },
            {
                'Id': 'received_messages',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/SQS',
                        'MetricName': 'NumberOfMessagesReceived',
                        'Dimensions': [{
                            'Name': 'QueueName',
                            'Value': f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Average'
                }
            }
        ]
        self.lambda_error_queries = [
            {
                'Id': 'upload_api_lambda_errors',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/Lambda',
                        'MetricName': 'Errors',
                        'Dimensions': [{
                            'Name': 'FunctionName',
                            'Value': f'upload-api-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Sum'
                }
            },
            {
                'Id': 'checksum_daemon_lambda_errors',
                'MetricStat': {
                    'Metric': {
                        'Namespace': 'AWS/Lambda',
                        'MetricName': 'Errors',
                        'Dimensions': [{
                            'Name': 'FunctionName',
                            'Value': f'dcp-upload-csum-{self.env}'
                        }]
                    },
                    'Period': 90000,
                    'Stat': 'Sum'
                }
            }
        ]

    def run_upload_service_health_check(self):
        deadletter_queue_info = self.generate_deadletter_queue_status()
        upload_area_info = self.generate_upload_area_status()
        lambda_info = self.generate_lambda_error_status()

        if deadletter_queue_info == upload_area_info == lambda_info == 'GOOD\n':
            color = 'good'
            status_info = "It's 6 o'clock somewhere and all is well"
        else:
            color = 'bad'
            status_info = (f"DEADLETTER_QUEUE: {deadletter_queue_info}" +
                           f"UPLOAD_AREAS: {upload_area_info}" +
                           f"LAMBDAS: {lambda_info}")

        attachments = [{
            "title": f"Health Check Report for {self.env}:",
            "color": color,
            "text": status_info
        }]
        self.post_message_to_url(self.webhook, {"attachments": attachments})

    def generate_deadletter_queue_status(self):
        deadletter_results = self._query_cloudwatch_metrics_for_past_day(self.deadletter_metric_queries)
        if deadletter_results['received_messages'] == 0:
            deadletter_queue_status = "GOOD\n"
        else:
            deadletter_queue_status = f"{deadletter_results['visible_messages']} in queue, " \
                                      f"{deadletter_results['received_messages']} added in past 24 hrs\n"
        return deadletter_queue_status

    def generate_lambda_error_status(self):
        lambda_error_results = self._query_cloudwatch_metrics_for_past_day(self.lambda_error_queries)
        if lambda_error_results['upload_api_lambda_errors'] == 0 and \
                lambda_error_results['checksum_daemon_lambda_errors'] == 0:
            lambda_error_status = 'GOOD\n'
        else:
            lambda_error_status = f"{lambda_error_results['upload_api_lambda_errors']} errors for Upload API, " \
                                  f"{lambda_error_results['checksum_daemon_lambda_errors']} errors for csum daemon\n"
        return lambda_error_status

    def generate_upload_area_status(self):
        undeleted_upload_area_count = self._query_db_and_return_first_row(self.undeleted_areas_count_query)
        stale_checksumming_areas = self._query_db_and_return_first_row(self.stale_checksum_job_count_query)
        stale_validating_areas = self._query_db_and_return_first_row(self.stale_validation_job_count_query)
        scheduled_checksum_areas = self._query_db_and_return_first_row(self.scheduled_checksum_job_count_query)
        scheduled_validation_areas = self._query_db_and_return_first_row(self.scheduled_validation_job_count_query)
        failed_checksum_count = self._query_db_and_return_first_row(self.failed_checksum_count_query)
        failed_validation_count = self._query_db_and_return_first_row(self.failed_validation_count_query)

        if (stale_checksumming_areas + stale_validating_areas + scheduled_checksum_areas +
                scheduled_validation_areas + failed_checksum_count + failed_validation_count) == 0:
            upload_area_status = 'GOOD\n'
        else:
            upload_area_status = f"{undeleted_upload_area_count} undeleted areas, {stale_checksumming_areas}" \
                                 f" stuck in checksumming, {stale_validating_areas} stuck in validation \n" \
                                 f"{scheduled_checksum_areas} files scheduled for checksumming, " \
                                 f"{scheduled_validation_areas} files scheduled for validation (for over 2 hours)\n" \
                                 f"{failed_checksum_count} files failed batch checksumming in last day\n" \
                                 f"{failed_validation_count} files failed batch validation in last day\n"
        return upload_area_status

    def post_message_to_url(self, url, message):
        body = json.dumps(message)
        headers = {'Content-Type': 'application/json'}
        requests.post(url=url, data=body, headers=headers)

    def _query_cloudwatch_metrics_for_past_day(self, metric_data_queries):
        now = datetime.utcnow()
        yesterday = now - timedelta(hours=24)
        # `client` is a CloudWatch client defined at module scope, outside this excerpt.
        response = client.get_metric_data(MetricDataQueries=metric_data_queries,
                                          StartTime=yesterday,
                                          EndTime=now)
        results = {}
        for info in response['MetricDataResults']:
            if len(info['Values']) > 0:
                results[info['Id']] = int(info['Values'][0])
            else:
                results[info['Id']] = "no value returned"
        return results

    def _query_db_and_return_first_row(self, query):
        query_result = self.db.run_query(query)
        rows = query_result.fetchall()
        if len(rows) > 0:
            results = rows[0][0]
            return results

class BatchWatcher:

    def __init__(self):
        self.api_key = os.environ["INGEST_API_KEY"]
        self.deployment_stage = os.environ["DEPLOYMENT_STAGE"]
        self.api_host = os.environ["API_HOST"]
        self.batch_client = boto3.client("batch")
        self.ec2_client = boto3.client('ec2')
        self.lambda_client = boto3.client('lambda')
        self.db = UploadDB()

    def run(self):
        incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs()
        logger.info(f"Found {len(incomplete_checksum_jobs)} incomplete checksum jobs utilizing batch")
        logger.info(f"Found {len(incomplete_validation_jobs)} incomplete validation jobs utilizing batch")
        incomplete_jobs = incomplete_checksum_jobs + incomplete_validation_jobs
        kill_instances = self.should_instances_be_killed(incomplete_jobs)
        if kill_instances:
            self.find_and_kill_deployment_batch_instances()
            # Re-fetch incomplete checksum and validation jobs after killing instances
            # to catch any newly scheduled jobs.
            incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs()
            for row in incomplete_validation_jobs:
                self.schedule_job(row, "validation")
            for row in incomplete_checksum_jobs:
                self.schedule_job(row, "checksum")
            logger.info(f"Finished rescheduling {len(incomplete_validation_jobs)} validation jobs and "
                        f"{len(incomplete_checksum_jobs)} checksum jobs")
        else:
            logger.info("No new failed jobs detected in batch. Jobs will continue untouched.")

    def should_instances_be_killed(self, rows):
        kill_instances = False
        for row in rows:
            db_id = row["id"]
            job_id = row["job_id"]
            file_id = row["file_id"]
            status = self._get_job_status(job_id)
            if status == "FAILED":
                logger.info(f"database record id {db_id} for file {file_id} represents a failed batch job. "
                            f"Time to kill instances.")
                kill_instances = True
                break
        return kill_instances

    @retry_on_aws_too_many_requests
    def _get_job_status(self, job_id):
        response = self.batch_client.describe_jobs(jobs=[job_id])
        jobs = response.get("jobs")
        if jobs and len(jobs):
            status = jobs[0]["status"]
            return status

    def find_incomplete_batch_jobs(self):
        validation_results = self.db.run_query("SELECT * from validation "
                                               "WHERE status = 'SCHEDULED' or status = 'VALIDATING';")
        validation_rows = validation_results.fetchall()
        checksum_results = self.db.run_query("SELECT * from checksum "
                                             "WHERE(status='SCHEDULED' or status = 'CHECKSUMMING') "
                                             "and job_id is not null;")
        checksum_rows = checksum_results.fetchall()
        return checksum_rows, validation_rows

    def find_and_kill_deployment_batch_instances(self):
        instance_ids = []
        key_name = f"hca-upload-{self.deployment_stage}"
        reservations = self.ec2_client.describe_instances(
            Filters=[
                {'Name': 'key-name', 'Values': [key_name]},
                {'Name': 'instance-state-name', 'Values': ["running"]}
            ])
        instance_groups = [x["Instances"] for x in reservations["Reservations"]]
        for group in instance_groups:
            for instance in group:
                instance_ids.append(instance['InstanceId'])
        if len(instance_ids):
            logger.info(f"Killing instances associated with key {key_name} and ec2 ids {str(instance_ids)}")
            self.ec2_client.terminate_instances(InstanceIds=instance_ids)
        return instance_ids

    def schedule_job(self, row, table_name):
        db_id = row["id"]
        file_id = row["file_id"]
        file_id_split = file_id.split("/")
        upload_area_id = file_id_split[0]
        file_name = file_id_split[1]
        if table_name == "checksum":
            self.invoke_checksum_lambda(file_id)
        elif table_name == "validation":
            docker_image = row["docker_image"]
            # Multiple validation attempts on a file should point to the same original validation id.
            original_validation_id = row["original_validation_id"]
            if not original_validation_id:
                # If there is no original_validation_id, use the db id of the first
                # validation attempt as the original_validation_id.
                original_validation_id = db_id
            self.schedule_validation_job(upload_area_id, file_name, docker_image, original_validation_id)
        logger.info(f"Marking {table_name} record id {db_id} for file {file_id} as failed.")
        self.db.run_query_with_params(f"UPDATE {table_name} SET status = 'FAILED' WHERE id = %s;", (db_id,))

    def schedule_validation_job(self, upload_area_id, file_name, docker_image, original_validation_id):
        headers = {'Api-Key': self.api_key}
        message = {
            "validator_image": docker_image,
            "original_validation_id": original_validation_id
        }
        response = requests.put(self.api_host, headers=headers, json=message)
        if response.status_code == requests.codes.ok:
            logger.info(f"scheduled {upload_area_id}/{file_name} for validation")
        else:
            raise UploadException(f"Failed to schedule {upload_area_id}/{file_name} for validation")

    def invoke_checksum_lambda(self, file_id):
        payload = {
            'Records': [{
                'eventName': 'ObjectCreated:Put',
                "s3": {
                    "bucket": {
                        "name": f"org-humancellatlas-upload-{self.deployment_stage}"
                    },
                    "object": {
                        "key": file_id
                    }
                }
            }]
        }
        self.lambda_client.invoke(FunctionName=f"dcp-upload-csum-{self.deployment_stage}",
                                  InvocationType='Event',
                                  Payload=json.dumps(payload).encode())
        logger.info(f"scheduled {file_id} for checksumming")
