Example #1
import json
import logging
import os
from datetime import datetime, timedelta

import boto3
import requests

# UploadDB and UploadConfig come from the surrounding upload-service codebase;
# the import paths below are assumed.
from upload.common.database import UploadDB
from upload.common.upload_config import UploadConfig

logger = logging.getLogger(__name__)
client = boto3.client('cloudwatch')  # used by _query_cloudwatch_metrics_for_past_day


class HealthCheck:
    def __init__(self):
        self.env = os.environ['DEPLOYMENT_STAGE']
        self.db = UploadDB()
        logger.debug(
            f"Running a health check for {self.env}. Results will be posted in #upload-service"
        )
        self.webhook = UploadConfig().slack_webhook

        # A job counts as stale/stuck when it has NOT been updated for over 2 hours,
        # hence updated_at < CURRENT_TIMESTAMP - interval '2 hours'.
        self.stale_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
                                              "WHERE status='CHECKSUMMING' " \
                                              "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                              "AND updated_at < CURRENT_TIMESTAMP - interval '2 hours'"
        self.stale_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
                                                "WHERE status='VALIDATING' " \
                                                "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                "AND updated_at < CURRENT_TIMESTAMP - interval '2 hours'"
        self.scheduled_checksum_job_count_query = "SELECT COUNT(*) FROM checksum " \
                                                  "WHERE status='SCHEDULED' " \
                                                  "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                  "AND updated_at < CURRENT_TIMESTAMP - interval '2 hours'"
        self.scheduled_validation_job_count_query = "SELECT COUNT(*) FROM validation " \
                                                    "WHERE status='SCHEDULED' " \
                                                    "AND created_at > CURRENT_DATE - interval '4 weeks' " \
                                                    "AND updated_at < CURRENT_TIMESTAMP - interval '2 hours'"
        self.undeleted_areas_count_query = "SELECT COUNT(*) FROM upload_area " \
                                           "WHERE created_at > CURRENT_DATE - interval '4 weeks' " \
                                           "AND status != 'DELETED'"
        self.failed_checksum_count_query = "SELECT COUNT(*) FROM checksum " \
                                           "WHERE status='FAILED' " \
                                           "AND updated_at >= NOW() - '1 day'::INTERVAL"
        self.failed_validation_count_query = "SELECT COUNT(*) FROM validation " \
                                             "WHERE status='FAILED' " \
                                             "AND updated_at >= NOW() - '1 day'::INTERVAL"
        self.deadletter_metric_queries = [{
            'Id': 'visible_messages',
            'MetricStat': {
                'Metric': {
                    'Namespace': 'AWS/SQS',
                    'MetricName': 'ApproximateNumberOfMessagesVisible',
                    'Dimensions': [{
                        'Name': 'QueueName',
                        'Value': f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Average'
            }
        }, {
            'Id': 'received_messages',
            'MetricStat': {
                'Metric': {
                    'Namespace': 'AWS/SQS',
                    'MetricName': 'NumberOfMessagesReceived',
                    'Dimensions': [{
                        'Name': 'QueueName',
                        'Value': f'dcp-upload-pre-csum-deadletter-queue-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Average'
            }
        }]
        self.lambda_error_queries = [{
            'Id': 'upload_api_lambda_errors',
            'MetricStat': {
                'Metric': {
                    'Namespace': 'AWS/Lambda',
                    'MetricName': 'Errors',
                    'Dimensions': [{
                        'Name': 'FunctionName',
                        'Value': f'upload-api-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Sum'
            }
        }, {
            'Id': 'checksum_daemon_lambda_errors',
            'MetricStat': {
                'Metric': {
                    'Namespace': 'AWS/Lambda',
                    'MetricName': 'Errors',
                    'Dimensions': [{
                        'Name': 'FunctionName',
                        'Value': f'dcp-upload-csum-{self.env}'
                    }]
                },
                'Period': 90000,
                'Stat': 'Sum'
            }
        }]

    def run_upload_service_health_check(self):
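        """Run the deadletter, upload area and lambda checks and post a colored summary to Slack."""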
        deadletter_queue_info = self.generate_deadletter_queue_status()
        upload_area_info = self.generate_upload_area_status()
        lambda_info = self.generate_lambda_error_status()

        if deadletter_queue_info == upload_area_info == lambda_info == 'GOOD\n':
            color = 'good'
            status_info = "It's 6 o'clock somewhere and all is well"
        else:
            color = 'danger'  # Slack attachment colors are 'good', 'warning', 'danger' or a hex code
            status_info = (f"DEADLETTER_QUEUE: {deadletter_queue_info}" +
                           f"UPLOAD_AREAS: {upload_area_info}" +
                           f"LAMBDAS: {lambda_info}")

        attachments = [{
            "title": f"Health Check Report for {self.env}:",
            "color": color,
            "text": status_info
        }]

        self.post_message_to_url(self.webhook, {"attachments": attachments})

    def generate_deadletter_queue_status(self):
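        """Report 'GOOD' when the pre-checksum deadletter queue received no messages in the past day."""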
        deadletter_results = self._query_cloudwatch_metrics_for_past_day(
            self.deadletter_metric_queries)
        if deadletter_results['received_messages'] == 0:
            deadletter_queue_status = "GOOD\n"
        else:
            deadletter_queue_status = f"{deadletter_results['visible_messages']} in queue, " \
                                      f"{deadletter_results['received_messages']} added in past 24 hrs\n"
        return deadletter_queue_status

    def generate_lambda_error_status(self):
        lambda_error_results = self._query_cloudwatch_metrics_for_past_day(
            self.lambda_error_queries)
        if lambda_error_results['upload_api_lambda_errors'] == 0 and \
                lambda_error_results['checksum_daemon_lambda_errors'] == 0:
            lambda_error_status = 'GOOD\n'
        else:
            lambda_error_status = f"{lambda_error_results['upload_api_lambda_errors']} errors for Upload API, " \
                                  f"{lambda_error_results['checksum_daemon_lambda_errors']} errors for csum daemon\n"
        return lambda_error_status

    def generate_upload_area_status(self):
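        """Summarize stuck, scheduled and failed checksum/validation jobs from the database."""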
        undeleted_upload_area_count = self._query_db_and_return_first_row(
            self.undeleted_areas_count_query)
        stale_checksumming_areas = self._query_db_and_return_first_row(
            self.stale_checksum_job_count_query)
        stale_validating_areas = self._query_db_and_return_first_row(
            self.stale_validation_job_count_query)
        scheduled_checksum_areas = self._query_db_and_return_first_row(
            self.scheduled_checksum_job_count_query)
        scheduled_validation_areas = self._query_db_and_return_first_row(
            self.scheduled_validation_job_count_query)
        failed_checksum_count = self._query_db_and_return_first_row(
            self.failed_checksum_count_query)
        failed_validation_count = self._query_db_and_return_first_row(
            self.failed_validation_count_query)
        if (stale_checksumming_areas + stale_validating_areas +
                scheduled_checksum_areas + scheduled_validation_areas +
                failed_checksum_count + failed_validation_count) == 0:
            upload_area_status = 'GOOD\n'
        else:
            upload_area_status = f"{undeleted_upload_area_count} undeleted areas, {stale_checksumming_areas}" \
                                 f" stuck in checksumming, {stale_validating_areas} stuck in validation \n" \
                                 f"{scheduled_checksum_areas} files scheduled for checksumming, " \
                                 f"{scheduled_validation_areas} files scheduled for validation (for over 2 hours)\n" \
                                 f"{failed_checksum_count} files failed batch checksumming in last day\n" \
                                 f"{failed_validation_count} files failed batch validation in last day\n"
        return upload_area_status

    def post_message_to_url(self, url, message):
        body = json.dumps(message)
        headers = {'Content-Type': 'application/json'}
        requests.post(url=url, data=body, headers=headers)

    def _query_cloudwatch_metrics_for_past_day(self, metric_data_queries):
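        """Fetch the given CloudWatch metrics over the past 24 hours, keyed by query Id.

        A metric with no datapoints maps to the string "no value returned" rather than a number.
        """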
        now = datetime.utcnow()
        yesterday = now - timedelta(hours=24)
        response = client.get_metric_data(
            MetricDataQueries=metric_data_queries,
            StartTime=yesterday,
            EndTime=now)
        results = {}
        for info in response['MetricDataResults']:
            if len(info['Values']) > 0:
                results[info['Id']] = int(info['Values'][0])
            else:
                results[info['Id']] = "no value returned"
        return results

    def _query_db_and_return_first_row(self, query):
        query_result = self.db.run_query(query)
        rows = query_result.fetchall()
        if len(rows) > 0:
            return rows[0][0]
        return 0  # COUNT(*) always returns a row; fall back to 0 so callers can safely sum results
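
A minimal sketch of how the class above might be driven, e.g. from a scheduled AWS Lambda. The handler name and wiring are assumptions, not part of the original service; DEPLOYMENT_STAGE must already be set in the environment.

def health_check_handler(event, context):
    # Hypothetical entry point for a CloudWatch Events schedule.
    # HealthCheck reads DEPLOYMENT_STAGE and the Slack webhook on construction.
    HealthCheck().run_upload_service_health_check()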
Example #2
import json
import logging
import os

import boto3
import requests

# UploadDB, UploadException and the retry decorator come from the surrounding
# upload-service codebase; the import paths below are assumed.
from upload.common.database import UploadDB
from upload.common.exceptions import UploadException
from upload.common.retry import retry_on_aws_too_many_requests

logger = logging.getLogger(__name__)


class BatchWatcher:
    def __init__(self):
        self.api_key = os.environ["INGEST_API_KEY"]
        self.deployment_stage = os.environ["DEPLOYMENT_STAGE"]
        self.api_host = os.environ["API_HOST"]
        self.batch_client = boto3.client("batch")
        self.ec2_client = boto3.client('ec2')
        self.lambda_client = boto3.client('lambda')
        self.db = UploadDB()

    def run(self):
        incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs()
        logger.info(f"Found {len(incomplete_checksum_jobs)} incomplete checksum jobs utilizing batch")
        logger.info(f"Found {len(incomplete_validation_jobs)} incomplete validation jobs utilizing batch")
        incomplete_jobs = incomplete_checksum_jobs + incomplete_validation_jobs
        kill_instances = self.should_instances_be_killed(incomplete_jobs)
        if kill_instances:
            self.find_and_kill_deployment_batch_instances()
            # Re-fetch incomplete checksum and validation jobs after killing instances
            # to catch any newly scheduled jobs
            incomplete_checksum_jobs, incomplete_validation_jobs = self.find_incomplete_batch_jobs()
            for row in incomplete_validation_jobs:
                self.schedule_job(row, "validation")
            for row in incomplete_checksum_jobs:
                self.schedule_job(row, "checksum")
            logger.info(f"Finished rescheduling {len(incomplete_validation_jobs)} validation jobs "
                        f"and {len(incomplete_checksum_jobs)} checksum jobs")
        else:
            logger.info(
                "No new failed jobs detected in batch. Jobs will continue untouched."
            )

    def should_instances_be_killed(self, rows):
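        """Return True as soon as any of the given job rows corresponds to a FAILED batch job."""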
        kill_instances = False
        for row in rows:
            db_id = row["id"]
            job_id = row["job_id"]
            file_id = row["file_id"]
            status = self._get_job_status(job_id)
            if status == "FAILED":
                logger.info(
                    f"database record id {db_id} for file {file_id} represents a failed batch job. \
                    Time to kill instances.")
                kill_instances = True
                break
        return kill_instances

    @retry_on_aws_too_many_requests
    def _get_job_status(self, job_id):
        response = self.batch_client.describe_jobs(jobs=[job_id])
        jobs = response.get("jobs")
        if jobs:
            status = jobs[0]["status"]
            return status

    def find_incomplete_batch_jobs(self):
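        """Return (checksum_rows, validation_rows) for jobs still scheduled or in flight."""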
        validation_results = self.db.run_query(
            "SELECT * from validation "
            "WHERE status = 'SCHEDULED' or status = 'VALIDATING';")
        validation_rows = validation_results.fetchall()
        checksum_results = self.db.run_query(
            "SELECT * from checksum "
            "WHERE(status='SCHEDULED' or status = 'CHECKSUMMING') "
            "and job_id is not null;")
        checksum_rows = checksum_results.fetchall()
        return checksum_rows, validation_rows

    def find_and_kill_deployment_batch_instances(self):
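        """Terminate all running EC2 instances launched with this deployment's key pair; return their ids."""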
        instance_ids = []
        key_name = f"hca-upload-{self.deployment_stage}"
        reservations = self.ec2_client.describe_instances(
            Filters=[{
                'Name': 'key-name',
                'Values': [key_name]
            }, {
                'Name': 'instance-state-name',
                'Values': ["running"]
            }])

        instance_groups = [reservation["Instances"] for reservation in reservations["Reservations"]]
        for group in instance_groups:
            for instance in group:
                instance_ids.append(instance['InstanceId'])
        if len(instance_ids):
            logger.info(
                f"Killing instances associated with key {key_name} and ec2 ids {str(instance_ids)}"
            )
            self.ec2_client.terminate_instances(InstanceIds=instance_ids)
        return instance_ids

    def schedule_job(self, row, table_name):
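        """Reschedule a checksum or validation job for a file, then mark the old record FAILED."""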
        db_id = row["id"]
        file_id = row["file_id"]
        file_id_split = file_id.split("/")
        upload_area_id = file_id_split[0]
        file_name = file_id_split[1]
        if table_name == "checksum":
            self.invoke_checksum_lambda(file_id)
        elif table_name == "validation":
            docker_image = row["docker_image"]
            # Multiple validation attempts on a file should point to the same original validation id
            original_validation_id = row["original_validation_id"]
            if not original_validation_id:
                # If there is no original_validation_id,
                # set the db id of first validation attempt as original_validation_id.
                original_validation_id = db_id
            self.schedule_validation_job(upload_area_id, file_name,
                                         docker_image, original_validation_id)
        logger.info(
            f"Marking {table_name} record id {db_id} for file {file_id} as failed."
        )
        self.db.run_query_with_params(
            f"UPDATE {table_name} SET status = 'FAILED' WHERE id = %s;",
            (db_id,))  # one-element tuple; a bare (db_id) would not be passed as query params

    def schedule_validation_job(self, upload_area_id, file_name, docker_image,
                                original_validation_id):
        headers = {'Api-Key': self.api_key}
        message = {
            "validator_image": docker_image,
            "original_validation_id": original_validation_id
        }
        response = requests.put(self.api_host, headers=headers, json=message)
        if response.status_code == requests.codes.ok:
            logger.info(
                f"scheduled {upload_area_id}/{file_name} for validation")
        else:
            raise UploadException(
                f"Failed to schedule {upload_area_id}/{file_name} for validation"
            )

    def invoke_checksum_lambda(self, file_id):
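        """Asynchronously invoke the checksum daemon with a synthetic S3 ObjectCreated:Put event."""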
        payload = {
            'Records': [{
                'eventName': 'ObjectCreated:Put',
                "s3": {
                    "bucket": {
                        "name": f"org-humancellatlas-upload-{self.deployment_stage}"
                    },
                    "object": {
                        "key": file_id
                    }
                }
            }]
        }
        self.lambda_client.invoke(
            FunctionName=f"dcp-upload-csum-{self.deployment_stage}",
            InvocationType='Event',
            Payload=json.dumps(payload).encode())
        logger.info(f"scheduled {file_id} for checksumming")