class PipelineValidator():
    """
    Utility Class to augment a pipeline with all validations.
    """
    project_id = ""
    batch_id = ""
    pipeline_id = ""
    validation_id = "Unassigned"
    validation_status = "NEW"
    batch_run_id = None
    pipeline_run_id = None
    plp_util = None
    null_validator = None

    def __init__(self, project_id: str, batch_id: str, pipeline_id: str,
                 batch_run_id: int, pipeline_run_id: int):
        """
            Constructor
        """
        self.project_id = project_id
        self.batch_id = batch_id
        self.pipeline_id = pipeline_id
        self.batch_run_id = batch_run_id
        self.pipeline_run_id = pipeline_run_id
        self.plp_util = PLPUtil()

    def __str__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : " + self.batch_run_id \
            + "', Pipeline ID : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def __repr__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : " + self.batch_run_id \
            + "', Pipeline ID : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def validate_pipeline_status(self):
        """
        Validate Pipeline was success
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_status"
        self.validation_status = "FAILED"
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.pipeline_run_id = {v_pipeline_run_id}
            AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = (query_pipeline_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_id=self.pipeline_id,
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id)

        if self.plp_util.execute_query(query_pipeline_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_start_end_time(self):
        """
        Validate Start and End date for the pipeline
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.pipeline_run_id = {v_pipeline_run_id}
            AND t.pipeline_end_time > t.pipeline_start_time
        """
        query_pipeline_run = (query_pipeline_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_id=self.pipeline_id,
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id)

        if self.plp_util.execute_query(query_pipeline_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """
        Method to update validation status
        """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
            SET t.validation_status = '{v_validation_status}',
            t.batch_run_id = {v_batch_run_id},
            t.pipeline_run_id = {v_pipeline_run_id},
            t.last_validation_datetime = '{v_validation_datetime}',
            t.target_update_date  = '{v_validation_datetime}',
        """
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
            t.last_success_datetime = '{v_validation_datetime}',
            t.last_success_batch_run_id = {v_batch_run_id},
            t.last_success_pipeline_run_id = {v_pipeline_run_id}
        """
        else:
            update_status_qry_str_2 = """
            t.last_failure_datetime = '{v_validation_datetime}',
            t.last_failure_batch_run_id = {v_batch_run_id},
            t.last_failure_pipeline_run_id = {v_pipeline_run_id}
        """

        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"

        update_status_qry_str = update_status_qry_str_1 \
                                + update_status_qry_str_2 \
                                + update_status_qry_str_3

        update_status_qry = (update_status_qry_str).format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_validation_id=self.validation_id)

        self.plp_util.execute_op_query(update_status_qry)

    def validate_not_null(self, dataset_id: str, table_name: str) -> bool:
        """
        Method to validate there are no null values in integration id column
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_not_null"
        self.validation_status = "FAILED"

        str_null_check_query = """
            SELECT COUNT(1) as null_count FROM {v_project_id}.{v_dataset}.{v_table_name}
            WHERE integration_id is null
        """

        null_check_query = str_null_check_query.format(
            v_project_id=self.project_id,
            v_dataset=dataset_id,
            v_table_name=table_name)

        null_check = self.plp_util.execute_query(null_check_query)
        has_no_null_records = null_check['null_count'].values[0] == 0

        if has_no_null_records:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

        return has_no_null_records

    def validate_no_duplicates(self, dataset_id: str, table_name: str) -> bool:
        """
        Method to vaidate there are no duplicates in the integration id column
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_no_duplicates"
        self.validation_status = "FAILED"
        str_query_check_no_duplicates = """
            SELECT count(1)
            FROM {v_project_id}.{v_dataset}.{v_table_name}
            GROUP BY integration_id having count(*) > 1
        """
        query_check_no_duplicates = \
            str_query_check_no_duplicates.format(v_project_id=self.project_id,
                                                 v_dataset=dataset_id,
                                                 v_table_name=table_name)

        has_no_duplicate = self.plp_util.execute_query(
            query_check_no_duplicates).size == 0

        if has_no_duplicate:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

        return has_no_duplicate
Пример #2
0
class PlpAudit():
    """ Data engineer job audit for both hive extract and aggregate job run"""
    plp_utils = None
    project_id = ""
    audit_dict = {}

    def __init__(self, project_id: str, pipeline_id: str):
        self.project_id = project_id
        self.pipeline_id = pipeline_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, pipeline_id)
        self.pipeline_config = self.plp_config.get_all_params()
        self.sender = self.pipeline_config['email_from']
        self.receiver = self.pipeline_config['email_to']
        self.subject = self.pipeline_config['email_subject']

    def pipeline_audit(self, log_check_query, load_query):
        """ Pipeline audit execute job audit and loads audit data """
        pipeline_status = False
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return pipeline_status
        else:
            pipeline_status = True
            for row in query_object:
                row_splitted = str(row).split("PIPELINE", 1)[1].split()
                self.audit_dict[row_splitted[0]] = row_splitted[1]
                pipeline_name = self.pipeline_config[row_splitted[2]]
                pipeline_load_query = load_query.format(
                    v_project_id=self.project_id,
                    v_batch_name=row_splitted[0],
                    v_batch_id=row_splitted[1],
                    v_pipeline_step_id=row_splitted[2],
                    v_pipeline_name=pipeline_name,
                    v_pipeline_log=row[0])
                print(pipeline_load_query)
                self.plp_utils.execute_op_query(pipeline_load_query)
                time.sleep(1)
            return pipeline_status

    def batch_audit(self, log_check_query, load_query):
        """ batch audit method audit batch audit data  """
        batch_status = False
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return batch_status
        else:
            batch_status = True
            for row in query_object:
                row_splitted = str(row).split("BATCH", 1)[1].split()
                batch_run_id = self.audit_dict[row_splitted[0]]
                batch_load_query = load_query.format(
                    v_project_id=self.project_id,
                    v_batch_log=row[0],
                    v_batch_name=row_splitted[0],
                    v_batch_id=batch_run_id)
                print(batch_load_query)
                self.plp_utils.execute_op_query(batch_load_query)
                time.sleep(1)
            return batch_status

    def execute_audit(self):
        """ execute audit method  execute job audit and loads audit data of each pipelines to plp audit table and sends status mail to receipents"""
        pipeline_start_pattern = '%PIPELINE%STARTED SUCCESSFULLY%'
        pipeline_complete_pattern = '%PIPELINE%COMPLETED SUCCESSFULLY%'
        pipeline_failed_pattern = '%PIPELINE%EXECUTION FAILED%'
        batch_begin_pattern = '%BATCH%BEGIN%STARTED SUCCESSFULLY%'
        batch_end_pattern = '%BATCH%END%COMPLETED SUCCESSFULLY%'
        batch_failed_pattern = '%BATCH%EXECUTION FAILED%'
        empty_table_query = self.pipeline_config['empty_table_query'].format(
            v_project_id=self.project_id)
        log_query_extract = self.pipeline_config['log_query_extract']
        insert_query = self.pipeline_config['insert_query']
        final_merge_query = self.pipeline_config['final_merge_query'].format(
            v_project_id=self.project_id)
        pipeline_end_update_query = self.pipeline_config['pipeline_end_update']
        update_pipeline_failed_query = self.pipeline_config[
            'update_pipeline_failed_log']
        update_batch_start_logs = self.pipeline_config[
            'update_batch_start_logs']
        update_batch_end_logs = self.pipeline_config['update_batch_end_logs']
        update_batch_failed_query = self.pipeline_config[
            'update_batch_failed_logs']
        email_alert_query = self.pipeline_config['email_alert_query'].format(
            v_project_id=self.project_id)
        if self.plp_utils.empty_table(empty_table_query):
            msg = " JOB IS NOT STARTED OR LOGS ARE NOT CREATED"
            mail_receipents = list(self.receiver.split(","))
            mail_alert = MailAlerts(self.sender, mail_receipents, self.subject,
                                    msg)
            mail_alert.mail_sender()
        else:
            pipeline_start_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=pipeline_start_pattern)
            pipeline_end_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=pipeline_complete_pattern)
            pipeline_failed_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=pipeline_failed_pattern)
            batch_start_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_begin_pattern)
            batch_end_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_end_pattern)
            batch_failed_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_failed_pattern)
            if self.pipeline_audit(pipeline_start_query, insert_query):
                self.plp_utils.execute_query(final_merge_query)
                # Checking pipeline status failed if yes uploading batch failed status to audit table
                if self.pipeline_audit(pipeline_failed_query,
                                       update_pipeline_failed_query):
                    self.batch_audit(batch_failed_query,
                                     update_batch_failed_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                else:
                    self.pipeline_audit(pipeline_end_query,
                                        pipeline_end_update_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                # Sending mail for all the success and failed pipeline status
                mail_receipents = list(self.receiver.split(","))
                df_email = self.plp_utils.execute_query(email_alert_query)
                mail_alert_1 = MailAlerts(self.sender, mail_receipents,
                                          self.subject, df_email)
                mail_alert_1.mail_sender()
class BatchValidator():
    """
    Utility Class to augment a batch with all validations.
    """
    project_id = ""
    batch_id = ""
    validation_id = "Unassigned"
    validation_status = "NEW"
    batch_run_id = None
    plp_util = None

    def __init__(self, project_id: str, batch_id: str,
                 batch_run_id: int):
        """
            Constructor
        """
        self.project_id = project_id
        self.batch_id = batch_id
        self.batch_run_id = batch_run_id
        self.plp_util = PLPUtil()

    def __str__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def __repr__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def validate_batch_status(self):
        """
        Validate Batch was success
        """
        self.validation_id = self.batch_id + "__validate_status"
        self.validation_status = "FAILED"
        query_batch_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.batch_status = 'SUCCESS'
        """
        query_batch_run = (query_batch_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id)

        if self.plp_util.execute_query(query_batch_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_pipelines(self, list_str_pipeline_ids: []):
        """
        Method to verify that the given pipelines are successful.
        """
        self.validation_id = self.batch_id + "__validate_related_pipelines"

        for str_pipeline_id in list_str_pipeline_ids:
            pipeline_is_success = self.verify_pipeline_success(str_pipeline_id.strip())
            if pipeline_is_success:
                self.validation_status = "SUCCESS"
                pipeline_is_success = True
            else:
                self.validation_status = "FAILED"
                pipeline_is_success = False
                break

        print("Validation :" + str(self))
        self.update_validation_status()

        return pipeline_is_success

    def verify_pipeline_success(self, pipeline_id: str):
        """
        Verify give pipeline was successful
        """
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = (query_pipeline_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_id=pipeline_id)

        return self.plp_util.execute_query(query_pipeline_run).size > 0

    def validate_start_end_time(self):
        """
        Validate Start and End date for the batch
        """
        self.validation_id = self.batch_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_batch_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.batch_end_time > t.batch_start_time
        """
        query_batch_run = (query_batch_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id)

        if self.plp_util.execute_query(query_batch_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """
        Method to update validation status
        """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
            SET t.validation_status = '{v_validation_status}',
            t.batch_run_id = {v_batch_run_id},
            t.last_validation_datetime = '{v_validation_datetime}',
            t.target_update_date  = '{v_validation_datetime}',
        """
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
            t.last_success_datetime = '{v_validation_datetime}',
            t.last_success_batch_run_id = {v_batch_run_id}
        """
        else:
            update_status_qry_str_2 = """
            t.last_failure_datetime = '{v_validation_datetime}',
            t.last_failure_batch_run_id = {v_batch_run_id}
        """

        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"

        update_status_qry_str = update_status_qry_str_1 \
                                + update_status_qry_str_2 \
                                + update_status_qry_str_3

        update_status_qry = (update_status_qry_str).format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_validation_id=self.validation_id)

        self.plp_util.execute_op_query(update_status_qry)