class PipelineValidator(): """ Utility Class to augment a pipeline with all validations. """ project_id = "" batch_id = "" pipeline_id = "" validation_id = "Unassigned" validation_status = "NEW" batch_run_id = None pipeline_run_id = None plp_util = None null_validator = None def __init__(self, project_id: str, batch_id: str, pipeline_id: str, batch_run_id: int, pipeline_run_id: int): """ Constructor """ self.project_id = project_id self.batch_id = batch_id self.pipeline_id = pipeline_id self.batch_run_id = batch_run_id self.pipeline_run_id = pipeline_run_id self.plp_util = PLPUtil() def __str__(self): """ String representation of the class """ return "[Project ID : '" + self.project_id \ + "', Batch ID : '" + self.batch_id \ + "', Batch run ID : " + self.batch_run_id \ + "', Pipeline ID : '" + self.pipeline_id \ + "', Pipeline run ID : '" + str(self.pipeline_run_id) \ + "', Validation ID : '" + self.validation_id \ + "', Validation Status : '" + self.validation_status +"']" def __repr__(self): """ String representation of the class """ return "[Project ID : '" + self.project_id \ + "', Batch ID : '" + self.batch_id \ + "', Batch run ID : " + self.batch_run_id \ + "', Pipeline ID : '" + self.pipeline_id \ + "', Pipeline run ID : '" + str(self.pipeline_run_id) \ + "', Validation ID : '" + self.validation_id \ + "', Validation Status : '" + self.validation_status +"']" def validate_pipeline_status(self): """ Validate Pipeline was success """ self.validation_id = self.batch_id + "__" \ + self.pipeline_id + "__validate_status" self.validation_status = "FAILED" query_pipeline_run_str = """ SELECT COUNT(1) FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t WHERE t.batch_id = '{v_batch_id}' AND t.pipeline_id = '{v_pipeline_id}' AND t.batch_run_id = {v_batch_run_id} AND t.pipeline_run_id = {v_pipeline_run_id} AND t.pipeline_status = 'SUCCESS' """ query_pipeline_run = (query_pipeline_run_str). 
\ format(v_project_id=self.project_id, v_batch_id=self.batch_id, v_pipeline_id=self.pipeline_id, v_batch_run_id=self.batch_run_id, v_pipeline_run_id=self.pipeline_run_id) if self.plp_util.execute_query(query_pipeline_run).size > 0: self.validation_status = "SUCCESS" print("Validation :" + str(self)) self.update_validation_status() def validate_start_end_time(self): """ Validate Start and End date for the pipeline """ self.validation_id = self.batch_id + "__" \ + self.pipeline_id + "__validate_start_end_time" self.validation_status = "FAILED" query_pipeline_run_str = """ SELECT COUNT(1) FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t WHERE t.batch_id = '{v_batch_id}' AND t.pipeline_id = '{v_pipeline_id}' AND t.batch_run_id = {v_batch_run_id} AND t.pipeline_run_id = {v_pipeline_run_id} AND t.pipeline_end_time > t.pipeline_start_time """ query_pipeline_run = (query_pipeline_run_str). \ format(v_project_id=self.project_id, v_batch_id=self.batch_id, v_pipeline_id=self.pipeline_id, v_batch_run_id=self.batch_run_id, v_pipeline_run_id=self.pipeline_run_id) if self.plp_util.execute_query(query_pipeline_run).size > 0: self.validation_status = "SUCCESS" print("Validation :" + str(self)) self.update_validation_status() def update_validation_status(self): """ Method to update validation status """ curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") update_status_qry_str_1 = """ UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t SET t.validation_status = '{v_validation_status}', t.batch_run_id = {v_batch_run_id}, t.pipeline_run_id = {v_pipeline_run_id}, t.last_validation_datetime = '{v_validation_datetime}', t.target_update_date = '{v_validation_datetime}', """ if self.validation_status == "SUCCESS": update_status_qry_str_2 = """ t.last_success_datetime = '{v_validation_datetime}', t.last_success_batch_run_id = {v_batch_run_id}, t.last_success_pipeline_run_id = {v_pipeline_run_id} """ else: update_status_qry_str_2 = """ 
t.last_failure_datetime = '{v_validation_datetime}', t.last_failure_batch_run_id = {v_batch_run_id}, t.last_failure_pipeline_run_id = {v_pipeline_run_id} """ update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'" update_status_qry_str = update_status_qry_str_1 \ + update_status_qry_str_2 \ + update_status_qry_str_3 update_status_qry = (update_status_qry_str).format( v_project_id=self.project_id, v_validation_status=self.validation_status, v_validation_datetime=curr_time, v_batch_run_id=self.batch_run_id, v_pipeline_run_id=self.pipeline_run_id, v_validation_id=self.validation_id) self.plp_util.execute_op_query(update_status_qry) def validate_not_null(self, dataset_id: str, table_name: str) -> bool: """ Method to validate there are no null values in integration id column """ self.validation_id = self.batch_id + "__" \ + self.pipeline_id + "__validate_not_null" self.validation_status = "FAILED" str_null_check_query = """ SELECT COUNT(1) as null_count FROM {v_project_id}.{v_dataset}.{v_table_name} WHERE integration_id is null """ null_check_query = str_null_check_query.format( v_project_id=self.project_id, v_dataset=dataset_id, v_table_name=table_name) null_check = self.plp_util.execute_query(null_check_query) has_no_null_records = null_check['null_count'].values[0] == 0 if has_no_null_records: self.validation_status = "SUCCESS" print("Validation :" + str(self)) self.update_validation_status() return has_no_null_records def validate_no_duplicates(self, dataset_id: str, table_name: str) -> bool: """ Method to vaidate there are no duplicates in the integration id column """ self.validation_id = self.batch_id + "__" \ + self.pipeline_id + "__validate_no_duplicates" self.validation_status = "FAILED" str_query_check_no_duplicates = """ SELECT count(1) FROM {v_project_id}.{v_dataset}.{v_table_name} GROUP BY integration_id having count(*) > 1 """ query_check_no_duplicates = \ str_query_check_no_duplicates.format(v_project_id=self.project_id, 
v_dataset=dataset_id, v_table_name=table_name) has_no_duplicate = self.plp_util.execute_query( query_check_no_duplicates).size == 0 if has_no_duplicate: self.validation_status = "SUCCESS" print("Validation :" + str(self)) self.update_validation_status() return has_no_duplicate
class PlpAudit():
    """ Data engineer job audit for both hive extract and aggregate job
    runs.

    Extracts pipeline/batch status lines from the job logs, loads them into
    the PLP audit table, and emails a status report to the recipients
    configured for the pipeline.
    """

    def __init__(self, project_id: str, pipeline_id: str):
        self.project_id = project_id
        self.pipeline_id = pipeline_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, pipeline_id)
        self.pipeline_config = self.plp_config.get_all_params()
        self.sender = self.pipeline_config['email_from']
        self.receiver = self.pipeline_config['email_to']
        self.subject = self.pipeline_config['email_subject']
        # Bug fix: audit_dict was a mutable CLASS attribute, so every
        # PlpAudit instance shared (and polluted) the same dict.  Keep it
        # per-instance.  Maps batch name -> batch run id, filled by
        # pipeline_audit and read by batch_audit.
        self.audit_dict = {}

    def pipeline_audit(self, log_check_query, load_query):
        """ Pipeline audit execute job audit and loads audit data.

        Returns True when at least one matching pipeline log row was found
        and loaded, False otherwise.
        """
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return False
        for row in query_object:
            # Tokens after the "PIPELINE" marker are assumed to be
            # <batch_name> <batch_run_id> <pipeline_step_id> ... — TODO
            # confirm against the log producer.
            row_splitted = str(row).split("PIPELINE", 1)[1].split()
            self.audit_dict[row_splitted[0]] = row_splitted[1]
            pipeline_name = self.pipeline_config[row_splitted[2]]
            pipeline_load_query = load_query.format(
                v_project_id=self.project_id,
                v_batch_name=row_splitted[0],
                v_batch_id=row_splitted[1],
                v_pipeline_step_id=row_splitted[2],
                v_pipeline_name=pipeline_name,
                v_pipeline_log=row[0])
            print(pipeline_load_query)
            self.plp_utils.execute_op_query(pipeline_load_query)
            # Throttle successive DML statements.
            time.sleep(1)
        return True

    def batch_audit(self, log_check_query, load_query):
        """ Batch audit method audits batch audit data.

        Returns True when at least one matching batch log row was found and
        loaded, False otherwise.  Relies on audit_dict having been populated
        by a prior pipeline_audit call.
        """
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return False
        for row in query_object:
            row_splitted = str(row).split("BATCH", 1)[1].split()
            # Look up the run id recorded for this batch by pipeline_audit.
            batch_run_id = self.audit_dict[row_splitted[0]]
            batch_load_query = load_query.format(
                v_project_id=self.project_id,
                v_batch_log=row[0],
                v_batch_name=row_splitted[0],
                v_batch_id=batch_run_id)
            print(batch_load_query)
            self.plp_utils.execute_op_query(batch_load_query)
            # Throttle successive DML statements.
            time.sleep(1)
        return True

    def execute_audit(self):
        """ Execute audit method: runs the job audit, loads audit data for
        each pipeline into the PLP audit table, and sends a status mail to
        the recipients. """
        pipeline_start_pattern = '%PIPELINE%STARTED SUCCESSFULLY%'
        pipeline_complete_pattern = '%PIPELINE%COMPLETED SUCCESSFULLY%'
        pipeline_failed_pattern = '%PIPELINE%EXECUTION FAILED%'
        batch_begin_pattern = '%BATCH%BEGIN%STARTED SUCCESSFULLY%'
        batch_end_pattern = '%BATCH%END%COMPLETED SUCCESSFULLY%'
        batch_failed_pattern = '%BATCH%EXECUTION FAILED%'
        empty_table_query = self.pipeline_config['empty_table_query'].format(
            v_project_id=self.project_id)
        log_query_extract = self.pipeline_config['log_query_extract']
        insert_query = self.pipeline_config['insert_query']
        final_merge_query = self.pipeline_config['final_merge_query'].format(
            v_project_id=self.project_id)
        pipeline_end_update_query = self.pipeline_config['pipeline_end_update']
        update_pipeline_failed_query = self.pipeline_config[
            'update_pipeline_failed_log']
        update_batch_start_logs = self.pipeline_config[
            'update_batch_start_logs']
        update_batch_end_logs = self.pipeline_config['update_batch_end_logs']
        update_batch_failed_query = self.pipeline_config[
            'update_batch_failed_logs']
        email_alert_query = self.pipeline_config['email_alert_query'].format(
            v_project_id=self.project_id)
        if self.plp_utils.empty_table(empty_table_query):
            # No logs at all: alert the recipients instead of auditing.
            msg = " JOB IS NOT STARTED OR LOGS ARE NOT CREATED"
            mail_recipients = self.receiver.split(",")
            mail_alert = MailAlerts(self.sender, mail_recipients,
                                    self.subject, msg)
            mail_alert.mail_sender()
        else:
            pipeline_start_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=pipeline_start_pattern)
            pipeline_end_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=pipeline_complete_pattern)
            pipeline_failed_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=pipeline_failed_pattern)
            batch_start_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=batch_begin_pattern)
            batch_end_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=batch_end_pattern)
            batch_failed_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=batch_failed_pattern)
            if self.pipeline_audit(pipeline_start_query, insert_query):
                self.plp_utils.execute_query(final_merge_query)
                # Checking pipeline status failed; if yes, uploading batch
                # failed status to audit table.
                if self.pipeline_audit(pipeline_failed_query,
                                       update_pipeline_failed_query):
                    self.batch_audit(batch_failed_query,
                                     update_batch_failed_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                else:
                    self.pipeline_audit(pipeline_end_query,
                                        pipeline_end_update_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                # Sending mail for all the success and failed pipeline
                # statuses.
                mail_recipients = self.receiver.split(",")
                df_email = self.plp_utils.execute_query(email_alert_query)
                mail_alert_1 = MailAlerts(self.sender, mail_recipients,
                                          self.subject, df_email)
                mail_alert_1.mail_sender()
class BatchValidator():
    """ Utility Class to augment a batch with all validations.

    Each validate_* method runs one check against the PLP control-metadata
    tables, sets validation_id / validation_status accordingly, and persists
    the outcome via update_validation_status().
    """

    def __init__(self, project_id: str, batch_id: str, batch_run_id: int):
        """ Constructor """
        self.project_id = project_id
        self.batch_id = batch_id
        self.batch_run_id = batch_run_id
        # State of the most recent validation; each validate_* call
        # overwrites these before running its check.
        self.validation_id = "Unassigned"
        self.validation_status = "NEW"
        self.plp_util = PLPUtil()

    def __str__(self):
        """ String representation of the class """
        return ("[Project ID : '" + self.project_id
                + "', Batch ID : '" + self.batch_id
                + "', Batch run ID : '" + str(self.batch_run_id)
                + "', Validation ID : '" + self.validation_id
                + "', Validation Status : '" + self.validation_status + "']")

    def __repr__(self):
        """ String representation of the class (same as __str__). """
        # The original duplicated the __str__ body verbatim; delegate
        # instead.
        return self.__str__()

    def validate_batch_status(self):
        """ Validate Batch was success """
        self.validation_id = self.batch_id + "__validate_status"
        self.validation_status = "FAILED"
        query_batch_run_str = """
        SELECT COUNT(1)
        FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
        WHERE t.batch_id = '{v_batch_id}'
        AND t.batch_run_id = {v_batch_run_id}
        AND t.batch_status = 'SUCCESS'
        """
        query_batch_run = query_batch_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id)
        # Bug fix: a COUNT(1) query always returns exactly one row, so the
        # original `.size > 0` check reported SUCCESS unconditionally.
        # Inspect the count value itself (execute_query returns a pandas
        # DataFrame).
        if self.plp_util.execute_query(query_batch_run).iloc[0, 0] > 0:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_pipelines(self, list_str_pipeline_ids: list):
        """ Method to verify that the given pipelines are successful.

        Returns True only when every listed pipeline succeeded (vacuously
        True for an empty list).  Stops at the first failing pipeline.
        """
        self.validation_id = self.batch_id + "__validate_related_pipelines"
        # Bug fix: the original left the result flag unassigned when the
        # list was empty, raising UnboundLocalError and never setting a
        # validation status.
        all_success = True
        for str_pipeline_id in list_str_pipeline_ids:
            if not self.verify_pipeline_success(str_pipeline_id.strip()):
                all_success = False
                break
        self.validation_status = "SUCCESS" if all_success else "FAILED"
        print("Validation :" + str(self))
        self.update_validation_status()
        return all_success

    def verify_pipeline_success(self, pipeline_id: str):
        """ Verify given pipeline was successful """
        query_pipeline_run_str = """
        SELECT COUNT(1)
        FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
        WHERE t.batch_id = '{v_batch_id}'
        AND t.pipeline_id = '{v_pipeline_id}'
        AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = query_pipeline_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_pipeline_id=pipeline_id)
        # Same COUNT(1) fix as validate_batch_status: check the count value,
        # not the (always non-empty) result-set size.
        return self.plp_util.execute_query(query_pipeline_run).iloc[0, 0] > 0

    def validate_start_end_time(self):
        """ Validate Start and End date for the batch """
        self.validation_id = self.batch_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_batch_run_str = """
        SELECT COUNT(1)
        FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
        WHERE t.batch_id = '{v_batch_id}'
        AND t.batch_run_id = {v_batch_run_id}
        AND t.batch_end_time > t.batch_start_time
        """
        query_batch_run = query_batch_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id)
        # Same COUNT(1) fix as validate_batch_status.
        if self.plp_util.execute_query(query_batch_run).iloc[0, 0] > 0:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """ Method to update validation status

        Writes the current validation outcome into PLP_BQ_CTL_VALIDATIONS,
        stamping either the last_success_* or last_failure_* columns
        depending on validation_status.
        """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
        UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
        SET t.validation_status = '{v_validation_status}',
        t.batch_run_id = {v_batch_run_id},
        t.last_validation_datetime = '{v_validation_datetime}',
        t.target_update_date = '{v_validation_datetime}',
        """
        # The second fragment continues the SET list, hence the trailing
        # comma in fragment 1.
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
            t.last_success_datetime = '{v_validation_datetime}',
            t.last_success_batch_run_id = {v_batch_run_id}
            """
        else:
            update_status_qry_str_2 = """
            t.last_failure_datetime = '{v_validation_datetime}',
            t.last_failure_batch_run_id = {v_batch_run_id}
            """
        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"
        update_status_qry_str = update_status_qry_str_1 \
            + update_status_qry_str_2 \
            + update_status_qry_str_3
        update_status_qry = update_status_qry_str.format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_validation_id=self.validation_id)
        self.plp_util.execute_op_query(update_status_qry)