import os
import time
from datetime import datetime, timedelta

# NOTE: the import paths below for the project-internal modules are assumed;
# adjust them to match the actual repository layout.
from plp_util import PLPUtil
from plp_configuration import PLPConfiguration
from mail_alerts import MailAlerts
from hive_operations import HiveOperations
from data_request import DataRequest
from plp_exceptions import (NullRecordsFoundException,
                            DuplicateRecordsFoundException,
                            RelatedPipelinesFailedException)


class PlpAudit():
    """ Data engineer job audit for both Hive extract and aggregate job runs. """

    plp_utils = None
    project_id = ""
    audit_dict = {}

    def __init__(self, project_id: str, pipeline_id: str):
        self.project_id = project_id
        self.pipeline_id = pipeline_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, pipeline_id)
        self.pipeline_config = self.plp_config.get_all_params()
        self.sender = self.pipeline_config['email_from']
        self.receiver = self.pipeline_config['email_to']
        self.subject = self.pipeline_config['email_subject']

    def pipeline_audit(self, log_check_query, load_query):
        """ Execute the pipeline job audit and load the audit data. """
        pipeline_status = False
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return pipeline_status
        pipeline_status = True
        for row in query_object:
            row_splitted = str(row).split("PIPELINE", 1)[1].split()
            self.audit_dict[row_splitted[0]] = row_splitted[1]
            pipeline_name = self.pipeline_config[row_splitted[2]]
            pipeline_load_query = load_query.format(
                v_project_id=self.project_id,
                v_batch_name=row_splitted[0],
                v_batch_id=row_splitted[1],
                v_pipeline_step_id=row_splitted[2],
                v_pipeline_name=pipeline_name,
                v_pipeline_log=row[0])
            print(pipeline_load_query)
            self.plp_utils.execute_op_query(pipeline_load_query)
            time.sleep(1)
        return pipeline_status

    def batch_audit(self, log_check_query, load_query):
        """ Execute the batch job audit and load the audit data. """
        batch_status = False
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return batch_status
        batch_status = True
        for row in query_object:
            row_splitted = str(row).split("BATCH", 1)[1].split()
            batch_run_id = self.audit_dict[row_splitted[0]]
            batch_load_query = load_query.format(
                v_project_id=self.project_id,
                v_batch_log=row[0],
                v_batch_name=row_splitted[0],
                v_batch_id=batch_run_id)
            print(batch_load_query)
            self.plp_utils.execute_op_query(batch_load_query)
            time.sleep(1)
        return batch_status

    def execute_audit(self):
        """ Execute the job audit, load the audit data of each pipeline to
        the PLP audit table, and send a status mail to the recipients. """
        pipeline_start_pattern = '%PIPELINE%STARTED SUCCESSFULLY%'
        pipeline_complete_pattern = '%PIPELINE%COMPLETED SUCCESSFULLY%'
        pipeline_failed_pattern = '%PIPELINE%EXECUTION FAILED%'
        batch_begin_pattern = '%BATCH%BEGIN%STARTED SUCCESSFULLY%'
        batch_end_pattern = '%BATCH%END%COMPLETED SUCCESSFULLY%'
        batch_failed_pattern = '%BATCH%EXECUTION FAILED%'
        empty_table_query = self.pipeline_config['empty_table_query'].format(
            v_project_id=self.project_id)
        log_query_extract = self.pipeline_config['log_query_extract']
        insert_query = self.pipeline_config['insert_query']
        final_merge_query = self.pipeline_config['final_merge_query'].format(
            v_project_id=self.project_id)
        pipeline_end_update_query = self.pipeline_config['pipeline_end_update']
        update_pipeline_failed_query = self.pipeline_config[
            'update_pipeline_failed_log']
        update_batch_start_logs = self.pipeline_config[
            'update_batch_start_logs']
        update_batch_end_logs = self.pipeline_config['update_batch_end_logs']
        update_batch_failed_query = self.pipeline_config[
            'update_batch_failed_logs']
        email_alert_query = self.pipeline_config['email_alert_query'].format(
            v_project_id=self.project_id)
        if self.plp_utils.empty_table(empty_table_query):
            msg = " JOB IS NOT STARTED OR LOGS ARE NOT CREATED"
            mail_recipients = list(self.receiver.split(","))
            mail_alert = MailAlerts(self.sender, mail_recipients,
                                    self.subject, msg)
            mail_alert.mail_sender()
        else:
            pipeline_start_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=pipeline_start_pattern)
            pipeline_end_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=pipeline_complete_pattern)
            pipeline_failed_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=pipeline_failed_pattern)
            batch_start_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_begin_pattern)
            batch_end_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_end_pattern)
            batch_failed_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_failed_pattern)
            if self.pipeline_audit(pipeline_start_query, insert_query):
                self.plp_utils.execute_query(final_merge_query)
                # If any pipeline failed, upload the batch failed status
                # to the audit table.
                if self.pipeline_audit(pipeline_failed_query,
                                       update_pipeline_failed_query):
                    self.batch_audit(batch_failed_query,
                                     update_batch_failed_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                else:
                    self.pipeline_audit(pipeline_end_query,
                                        pipeline_end_update_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                # Send mail for all the success and failed pipeline statuses.
                mail_recipients = list(self.receiver.split(","))
                df_email = self.plp_utils.execute_query(email_alert_query)
                mail_alert_1 = MailAlerts(self.sender, mail_recipients,
                                          self.subject, df_email)
                mail_alert_1.mail_sender()
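
# Illustrative usage sketch (not part of the original module): how PlpAudit
# might be driven from a scheduler entry point. The project and pipeline IDs
# below are hypothetical placeholders; the configuration keys the class reads
# (email_from, email_to, log_query_extract, ...) must exist in
# PLP_BQ_CTL_PARAMETER for the given pipeline_id.
def _example_run_audit():
    audit = PlpAudit("my-gcp-project", "plp_audit_pipeline")
    audit.execute_audit()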
class PipelineManager():
    """ Pipeline Manager manages each pipeline as a single entity
    across all states. """

    project_id = ""
    batch_id = ""
    batch_run_id = ""
    pipeline_id = ""
    pipeline_run_id = 0
    table_name = ""
    last_extract_date = ""
    start_time = ""
    pipeline_status = "FAILED"
    plp_utils = None
    pipeline_validator = None

    def __init__(self, project_id: str, batch_id: str, batch_run_id: str,
                 pipeline_id: str, table_name: str):
        """ Pipeline Constructor """
        self.project_id = project_id
        self.table_name = table_name
        self.batch_id = batch_id
        self.pipeline_id = pipeline_id
        self.batch_run_id = batch_run_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, pipeline_id)
        self.pipeline_config = self.plp_config.get_all_params()
        self.pipeline_run_id = int(self.pipeline_config['pipeline_run_id']) + 1
        self.pipeline_validator = PipelineValidator(self.project_id,
                                                    self.batch_id,
                                                    self.pipeline_id,
                                                    self.batch_run_id,
                                                    self.pipeline_run_id)
        self.last_extract_date = datetime.strptime(
            str(self.pipeline_config['last_extract_date']),
            '%Y-%m-%d %H:%M:%S.%f')
        self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    def __str__(self) -> str:
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Pipeline : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Table Name : '" + self.table_name + "']"

    def __repr__(self) -> str:
        return self.__str__()

    def execute_pre_step(self):
        """ Pre step procedure for the pipeline. """
        insert_hist_query_str = """
            INSERT INTO {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H
            SELECT t.batch_id,
                   {v_batch_run_id},
                   t.pipeline_id,
                   {v_pipeline_run_id},
                   t.pipeline_name,
                   t.table_name,
                   '{v_pipeline_status}',
                   '{v_pipeline_error}',
                   t.pipeline_start_time,
                   TIMESTAMP('{v_pipeline_start_time}'),
                   safe.timestamp('1900-01-01 00:00:00'),
                   {v_prune_mins},
                   TIMESTAMP('{v_pipeline_start_time}'),
                   TIMESTAMP('{v_pipeline_start_time}'),
                   {v_days_prior_to_extract}
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            WHERE t.batch_id = '{v_batch_id}'
              AND t.pipeline_id = '{v_pipeline_id}'
              AND t.table_name = '{v_table_name}'
        """
        insert_hist_query = insert_hist_query_str.format(
            v_project_id=self.project_id,
            v_pipeline_id=self.pipeline_id,
            v_table_name=self.table_name,
            v_pipeline_status="NEW",
            v_pipeline_error="",
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_pipeline_start_time=self.start_time,
            v_prune_mins=int(self.pipeline_config['prune_minutes']),
            v_days_prior_to_extract=int(
                self.pipeline_config['days_prior_to_extract']),
            v_batch_id=self.batch_id)
        self.plp_utils.execute_op_query(insert_hist_query)

    def execute_post_step(self):
        """ Post step procedure for the pipeline. """
        self.pipeline_status = "SUCCESS"
        end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_run_hist_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            SET t.pipeline_status = '{v_pipeline_status}',
                t.error_message = '{v_pipeline_error}',
                t.pipeline_end_time = '{v_pipeline_end_time}',
                t.target_update_date = '{v_pipeline_end_time}'
            WHERE t.batch_id = '{v_batch_id}'
              AND t.pipeline_id = '{v_pipeline_id}'
              AND t.table_name = '{v_table_name}'
              AND t.batch_run_id = {v_batch_run_id}
              AND t.pipeline_run_id = {v_pipeline_run_id}
        """
        update_hist_query = update_run_hist_str.format(
            v_project_id=self.project_id,
            v_pipeline_id=self.pipeline_id,
            v_table_name=self.table_name,
            v_pipeline_status=self.pipeline_status,
            v_pipeline_error="",
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_batch_id=self.batch_id,
            v_pipeline_end_time=end_time)
        self.plp_utils.execute_op_query(update_hist_query)
        update_run_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            SET t.pipeline_status = '{v_pipeline_status}',
                t.pipeline_start_time = '{v_pipeline_start_time}',
                t.pipeline_end_time = '{v_pipeline_end_time}',
                t.target_update_date = '{v_pipeline_end_time}',
                t.last_extract_date = '{v_last_extract_date}'
            WHERE t.batch_id = '{v_batch_id}'
              AND t.pipeline_id = '{v_pipeline_id}'
              AND t.table_name = '{v_table_name}'
        """
        update_query = update_run_str.format(
            v_project_id=self.project_id,
            v_pipeline_id=self.pipeline_id,
            v_table_name=self.table_name,
            v_pipeline_status=self.pipeline_status,
            v_batch_id=self.batch_id,
            v_pipeline_end_time=end_time,
            v_pipeline_start_time=self.start_time,
            v_last_extract_date=self.last_extract_date)
        self.plp_utils.execute_op_query(update_query)
        update_config_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PARAMETER t
            SET t.parameter_value = '{v_param_value}',
                t.target_update_date = '{v_target_update_date}'
            WHERE t.job_name = '{v_pipeline_id}'
              AND t.parameter_name = '{v_param_name}'
              AND EXISTS (SELECT 1
                          FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H
                          WHERE batch_id = '{v_batch_id}'
                            AND pipeline_id = '{v_pipeline_id}'
                            AND table_name = '{v_table_name}'
                            AND batch_run_id = {v_batch_run_id}
                            AND pipeline_run_id = {v_pipeline_run_id})
        """
        update_query = update_config_str.format(
            v_project_id=self.project_id,
            v_pipeline_id=self.pipeline_id,
            v_param_name='last_extract_date',
            v_param_value=self.start_time,
            v_table_name=self.table_name,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_batch_id=self.batch_id,
            v_target_update_date=end_time)
        self.plp_utils.execute_op_query(update_query)
        update_query = update_config_str.format(
            v_project_id=self.project_id,
            v_pipeline_id=self.pipeline_id,
            v_param_name='pipeline_run_id',
            v_param_value=self.pipeline_run_id,
            v_table_name=self.table_name,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_batch_id=self.batch_id,
            v_target_update_date=end_time)
        self.plp_utils.execute_op_query(update_query)

    def load_aggregate_data(self):
        """ Load the aggregate table from its source view. """
        # Initial load query for an empty table
        str_initial_load_query = self.pipeline_config['initial_load_query']
        # Incremental query
        str_incremental_query = self.pipeline_config['incremental_query']
        # Name of the schema where the data view resides
        str_view_dataset_id = self.pipeline_config['view_dataset_id']
        stg_dataset_name = self.pipeline_config['stg_dataset_id']
        # Data view for the target table
        str_data_view_name = self.pipeline_config['data_view_name']
        # Temporary (staging) table name
        stage_table = self.pipeline_config['stg_table_id']
        # Dataset of the target table
        dataset_id = self.pipeline_config['target_dataset_id']
        write_disposition = 'WRITE_TRUNCATE'
        initial_load_query = str_initial_load_query.format(
            self.project_id, str_view_dataset_id, str_data_view_name)
        stg_to_tgt_sql = """
            SELECT t.*,
                   current_timestamp() as target_create_date,
                   current_timestamp() as target_update_date
            FROM {v_project_id}.{v_dataset_id}.{v_table_name} t
            WHERE snapshot_date = '{v_snapshot_date}'
        """
        if self.plp_utils.check_empty_table(self.project_id, dataset_id,
                                            self.table_name):
            self.plp_utils.load_query_to_bq_table(dataset_id, self.table_name,
                                                  initial_load_query,
                                                  write_disposition)
        else:
            days_prior_extract = int(
                self.pipeline_config['days_prior_to_extract'])
            list_snapshot_months = self.plp_utils.get_months_between(
                self.last_extract_date - timedelta(days=days_prior_extract))
            for snapshot_month in list_snapshot_months:
                print("Snapshot date: %s" % snapshot_month)
                incremental_query = str_incremental_query.format(
                    self.project_id, str_view_dataset_id, str_data_view_name,
                    snapshot_month)
                print("Incremental Query : %s" % incremental_query)
                self.plp_utils.load_query_to_bq_table(stg_dataset_name,
                                                      stage_table,
                                                      incremental_query)
                formatted_date = snapshot_month.strftime('%Y%m%d')
                target_partition_table = self.table_name + '$' + formatted_date
                has_no_null_records = self.pipeline_validator. \
                    validate_not_null(stg_dataset_name, stage_table)
                has_no_duplicate_records = self.pipeline_validator. \
                    validate_no_duplicates(stg_dataset_name, stage_table)
                stg_to_tgt_query = stg_to_tgt_sql.format(
                    v_project_id=self.project_id,
                    v_dataset_id=stg_dataset_name,
                    v_table_name=stage_table,
                    v_snapshot_date=snapshot_month)
                print(" Stage to Target Query : %s" % stg_to_tgt_query)
                if not has_no_null_records:
                    raise NullRecordsFoundException(
                        "Null records found in " + stg_dataset_name + "."
                        + stage_table)
                if not has_no_duplicate_records:
                    raise DuplicateRecordsFoundException(
                        "Duplicate records found in " + stg_dataset_name + "."
                        + stage_table)
                # Both checks passed: load data to the target partition
                # from the stage table.
                self.plp_utils.load_query_to_bq_table(
                    dataset_id, target_partition_table, stg_to_tgt_query)

    def hive_to_bq_load(self):
        """ Load data from Hive to BQ via GCS staging files. """
        days_prior_extract = int(self.pipeline_config['days_prior_to_extract'])
        start_date = self.last_extract_date.date() - timedelta(
            days=days_prior_extract)
        end_date = datetime.strptime(self.start_time,
                                     '%Y-%m-%d %H:%M:%S.%f').date()
        try:
            gs_path = self.pipeline_config['gs_path']
            gcs_folder = self.pipeline_config['gcs_folder']
            gcs_file_name = self.pipeline_config['gcs_file_name']
            gcs_bucket_name = self.pipeline_config['gcs_bucket_name']
            # Initial load query
            str_initial_load_query = self.pipeline_config['initial_load_query']
            stage_dataset = self.pipeline_config['stage_dataset']
            stg_table_name = self.pipeline_config['stg_table_name']
            # Query to load data from the staging to the target table
            str_target_load_query = self.pipeline_config['target_load_query']
            target_load_query = str_target_load_query.format(self.project_id)
            write_disposition = 'WRITE_TRUNCATE'
            file_format = 'csv'
            hive_operations = HiveOperations()
            while start_date <= end_date:
                formatted_start_date = start_date.strftime('%Y%m%d')
                gcs_file_location = '{}/{}/{}/{}/{}_{}.{}'.format(
                    gs_path, gcs_folder,
                    formatted_start_date[0:4],
                    formatted_start_date[0:6],
                    gcs_file_name, formatted_start_date, file_format)
                prefix_file_location = '{}/{}/{}/'.format(
                    gcs_folder, formatted_start_date[0:4],
                    formatted_start_date[0:6])
                processed_path = '{}/processed/{}/{}/'.format(
                    gcs_folder, formatted_start_date[0:4],
                    formatted_start_date[0:6])
                hive_extract_query = str_initial_load_query.format(
                    formatted_start_date, formatted_start_date)
                hive_operations.extract_data_from_hive(hive_extract_query,
                                                       gcs_file_location)
                self.plp_utils.load_bq_table_from_gcs(
                    gcs_file_location, self.project_id, stage_dataset,
                    stg_table_name, write_disposition, file_format)
                self.plp_utils.execute_op_query(target_load_query)
                # Move the GCS file to the processed directory.
                self.plp_utils.gcs_move_blob(gcs_bucket_name,
                                             prefix_file_location,
                                             processed_path)
                start_date += timedelta(days=1)
        except Exception as err_hive:
            print("Error in loading data from Hive to BQ")
            raise err_hive

    def execute_bq_query(self):
        """ Execute a given SQL query; no output is rendered. """
        pipeline_bq_query = self.pipeline_config["pipeline_bq_query"].format(
            v_project_id=self.project_id)
        self.plp_utils.execute_op_query(pipeline_bq_query)

    def execute_bq_load_query(self):
        """ Execute a query and truncate-load a table without any
        SCD logic in place. """
        str_pipeline_bq_load_query = self.pipeline_config[
            "pipeline_bq_load_query"]
        str_view_dataset_id = self.pipeline_config['view_dataset_id']
        str_data_view_name = self.pipeline_config['data_view_name']
        dataset_id = self.pipeline_config['target_dataset_id']
        write_disposition = 'WRITE_TRUNCATE'
        pipeline_bq_load_query = str_pipeline_bq_load_query.format(
            self.project_id, str_view_dataset_id, str_data_view_name)
        self.plp_utils.load_query_to_bq_table(dataset_id, self.table_name,
                                              pipeline_bq_load_query,
                                              write_disposition)

    def api_to_bq_load(self):
        """ Load the API data to BQ. """
        headers = {"X-AUTH-APIKEY": os.environ["DCIM_IBX_LIST_API_KEY"]}
        stage_dataset_name = str(
            self.pipeline_config['staging_dataset']).strip()
        stage_table_name = str(self.pipeline_config['stage_table'])
        dm_dataset_name = str(self.pipeline_config['dm_dataset']).strip()
        data_req = DataRequest(self.pipeline_config["get_ibxs_list_url"],
                               headers)
        data_req.make_get_req({"payLoad": "ibx_code"})
        add_col_n_vals = {
            "dcim_flag": "Y",
            "insert_date_time": self.start_time
        }
        data_req.load_api_res_to_bq(
            self.project_id, stage_dataset_name + "." + stage_table_name,
            "replace", add_col_n_vals)
        api_merge_query = self.pipeline_config["merge_dcim_plp_ibx"].format(
            v_project_id=self.project_id,
            v_staging_dataset=stage_dataset_name,
            v_dm_dataset=dm_dataset_name,
            v_target_table=self.table_name,
            v_stage_table=stage_table_name)
        self.plp_utils.execute_op_query(api_merge_query)

    def execute_pipeline(self, pipeline_type: str):
        """ Pick and execute the ETL pipeline step. """
        print("Initiating pipeline....")
        print("- - - - - - - - - - - - -")
        print("Details: " + str(self))
        try:
            print("Executing pre step....")
            self.execute_pre_step()
            print("Pre step complete....executing pipeline step")
            if pipeline_type == 'hive_etl':
                self.hive_to_bq_load()
            elif pipeline_type == 'api_etl':
                self.api_to_bq_load()
            elif pipeline_type == 'aggregate_etl':
                self.load_aggregate_data()
            elif pipeline_type == 'query_etl':
                self.execute_bq_query()
            elif pipeline_type == 'query_load_etl':
                self.execute_bq_load_query()
            print("Pipeline step complete..... Executing post step")
            self.execute_post_step()
            print("Post step complete..... Executing pipeline validations")
            print("- - - - - - - - - - - - -")
            self.pipeline_validator.validate_pipeline_status()
            print("- - - - - - - - - - - - -")
            self.pipeline_validator.validate_start_end_time()
            print("- - - - - - - - - - - - -")
            print("Pipeline validations complete.")
        except Exception as err_pipeline:
            print("Unable to execute pipeline")
            raise err_pipeline
        print("Pipeline execution complete !!")
        print("------------------------------")
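
# Illustrative usage sketch (assumption, not in the original source): running
# a single aggregate pipeline end to end. The IDs and table name below are
# hypothetical placeholders; batch_run_id would normally come from the owning
# batch run rather than being hard-coded.
def _example_run_pipeline():
    manager = PipelineManager("my-gcp-project", "daily_batch", "42",
                              "agg_pipeline", "PLP_AGG_TABLE")
    manager.execute_pipeline("aggregate_etl")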
class PipelineValidator():
    """ Utility class to augment a pipeline with all validations. """

    project_id = ""
    batch_id = ""
    pipeline_id = ""
    validation_id = "Unassigned"
    validation_status = "NEW"
    batch_run_id = None
    pipeline_run_id = None
    plp_util = None
    null_validator = None

    def __init__(self, project_id: str, batch_id: str, pipeline_id: str,
                 batch_run_id: int, pipeline_run_id: int):
        """ Constructor """
        self.project_id = project_id
        self.batch_id = batch_id
        self.pipeline_id = pipeline_id
        self.batch_run_id = batch_run_id
        self.pipeline_run_id = pipeline_run_id
        self.plp_util = PLPUtil()

    def __str__(self):
        """ String representation of the class """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Pipeline ID : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status + "']"

    def __repr__(self):
        """ String representation of the class """
        return self.__str__()

    def validate_pipeline_status(self):
        """ Validate that the pipeline run was a success. """
        self.validation_id = self.batch_id + "__" \
            + self.pipeline_id + "__validate_status"
        self.validation_status = "FAILED"
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
              AND t.pipeline_id = '{v_pipeline_id}'
              AND t.batch_run_id = {v_batch_run_id}
              AND t.pipeline_run_id = {v_pipeline_run_id}
              AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = query_pipeline_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_pipeline_id=self.pipeline_id,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id)
        if self.plp_util.execute_query(query_pipeline_run).size > 0:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_start_end_time(self):
        """ Validate the start and end times for the pipeline. """
        self.validation_id = self.batch_id + "__" \
            + self.pipeline_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
              AND t.pipeline_id = '{v_pipeline_id}'
              AND t.batch_run_id = {v_batch_run_id}
              AND t.pipeline_run_id = {v_pipeline_run_id}
              AND t.pipeline_end_time > t.pipeline_start_time
        """
        query_pipeline_run = query_pipeline_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_pipeline_id=self.pipeline_id,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id)
        if self.plp_util.execute_query(query_pipeline_run).size > 0:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """ Update the validation status. """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
            SET t.validation_status = '{v_validation_status}',
                t.batch_run_id = {v_batch_run_id},
                t.pipeline_run_id = {v_pipeline_run_id},
                t.last_validation_datetime = '{v_validation_datetime}',
                t.target_update_date = '{v_validation_datetime}',
        """
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
                t.last_success_datetime = '{v_validation_datetime}',
                t.last_success_batch_run_id = {v_batch_run_id},
                t.last_success_pipeline_run_id = {v_pipeline_run_id}
            """
        else:
            update_status_qry_str_2 = """
                t.last_failure_datetime = '{v_validation_datetime}',
                t.last_failure_batch_run_id = {v_batch_run_id},
                t.last_failure_pipeline_run_id = {v_pipeline_run_id}
            """
        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"
        update_status_qry_str = update_status_qry_str_1 \
            + update_status_qry_str_2 \
            + update_status_qry_str_3
        update_status_qry = update_status_qry_str.format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_validation_id=self.validation_id)
        self.plp_util.execute_op_query(update_status_qry)

    def validate_not_null(self, dataset_id: str, table_name: str) -> bool:
        """ Validate that there are no null values in the
        integration_id column. """
        self.validation_id = self.batch_id + "__" \
            + self.pipeline_id + "__validate_not_null"
        self.validation_status = "FAILED"
        str_null_check_query = """
            SELECT COUNT(1) as null_count
            FROM {v_project_id}.{v_dataset}.{v_table_name}
            WHERE integration_id is null
        """
        null_check_query = str_null_check_query.format(
            v_project_id=self.project_id,
            v_dataset=dataset_id,
            v_table_name=table_name)
        null_check = self.plp_util.execute_query(null_check_query)
        has_no_null_records = null_check['null_count'].values[0] == 0
        if has_no_null_records:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()
        return has_no_null_records

    def validate_no_duplicates(self, dataset_id: str,
                               table_name: str) -> bool:
        """ Validate that there are no duplicates in the
        integration_id column. """
        self.validation_id = self.batch_id + "__" \
            + self.pipeline_id + "__validate_no_duplicates"
        self.validation_status = "FAILED"
        str_query_check_no_duplicates = """
            SELECT count(1)
            FROM {v_project_id}.{v_dataset}.{v_table_name}
            GROUP BY integration_id
            HAVING count(*) > 1
        """
        query_check_no_duplicates = str_query_check_no_duplicates.format(
            v_project_id=self.project_id,
            v_dataset=dataset_id,
            v_table_name=table_name)
        has_no_duplicate = self.plp_util.execute_query(
            query_check_no_duplicates).size == 0
        if has_no_duplicate:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()
        return has_no_duplicate
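
# Illustrative sketch (assumption): standalone data-quality checks on a
# staging table before promoting it, mirroring what load_aggregate_data()
# does internally. The dataset and table names are hypothetical.
def _example_validate_stage_table():
    validator = PipelineValidator("my-gcp-project", "daily_batch",
                                  "agg_pipeline", 42, 7)
    if not validator.validate_not_null("PLP_STG", "STG_AGG_TABLE"):
        raise NullRecordsFoundException("Null integration_id in stage table")
    if not validator.validate_no_duplicates("PLP_STG", "STG_AGG_TABLE"):
        raise DuplicateRecordsFoundException("Duplicates in stage table")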
class BatchManager():
    """ Batch Manager manages each batch as a single entity
    across all states. """

    project_id = ""
    batch_id = ""
    batch_run_id = 0
    start_time = ""
    plp_config = None
    plp_utils = None
    batch_status = "FAILED"
    batch_validator = None
    related_pipelines = []

    def __init__(self, project_id: str, batch_id: str):
        """ Batch Constructor """
        self.project_id = project_id
        self.batch_id = batch_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, batch_id)
        self.batch_config = self.plp_config.get_all_params()
        self.batch_run_id = int(self.batch_config['batch_run_id']) + 1
        self.batch_validator = BatchValidator(project_id, batch_id,
                                              self.batch_run_id)
        self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        self.related_pipelines = self.batch_config[
            'related_pipelines'].split(",")

    def __str__(self) -> str:
        """ String representation of the class """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : " + str(self.batch_run_id) + "]"

    def __repr__(self) -> str:
        """ String representation of the class """
        return self.__str__()

    def init_batch(self):
        """ Init step for the batch. """
        update_run_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN t
            SET t.batch_start_time = '{v_batch_start_time}'
            WHERE t.batch_id = '{v_batch_id}'
        """
        update_query = update_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_start_time=self.start_time)
        self.plp_utils.execute_op_query(update_query)
        insert_hist_query_str = """
            INSERT INTO {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H
            SELECT '{v_batch_id}',
                   {v_batch_run_id},
                   t.batch_name,
                   '{v_batch_status}',
                   TIMESTAMP('{v_batch_start_time}'),
                   safe.timestamp('1900-01-01 00:00:00'),
                   TIMESTAMP('{v_batch_start_time}'),
                   TIMESTAMP('{v_batch_start_time}')
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN t
            WHERE t.batch_id = '{v_batch_id}'
        """
        insert_hist_query = insert_hist_query_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id,
            v_batch_status="NEW",
            v_batch_start_time=self.start_time)
        self.plp_utils.execute_op_query(insert_hist_query)

    def finalize_batch(self):
        """ Finalize the batch. """
        end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_run_hist_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            SET t.batch_status = '{v_batch_status}',
                t.batch_end_time = '{v_batch_end_time}',
                t.target_update_date = '{v_batch_end_time}'
            WHERE t.batch_id = '{v_batch_id}'
              AND t.batch_run_id = {v_batch_run_id}
        """
        update_hist_query = update_run_hist_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id,
            v_batch_status=self.batch_status,
            v_batch_end_time=end_time)
        self.plp_utils.execute_op_query(update_hist_query)
        update_run_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN t
            SET t.batch_status = '{v_batch_status}',
                t.batch_end_time = '{v_batch_end_time}',
                t.target_update_date = '{v_batch_end_time}'
            WHERE t.batch_id = '{v_batch_id}'
        """
        update_query = update_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_status=self.batch_status,
            v_batch_end_time=end_time)
        self.plp_utils.execute_op_query(update_query)
        update_config_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PARAMETER t
            SET t.parameter_value = '{v_param_value}',
                t.target_update_date = '{v_target_update_date}'
            WHERE t.job_name = '{v_batch_id}'
              AND t.parameter_name = '{v_param_name}'
              AND EXISTS (SELECT 1
                          FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H
                          WHERE batch_id = '{v_batch_id}'
                            AND batch_run_id = {v_batch_run_id})
        """
        update_query = update_config_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id,
            v_param_name='batch_run_id',
            v_param_value=self.batch_run_id,
            v_target_update_date=end_time)
        self.plp_utils.execute_op_query(update_query)

    def execute_batch(self, batch_state: str):
        """ Execute the batch based on the input batch state. """
        print("Initiating batch....")
        print("- - - - - - - - - - - - -")
        print("Details: " + str(self))
        if batch_state == "BEGIN":
            print("Executing batch pre step....")
            self.init_batch()
            print("Batch pre step execution completed.")
            print("Batch initialization complete")
        elif batch_state == "END":
            print("Executing batch post step....")
            print("- - - - - - - - - - - - -")
            if self.batch_validator.validate_pipelines(self.related_pipelines):
                self.batch_status = "SUCCESS"
            else:
                raise RelatedPipelinesFailedException(
                    "Related pipelines in ["
                    + ", ".join(self.related_pipelines) + "] failed")
            print("- - - - - - - - - - - - -")
            self.finalize_batch()
            print("Batch post step complete..... Executing batch validations")
            print("- - - - - - - - - - - - -")
            self.batch_validator.validate_start_end_time()
            print("- - - - - - - - - - - - -")
            self.batch_validator.validate_batch_status()
            print("- - - - - - - - - - - - -")
            print("Batch validations complete.")
        print("Batch execution complete !!")
        print("==============================")
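
# Illustrative sketch (assumption): the BEGIN/END states are expected to
# bracket a batch run — BEGIN before the member pipelines execute, END after
# they finish. The project and batch IDs are hypothetical placeholders.
def _example_run_batch():
    batch = BatchManager("my-gcp-project", "daily_batch")
    batch.execute_batch("BEGIN")
    # ... member pipelines run here (e.g., via PipelineManager) ...
    batch.execute_batch("END")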
class BatchValidator():
    """ Utility class to augment a batch with all validations. """

    project_id = ""
    batch_id = ""
    validation_id = "Unassigned"
    validation_status = "NEW"
    batch_run_id = None
    plp_util = None

    def __init__(self, project_id: str, batch_id: str, batch_run_id: int):
        """ Constructor """
        self.project_id = project_id
        self.batch_id = batch_id
        self.batch_run_id = batch_run_id
        self.plp_util = PLPUtil()

    def __str__(self):
        """ String representation of the class """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status + "']"

    def __repr__(self):
        """ String representation of the class """
        return self.__str__()

    def validate_batch_status(self):
        """ Validate that the batch run was a success. """
        self.validation_id = self.batch_id + "__validate_status"
        self.validation_status = "FAILED"
        query_batch_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
              AND t.batch_run_id = {v_batch_run_id}
              AND t.batch_status = 'SUCCESS'
        """
        query_batch_run = query_batch_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id)
        if self.plp_util.execute_query(query_batch_run).size > 0:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_pipelines(self, list_str_pipeline_ids: list):
        """ Verify that the given pipelines were all successful. """
        self.validation_id = self.batch_id + "__validate_related_pipelines"
        # An empty pipeline list counts as success.
        pipeline_is_success = True
        self.validation_status = "SUCCESS"
        for str_pipeline_id in list_str_pipeline_ids:
            pipeline_is_success = self.verify_pipeline_success(
                str_pipeline_id.strip())
            if not pipeline_is_success:
                self.validation_status = "FAILED"
                break
        print("Validation :" + str(self))
        self.update_validation_status()
        return pipeline_is_success

    def verify_pipeline_success(self, pipeline_id: str):
        """ Verify that the given pipeline was successful. """
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            WHERE t.batch_id = '{v_batch_id}'
              AND t.pipeline_id = '{v_pipeline_id}'
              AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = query_pipeline_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_pipeline_id=pipeline_id)
        return self.plp_util.execute_query(query_pipeline_run).size > 0

    def validate_start_end_time(self):
        """ Validate the start and end times for the batch. """
        self.validation_id = self.batch_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_batch_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
              AND t.batch_run_id = {v_batch_run_id}
              AND t.batch_end_time > t.batch_start_time
        """
        query_batch_run = query_batch_run_str.format(
            v_project_id=self.project_id,
            v_batch_id=self.batch_id,
            v_batch_run_id=self.batch_run_id)
        if self.plp_util.execute_query(query_batch_run).size > 0:
            self.validation_status = "SUCCESS"
        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """ Update the validation status. """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
            SET t.validation_status = '{v_validation_status}',
                t.batch_run_id = {v_batch_run_id},
                t.last_validation_datetime = '{v_validation_datetime}',
                t.target_update_date = '{v_validation_datetime}',
        """
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
                t.last_success_datetime = '{v_validation_datetime}',
                t.last_success_batch_run_id = {v_batch_run_id}
            """
        else:
            update_status_qry_str_2 = """
                t.last_failure_datetime = '{v_validation_datetime}',
                t.last_failure_batch_run_id = {v_batch_run_id}
            """
        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"
        update_status_qry_str = update_status_qry_str_1 \
            + update_status_qry_str_2 \
            + update_status_qry_str_3
        update_status_qry = update_status_qry_str.format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_validation_id=self.validation_id)
        self.plp_util.execute_op_query(update_status_qry)