def __init__(self, project_id: str, batch_id: str,
              batch_run_id: int):
     """
         Constructor
     """
     self.project_id = project_id
     self.batch_id = batch_id
     self.batch_run_id = batch_run_id
     self.plp_util = PLPUtil()
예제 #2
0
 def __init__(self, project_id: str, pipeline_id: str):
     self.project_id = project_id
     self.pipeline_id = pipeline_id
     self.plp_utils = PLPUtil()
     self.plp_config = PLPConfiguration(project_id, pipeline_id)
     self.pipeline_config = self.plp_config.get_all_params()
     self.sender = self.pipeline_config['email_from']
     self.receiver = self.pipeline_config['email_to']
     self.subject = self.pipeline_config['email_subject']
예제 #3
0
 def __init__(self, project_id: str, batch_id: str):
     """
     Batch Constructor
     """
     self.project_id = project_id
     self.batch_id = batch_id
     self.plp_utils = PLPUtil()
     self.plp_config = PLPConfiguration(project_id, batch_id)
     self.batch_config = self.plp_config.get_all_params()
     self.batch_run_id = int(self.batch_config['batch_run_id']) + 1
     self.batch_validator = BatchValidator(project_id, batch_id,
                                           self.batch_run_id)
     self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
     self.related_pipelines = self.batch_config['related_pipelines'].split(
         ",")
예제 #4
0
 def __init__(self, project_id: str, batch_id: str, batch_run_id: str,
              pipeline_id: str, table_name: str):
     """
     Pipeline Constructor
     """
     self.project_id = project_id
     self.table_name = table_name
     self.batch_id = batch_id
     self.pipeline_id = pipeline_id
     self.batch_run_id = batch_run_id
     self.plp_utils = PLPUtil()
     self.plp_config = PLPConfiguration(project_id, pipeline_id)
     self.pipeline_config = self.plp_config.get_all_params()
     self.pipeline_run_id = int(self.pipeline_config['pipeline_run_id']) + 1
     self.pipeline_validator = PipelineValidator(self.project_id,
                                                 self.batch_id,
                                                 self.pipeline_id,
                                                 self.batch_run_id,
                                                 self.pipeline_run_id)
     self.last_extract_date = datetime.strptime(
         str(self.pipeline_config['last_extract_date']),
         '%Y-%m-%d %H:%M:%S.%f')
     self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
예제 #5
0
class PlpAudit():
    """ Data engineer job audit for both hive extract and aggregate job run"""
    plp_utils = None
    project_id = ""
    audit_dict = {}

    def __init__(self, project_id: str, pipeline_id: str):
        self.project_id = project_id
        self.pipeline_id = pipeline_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, pipeline_id)
        self.pipeline_config = self.plp_config.get_all_params()
        self.sender = self.pipeline_config['email_from']
        self.receiver = self.pipeline_config['email_to']
        self.subject = self.pipeline_config['email_subject']

    def pipeline_audit(self, log_check_query, load_query):
        """ Pipeline audit execute job audit and loads audit data """
        pipeline_status = False
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return pipeline_status
        else:
            pipeline_status = True
            for row in query_object:
                row_splitted = str(row).split("PIPELINE", 1)[1].split()
                self.audit_dict[row_splitted[0]] = row_splitted[1]
                pipeline_name = self.pipeline_config[row_splitted[2]]
                pipeline_load_query = load_query.format(
                    v_project_id=self.project_id,
                    v_batch_name=row_splitted[0],
                    v_batch_id=row_splitted[1],
                    v_pipeline_step_id=row_splitted[2],
                    v_pipeline_name=pipeline_name,
                    v_pipeline_log=row[0])
                print(pipeline_load_query)
                self.plp_utils.execute_op_query(pipeline_load_query)
                time.sleep(1)
            return pipeline_status

    def batch_audit(self, log_check_query, load_query):
        """ batch audit method audit batch audit data  """
        batch_status = False
        query_object = self.plp_utils.execute_query_job(log_check_query)
        res = query_object.result()
        if res.total_rows == 0:
            return batch_status
        else:
            batch_status = True
            for row in query_object:
                row_splitted = str(row).split("BATCH", 1)[1].split()
                batch_run_id = self.audit_dict[row_splitted[0]]
                batch_load_query = load_query.format(
                    v_project_id=self.project_id,
                    v_batch_log=row[0],
                    v_batch_name=row_splitted[0],
                    v_batch_id=batch_run_id)
                print(batch_load_query)
                self.plp_utils.execute_op_query(batch_load_query)
                time.sleep(1)
            return batch_status

    def execute_audit(self):
        """ execute audit method  execute job audit and loads audit data of each pipelines to plp audit table and sends status mail to receipents"""
        pipeline_start_pattern = '%PIPELINE%STARTED SUCCESSFULLY%'
        pipeline_complete_pattern = '%PIPELINE%COMPLETED SUCCESSFULLY%'
        pipeline_failed_pattern = '%PIPELINE%EXECUTION FAILED%'
        batch_begin_pattern = '%BATCH%BEGIN%STARTED SUCCESSFULLY%'
        batch_end_pattern = '%BATCH%END%COMPLETED SUCCESSFULLY%'
        batch_failed_pattern = '%BATCH%EXECUTION FAILED%'
        empty_table_query = self.pipeline_config['empty_table_query'].format(
            v_project_id=self.project_id)
        log_query_extract = self.pipeline_config['log_query_extract']
        insert_query = self.pipeline_config['insert_query']
        final_merge_query = self.pipeline_config['final_merge_query'].format(
            v_project_id=self.project_id)
        pipeline_end_update_query = self.pipeline_config['pipeline_end_update']
        update_pipeline_failed_query = self.pipeline_config[
            'update_pipeline_failed_log']
        update_batch_start_logs = self.pipeline_config[
            'update_batch_start_logs']
        update_batch_end_logs = self.pipeline_config['update_batch_end_logs']
        update_batch_failed_query = self.pipeline_config[
            'update_batch_failed_logs']
        email_alert_query = self.pipeline_config['email_alert_query'].format(
            v_project_id=self.project_id)
        if self.plp_utils.empty_table(empty_table_query):
            msg = " JOB IS NOT STARTED OR LOGS ARE NOT CREATED"
            mail_receipents = list(self.receiver.split(","))
            mail_alert = MailAlerts(self.sender, mail_receipents, self.subject,
                                    msg)
            mail_alert.mail_sender()
        else:
            pipeline_start_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=pipeline_start_pattern)
            pipeline_end_query = log_query_extract.format(
                v_project_id=self.project_id,
                v_filter=pipeline_complete_pattern)
            pipeline_failed_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=pipeline_failed_pattern)
            batch_start_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_begin_pattern)
            batch_end_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_end_pattern)
            batch_failed_query = log_query_extract.format(
                v_project_id=self.project_id, v_filter=batch_failed_pattern)
            if self.pipeline_audit(pipeline_start_query, insert_query):
                self.plp_utils.execute_query(final_merge_query)
                # Checking pipeline status failed if yes uploading batch failed status to audit table
                if self.pipeline_audit(pipeline_failed_query,
                                       update_pipeline_failed_query):
                    self.batch_audit(batch_failed_query,
                                     update_batch_failed_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                else:
                    self.pipeline_audit(pipeline_end_query,
                                        pipeline_end_update_query)
                    self.batch_audit(batch_start_query,
                                     update_batch_start_logs)
                    self.batch_audit(batch_end_query, update_batch_end_logs)
                # Sending mail for all the success and failed pipeline status
                mail_receipents = list(self.receiver.split(","))
                df_email = self.plp_utils.execute_query(email_alert_query)
                mail_alert_1 = MailAlerts(self.sender, mail_receipents,
                                          self.subject, df_email)
                mail_alert_1.mail_sender()
예제 #6
0
class PipelineManager():
    """
    Pipeline Manager manages each pipeline as a single entity
    across all states.
    """
    project_id = ""
    batch_id = ""
    batch_run_id = ""
    pipeline_id = ""
    pipeline_run_id = 0
    table_name = ""
    last_extract_date = ""
    start_time = ""
    pipeline_status = "FAILED"
    plp_utils = None
    pipeline_validator = None

    def __init__(self, project_id: str, batch_id: str, batch_run_id: str,
                 pipeline_id: str, table_name: str):
        """
        Pipeline Constructor
        """
        self.project_id = project_id
        self.table_name = table_name
        self.batch_id = batch_id
        self.pipeline_id = pipeline_id
        self.batch_run_id = batch_run_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, pipeline_id)
        self.pipeline_config = self.plp_config.get_all_params()
        self.pipeline_run_id = int(self.pipeline_config['pipeline_run_id']) + 1
        self.pipeline_validator = PipelineValidator(self.project_id,
                                                    self.batch_id,
                                                    self.pipeline_id,
                                                    self.batch_run_id,
                                                    self.pipeline_run_id)
        self.last_extract_date = datetime.strptime(
            str(self.pipeline_config['last_extract_date']),
            '%Y-%m-%d %H:%M:%S.%f')
        self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")

    def __str__(self) -> str:
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + self.batch_run_id \
            + "', Pipeline : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Table Name : '" + self.table_name + "']"

    def __repr__(self) -> str:
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + self.batch_run_id \
            + "', Pipeline : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Table Name : '" + self.table_name + "']"

    def execute_pre_step(self):
        """
        Pre step procedure for Pipeline.
        """
        insert_hist_query_str = """
            INSERT INTO {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H
            SELECT t.batch_id, {v_batch_run_id}, t.pipeline_id, {v_pipeline_run_id}, t.pipeline_name, 
            t.table_name, '{v_pipeline_status}', '{v_pipeline_error}', t.pipeline_start_time,
            TIMESTAMP('{v_pipeline_start_time}'), safe.timestamp('1900-01-01 00:00:00'),
            {v_prune_mins}, TIMESTAMP('{v_pipeline_start_time}'),
            TIMESTAMP('{v_pipeline_start_time}'), {v_days_prior_to_extract}
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            WHERE t.batch_id = '{v_batch_id}' AND t.pipeline_id = '{v_pipeline_id}' AND
            t.table_name = '{v_table_name}'
        """
        insert_hist_query = (insert_hist_query_str). \
            format(v_project_id=self.project_id,
                   v_pipeline_id=self.pipeline_id,
                   v_table_name=self.table_name,
                   v_pipeline_status="NEW",
                   v_pipeline_error="",
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id,
                   v_pipeline_start_time=self.start_time,
                   v_pipeline_end_time=None,
                   v_prune_mins=int(self. \
                       pipeline_config['prune_minutes']),
                   v_days_prior_to_extract=int(self. \
                       pipeline_config['days_prior_to_extract']),
                   v_batch_id=self.batch_id)

        self.plp_utils.execute_op_query(insert_hist_query)

    def execute_post_step(self):
        """
        Post step procedure for Pipeline.
        """
        self.pipeline_status = "SUCCESS"
        end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_run_hist_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            SET t.pipeline_status = '{v_pipeline_status}',
            t.error_message  = '{v_pipeline_error}',
            t.pipeline_end_time = '{v_pipeline_end_time}',
            t.target_update_date = '{v_pipeline_end_time}'
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.table_name = '{v_table_name}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.pipeline_run_id = {v_pipeline_run_id}
        """
        update_hist_query = (update_run_hist_str). \
            format(v_project_id=self.project_id,
                   v_pipeline_id=self.pipeline_id,
                   v_table_name=self.table_name,
                   v_pipeline_status=self.pipeline_status,
                   v_pipeline_error="",
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_end_time=end_time)

        self.plp_utils.execute_op_query(update_hist_query)

        update_run_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            SET t.pipeline_status = '{v_pipeline_status}',
            t.pipeline_start_time = '{v_pipeline_start_time}',
            t.pipeline_end_time = '{v_pipeline_end_time}',
            t.target_update_date = '{v_pipeline_end_time}',
            t.last_extract_date = '{v_last_extract_date}'
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.table_name = '{v_table_name}'
        """
        update_query = (update_run_str). \
            format(v_project_id=self.project_id,
                   v_pipeline_id=self.pipeline_id,
                   v_table_name=self.table_name,
                   v_pipeline_status=self.pipeline_status,
                   v_batch_id=self.batch_id,
                   v_pipeline_end_time=end_time,
                   v_pipeline_start_time=self.start_time,
                   v_last_extract_date=self.last_extract_date)

        self.plp_utils.execute_op_query(update_query)

        update_config_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PARAMETER t
            SET t.parameter_value = '{v_param_value}',
            t.target_update_date = '{v_target_update_date}'
            WHERE t.job_name = '{v_pipeline_id}'
            AND t.parameter_name = '{v_param_name}'
            AND EXISTS (SELECT 1 from {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H
                        WHERE batch_id = '{v_batch_id}'
                        AND pipeline_id = '{v_pipeline_id}'
                        AND table_name = '{v_table_name}'
                        AND batch_run_id = {v_batch_run_id}
                        AND pipeline_run_id = {v_pipeline_run_id})
        """

        update_query = (update_config_str). \
            format(v_project_id=self.project_id,
                   v_pipeline_id=self.pipeline_id,
                   v_param_name='last_extract_date',
                   v_param_value=self.start_time,
                   v_table_name=self.table_name,
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id,
                   v_batch_id=self.batch_id,
                   v_target_update_date=end_time)
        self.plp_utils.execute_op_query(update_query)

        update_query = (update_config_str). \
            format(v_project_id=self.project_id,
                   v_pipeline_id=self.pipeline_id,
                   v_param_name='pipeline_run_id',
                   v_param_value=self.pipeline_run_id,
                   v_table_name=self.table_name,
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id,
                   v_batch_id=self.batch_id,
                   v_target_update_date=end_time)
        self.plp_utils.execute_op_query(update_query)

    def load_aggregate_data(self):
        """
        Uploading aggregate table from view
        """
        # Initial load query for empty table
        str_initial_load_query = self.pipeline_config['initial_load_query']
        # Incremental query
        str_incremental_query = self.pipeline_config['incremental_query']
        # Name of schema were data view resides
        str_view_dataset_id = self.pipeline_config['view_dataset_id']
        stg_dataset_name = self.pipeline_config['stg_dataset_id']
        # Data view for target table
        str_data_view_name = self.pipeline_config['data_view_name']
        # Temporary table name
        stage_table = self.pipeline_config['stg_table_id']
        # Dataset of the input table
        dataset_id = self.pipeline_config['target_dataset_id']
        write_disposition = 'WRITE_TRUNCATE'
        initial_load_query = str_initial_load_query.format(
            self.project_id, str_view_dataset_id, str_data_view_name)
        stg_to_tgt_sql = """
            SELECT t.*, current_timestamp() as target_create_date,
                   current_timestamp() as target_update_date
            FROM {v_project_id}.{v_dataset_id}.{v_table_name} t
            WHERE snapshot_date = '{v_snapshot_date}'
        """
        if self.plp_utils.check_empty_table(self.project_id, dataset_id,
                                            self.table_name):
            self.plp_utils.load_query_to_bq_table(dataset_id, self.table_name,
                                                  initial_load_query,
                                                  write_disposition)
        else:
            self.last_extract_date.strftime("%Y-%m-%d %H:%M:%S.%f")
            days_prior_extract = int(
                self.pipeline_config['days_prior_to_extract'])
            list_snapshot_months = self.plp_utils. \
                get_months_between(self.last_extract_date \
                                   - timedelta(days=days_prior_extract))
            for snapshot_month in list_snapshot_months:
                print("Snapshot date: %s" % snapshot_month)
                incremental_query = str_incremental_query.format(
                    self.project_id, str_view_dataset_id, str_data_view_name,
                    snapshot_month)
                print("Incremental Query : %s" % incremental_query)
                self.plp_utils.load_query_to_bq_table(stg_dataset_name,
                                                      stage_table,
                                                      incremental_query)
                formatted_date = snapshot_month.strftime('%Y%m%d')
                target_partition_table = self.table_name + '$' + formatted_date
                has_no_null_records = self.pipeline_validator. \
                                          validate_not_null(
                                              stg_dataset_name,
                                              stage_table)
                has_no_duplicate_records = self.pipeline_validator. \
                                               validate_no_duplicates(
                                                   stg_dataset_name,
                                                   stage_table)
                stg_to_tgt_query = stg_to_tgt_sql.format(
                    v_project_id=self.project_id,
                    v_dataset_id=stg_dataset_name,
                    v_table_name=stage_table,
                    v_snapshot_date=snapshot_month)
                print(" Stage to Target Query : %s" % stg_to_tgt_query)
                if not has_no_null_records:
                    raise NullRecordsFoundException("Null records found in " +
                                                    stg_dataset_name + "." +
                                                    stage_table)
                if not has_no_duplicate_records:
                    raise DuplicateRecordsFoundException(
                        "Duplicate records found in " + stg_dataset_name +
                        "." + stage_table)
                if has_no_null_records and has_no_duplicate_records:
                    # load data to target table form stage table
                    self.plp_utils.load_query_to_bq_table(
                        dataset_id, target_partition_table, stg_to_tgt_query)

    def hive_to_bq_load(self):
        """
        Loading data from Hive to GCS
        """
        days_prior_extract = int(self.pipeline_config['days_prior_to_extract'])
        start_date = self.last_extract_date.date() - timedelta(
            days=days_prior_extract)
        end_date = datetime.strptime(self.start_time,
                                     '%Y-%m-%d %H:%M:%S.%f').date()
        try:
            gs_path = self.pipeline_config['gs_path']
            gcs_folder = self.pipeline_config['gcs_folder']
            gcs_file_name = self.pipeline_config['gcs_file_name']
            gcs_bucket_name = self.pipeline_config['gcs_bucket_name']
            # Initial Load Query
            str_initial_load_query = self.pipeline_config['initial_load_query']
            stage_dataset = self.pipeline_config['stage_dataset']
            stg_table_name = self.pipeline_config['stg_table_name']
            # Query to load data from staging to target table
            str_target_load_query = self.pipeline_config['target_load_query']
            target_load_query = str_target_load_query.format(self.project_id)
            write_disposition = 'WRITE_TRUNCATE'
            file_format = 'csv'
            hive_operations = HiveOperations()
            while start_date <= end_date:
                formatted_start_date = start_date.strftime('%Y%m%d')
                gcs_file_location = '{}/{}/{}/{}/{}_{}.{}'.format(
                    gs_path, gcs_folder,
                    str(formatted_start_date)[0:4],
                    str(formatted_start_date)[0:6], gcs_file_name,
                    formatted_start_date, file_format)
                prefix_file_location = '{}/{}/{}/'.format(
                    gcs_folder,
                    str(formatted_start_date)[0:4],
                    str(formatted_start_date)[0:6])
                processed_path = '{}/processed/{}/{}/'.format(
                    gcs_folder,
                    str(formatted_start_date)[0:4],
                    str(formatted_start_date)[0:6])
                hive_extract_query = str_initial_load_query.format(
                    formatted_start_date, formatted_start_date)
                hive_operations.extract_data_from_hive(hive_extract_query,
                                                       gcs_file_location)
                self.plp_utils.load_bq_table_from_gcs(
                    gcs_file_location, self.project_id, stage_dataset,
                    stg_table_name, write_disposition, file_format)
                self.plp_utils.execute_op_query(target_load_query)
                # moving gcs file to processed directory
                self.plp_utils.gcs_move_blob(gcs_bucket_name,
                                             prefix_file_location,
                                             processed_path)
                start_date += timedelta(days=1)

        except Exception as err_hive:
            print("Error in loading data from Hive to BQ")
            raise err_hive

    def execute_bq_query(self):
        """
        Method to execute a give SQL query. No output is rendered
        """
        pipeline_bq_query = self.pipeline_config["pipeline_bq_query"]. \
            format(v_project_id=self.project_id)
        self.plp_utils.execute_op_query(pipeline_bq_query)

    def execute_bq_load_query(self):
        """
        Method to execute a query and truncate load a table with out any SCD logic in place
        """
        str_pipeline_bq_load_query = self.pipeline_config[
            "pipeline_bq_load_query"]
        str_view_dataset_id = self.pipeline_config['view_dataset_id']
        str_data_view_name = self.pipeline_config['data_view_name']
        dataset_id = self.pipeline_config['target_dataset_id']
        write_disposition = 'WRITE_TRUNCATE'
        pipeline_bq_load_query = str_pipeline_bq_load_query.format(
            self.project_id, str_view_dataset_id, str_data_view_name)
        self.plp_utils.load_query_to_bq_table(dataset_id, self.table_name,
                                              pipeline_bq_load_query,
                                              write_disposition)

    def api_to_bq_load(self):
        """
        Method to load the API data to BQ
        """
        headers = {"X-AUTH-APIKEY": os.environ["DCIM_IBX_LIST_API_KEY"]}
        stage_dataset_name = str(
            self.pipeline_config['staging_dataset']).strip()
        stage_table_name = str(self.pipeline_config['stage_table'])
        dm_dataset_name = str(self.pipeline_config['dm_dataset']).strip()
        data_req = DataRequest(self.pipeline_config["get_ibxs_list_url"],
                               headers)
        data_req.make_get_req({"payLoad": "ibx_code"})
        add_col_n_vals = {
            "dcim_flag": "Y",
            "insert_date_time": self.start_time
        }
        data_req.load_api_res_to_bq(self.project_id, \
                                    stage_dataset_name + "." + stage_table_name, \
                                    "replace",
                                    add_col_n_vals)
        api_merge_query = self.pipeline_config["merge_dcim_plp_ibx"]. \
            format(v_project_id=self.project_id,
                   v_staging_dataset=stage_dataset_name,
                   v_dm_dataset=dm_dataset_name,
                   v_target_table=self.table_name,
                   v_stage_table=stage_table_name)
        self.plp_utils.execute_op_query(api_merge_query)

    def execute_pipeline(self, pipeline_type: str):
        """
        Function to pick and execute ETL Pipeline step.
        """
        print("Initiating pipeline....")
        print("- - - - - - - - - - - - -")
        print("Details: " + str(self))
        try:
            print("Executing pre step....")
            self.execute_pre_step()
            print("Pre step complete....executing pipeline step")
            if pipeline_type == 'hive_etl':
                self.hive_to_bq_load()
            elif pipeline_type == 'api_etl':
                self.api_to_bq_load()
            elif pipeline_type == 'aggregate_etl':
                self.load_aggregate_data()
            elif pipeline_type == 'query_etl':
                self.execute_bq_query()
            elif pipeline_type == 'query_load_etl':
                self.execute_bq_load_query()
            print("Pipeline step complete..... Executing post step")
            self.execute_post_step()
            print("Post step complete..... Executing pipeline validations")
            print("- - - - - - - - - - - - -")
            self.pipeline_validator.validate_pipeline_status()
            print("- - - - - - - - - - - - -")
            self.pipeline_validator.validate_start_end_time()
            print("- - - - - - - - - - - - -")
            print("Pipeline validations complete.")
        except Exception as err_pipeline:
            print("Unable to execute pipeline")
            raise err_pipeline
        print("Pipeline execution complete !!")
        print("------------------------------")
class PipelineValidator():
    """
    Utility Class to augment a pipeline with all validations.
    """
    project_id = ""
    batch_id = ""
    pipeline_id = ""
    validation_id = "Unassigned"
    validation_status = "NEW"
    batch_run_id = None
    pipeline_run_id = None
    plp_util = None
    null_validator = None

    def __init__(self, project_id: str, batch_id: str, pipeline_id: str,
                 batch_run_id: int, pipeline_run_id: int):
        """
            Constructor
        """
        self.project_id = project_id
        self.batch_id = batch_id
        self.pipeline_id = pipeline_id
        self.batch_run_id = batch_run_id
        self.pipeline_run_id = pipeline_run_id
        self.plp_util = PLPUtil()

    def __str__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : " + self.batch_run_id \
            + "', Pipeline ID : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def __repr__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : " + self.batch_run_id \
            + "', Pipeline ID : '" + self.pipeline_id \
            + "', Pipeline run ID : '" + str(self.pipeline_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def validate_pipeline_status(self):
        """
        Validate Pipeline was success
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_status"
        self.validation_status = "FAILED"
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.pipeline_run_id = {v_pipeline_run_id}
            AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = (query_pipeline_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_id=self.pipeline_id,
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id)

        if self.plp_util.execute_query(query_pipeline_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_start_end_time(self):
        """
        Validate Start and End date for the pipeline
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.pipeline_run_id = {v_pipeline_run_id}
            AND t.pipeline_end_time > t.pipeline_start_time
        """
        query_pipeline_run = (query_pipeline_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_id=self.pipeline_id,
                   v_batch_run_id=self.batch_run_id,
                   v_pipeline_run_id=self.pipeline_run_id)

        if self.plp_util.execute_query(query_pipeline_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """
        Method to update validation status
        """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
            SET t.validation_status = '{v_validation_status}',
            t.batch_run_id = {v_batch_run_id},
            t.pipeline_run_id = {v_pipeline_run_id},
            t.last_validation_datetime = '{v_validation_datetime}',
            t.target_update_date  = '{v_validation_datetime}',
        """
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
            t.last_success_datetime = '{v_validation_datetime}',
            t.last_success_batch_run_id = {v_batch_run_id},
            t.last_success_pipeline_run_id = {v_pipeline_run_id}
        """
        else:
            update_status_qry_str_2 = """
            t.last_failure_datetime = '{v_validation_datetime}',
            t.last_failure_batch_run_id = {v_batch_run_id},
            t.last_failure_pipeline_run_id = {v_pipeline_run_id}
        """

        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"

        update_status_qry_str = update_status_qry_str_1 \
                                + update_status_qry_str_2 \
                                + update_status_qry_str_3

        update_status_qry = (update_status_qry_str).format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_pipeline_run_id=self.pipeline_run_id,
            v_validation_id=self.validation_id)

        self.plp_util.execute_op_query(update_status_qry)

    def validate_not_null(self, dataset_id: str, table_name: str) -> bool:
        """
        Method to validate there are no null values in integration id column
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_not_null"
        self.validation_status = "FAILED"

        str_null_check_query = """
            SELECT COUNT(1) as null_count FROM {v_project_id}.{v_dataset}.{v_table_name}
            WHERE integration_id is null
        """

        null_check_query = str_null_check_query.format(
            v_project_id=self.project_id,
            v_dataset=dataset_id,
            v_table_name=table_name)

        null_check = self.plp_util.execute_query(null_check_query)
        has_no_null_records = null_check['null_count'].values[0] == 0

        if has_no_null_records:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

        return has_no_null_records

    def validate_no_duplicates(self, dataset_id: str, table_name: str) -> bool:
        """
        Method to vaidate there are no duplicates in the integration id column
        """
        self.validation_id = self.batch_id + "__" \
                        + self.pipeline_id + "__validate_no_duplicates"
        self.validation_status = "FAILED"
        str_query_check_no_duplicates = """
            SELECT count(1)
            FROM {v_project_id}.{v_dataset}.{v_table_name}
            GROUP BY integration_id having count(*) > 1
        """
        query_check_no_duplicates = \
            str_query_check_no_duplicates.format(v_project_id=self.project_id,
                                                 v_dataset=dataset_id,
                                                 v_table_name=table_name)

        has_no_duplicate = self.plp_util.execute_query(
            query_check_no_duplicates).size == 0

        if has_no_duplicate:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

        return has_no_duplicate
예제 #8
0
class BatchManager():
    """
    Batch Manager manages each batch as a single entity
    across all states.
    """
    project_id = ""
    batch_id = ""
    batch_run_id = 0
    start_time = ""

    plp_config = None
    plp_utils = None
    batch_status = "FAILED"
    batch_validator = None
    related_pipelines = []

    def __init__(self, project_id: str, batch_id: str):
        """
        Batch Constructor
        """
        self.project_id = project_id
        self.batch_id = batch_id
        self.plp_utils = PLPUtil()
        self.plp_config = PLPConfiguration(project_id, batch_id)
        self.batch_config = self.plp_config.get_all_params()
        self.batch_run_id = int(self.batch_config['batch_run_id']) + 1
        self.batch_validator = BatchValidator(project_id, batch_id,
                                              self.batch_run_id)
        self.start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        self.related_pipelines = self.batch_config['related_pipelines'].split(
            ",")

    def __str__(self) -> str:
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : " + str(self.batch_run_id) + "]"

    def __repr__(self) -> str:
        """
        String representation of the class
        """
        return "[Project ID: '" + self.project_id \
            + "', Batch : '" + self.batch_id \
            + "', Batch run ID : " + self.batch_run_id + "]"

    def init_batch(self):
        """
        Init Step for Batch
        """
        update_run_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN t
            SET t.batch_start_time = '{v_batch_start_time}'
            WHERE t.batch_id = '{v_batch_id}'
        """
        update_query = (update_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_start_time=self.start_time)

        self.plp_utils.execute_op_query(update_query)

        insert_hist_query_str = """
            INSERT INTO {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H
            SELECT '{v_batch_id}', {v_batch_run_id}, t.batch_name, '{v_batch_status}',
            TIMESTAMP('{v_batch_start_time}'), safe.timestamp('1900-01-01 00:00:00'),
            TIMESTAMP('{v_batch_start_time}'), TIMESTAMP('{v_batch_start_time}')
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN t
            WHERE t.batch_id = '{v_batch_id}'
        """
        insert_hist_query = (insert_hist_query_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id,
                   v_batch_status="NEW",
                   v_batch_start_time=self.start_time)

        self.plp_utils.execute_op_query(insert_hist_query)

    def finalize_batch(self):
        """
        Finalize the Batch.
        """
        end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_run_hist_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            SET t.batch_status = '{v_batch_status}',
            t.batch_end_time = '{v_batch_end_time}',
            t.target_update_date = '{v_batch_end_time}'
            WHERE t.batch_id = '{v_batch_id}'
            AND t.batch_run_id = {v_batch_run_id}
        """
        update_hist_query = (update_run_hist_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id,
                   v_batch_status=self.batch_status,
                   v_batch_end_time=end_time)

        self.plp_utils.execute_op_query(update_hist_query)

        update_run_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN t
            SET t.batch_status = '{v_batch_status}',
            t.batch_end_time = '{v_batch_end_time}',
            t.target_update_date = '{v_batch_end_time}'
            WHERE t.batch_id = '{v_batch_id}'
        """
        update_query = (update_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_status=self.batch_status,
                   v_batch_end_time=end_time)

        self.plp_utils.execute_op_query(update_query)

        update_config_str = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PARAMETER t
            SET t.parameter_value = '{v_param_value}',
            t.target_update_date = '{v_target_update_date}'
            WHERE t.job_name = '{v_batch_id}'
            AND t.parameter_name = '{v_param_name}'
            AND EXISTS (SELECT 1 from {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H
                        WHERE batch_id = '{v_batch_id}'
                        AND batch_run_id = {v_batch_run_id})
        """

        update_query = (update_config_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id,
                   v_param_name='batch_run_id',
                   v_param_value=self.batch_run_id,
                   v_target_update_date=end_time)
        self.plp_utils.execute_op_query(update_query)

    def execute_batch(self, batch_state: str):
        """
        Method to execute the batch based on input batch state.
        """
        print("Initiating batch....")
        print("- - - - - - - - - - - - -")
        print("Details: " + str(self))
        if batch_state == "BEGIN":
            print("Executing batch pre step....")
            self.init_batch()
            print("Batch pre step execution completed.")
            print("Batch initialization complete")
        elif batch_state == "END":
            print("Executing batch post step....")
            print("- - - - - - - - - - - - -")
            if self.batch_validator.validate_pipelines(self.related_pipelines):
                self.batch_status = "SUCCESS"
            else:
                raise RelatedPipelinesFailedException(
                    "Related pipelined in [" + self.related_pipelines +
                    "] failed")
            print("- - - - - - - - - - - - -")
            self.finalize_batch()
            print("Batch Post step complete..... Executing batch validations")
            print("- - - - - - - - - - - - -")
            self.batch_validator.validate_start_end_time()
            print("- - - - - - - - - - - - -")
            self.batch_validator.validate_batch_status()
            print("- - - - - - - - - - - - -")
            print("Batch validations complete.")
            print("Batch execution complete !!")
        print("==============================")
class BatchValidator():
    """
    Utility Class to augment a batch with all validations.
    """
    project_id = ""
    batch_id = ""
    validation_id = "Unassigned"
    validation_status = "NEW"
    batch_run_id = None
    plp_util = None

    def __init__(self, project_id: str, batch_id: str,
                 batch_run_id: int):
        """
            Constructor
        """
        self.project_id = project_id
        self.batch_id = batch_id
        self.batch_run_id = batch_run_id
        self.plp_util = PLPUtil()

    def __str__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def __repr__(self):
        """
        String representation of the class
        """
        return "[Project ID : '" + self.project_id \
            + "', Batch ID : '" + self.batch_id \
            + "', Batch run ID : '" + str(self.batch_run_id) \
            + "', Validation ID : '" + self.validation_id \
            + "', Validation Status : '" + self.validation_status +"']"

    def validate_batch_status(self):
        """
        Validate Batch was success
        """
        self.validation_id = self.batch_id + "__validate_status"
        self.validation_status = "FAILED"
        query_batch_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.batch_status = 'SUCCESS'
        """
        query_batch_run = (query_batch_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id)

        if self.plp_util.execute_query(query_batch_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def validate_pipelines(self, list_str_pipeline_ids: []):
        """
        Method to verify that the given pipelines are successful.
        """
        self.validation_id = self.batch_id + "__validate_related_pipelines"

        for str_pipeline_id in list_str_pipeline_ids:
            pipeline_is_success = self.verify_pipeline_success(str_pipeline_id.strip())
            if pipeline_is_success:
                self.validation_status = "SUCCESS"
                pipeline_is_success = True
            else:
                self.validation_status = "FAILED"
                pipeline_is_success = False
                break

        print("Validation :" + str(self))
        self.update_validation_status()

        return pipeline_is_success

    def verify_pipeline_success(self, pipeline_id: str):
        """
        Verify give pipeline was successful
        """
        query_pipeline_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_PIPELINE_RUN t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.pipeline_id = '{v_pipeline_id}'
            AND t.pipeline_status = 'SUCCESS'
        """
        query_pipeline_run = (query_pipeline_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_pipeline_id=pipeline_id)

        return self.plp_util.execute_query(query_pipeline_run).size > 0

    def validate_start_end_time(self):
        """
        Validate Start and End date for the batch
        """
        self.validation_id = self.batch_id + "__validate_start_end_time"
        self.validation_status = "FAILED"
        query_batch_run_str = """
            SELECT COUNT(1)
            FROM {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H t
            WHERE t.batch_id = '{v_batch_id}'
            AND t.batch_run_id = {v_batch_run_id}
            AND t.batch_end_time > t.batch_start_time
        """
        query_batch_run = (query_batch_run_str). \
            format(v_project_id=self.project_id,
                   v_batch_id=self.batch_id,
                   v_batch_run_id=self.batch_run_id)

        if self.plp_util.execute_query(query_batch_run).size > 0:
            self.validation_status = "SUCCESS"

        print("Validation :" + str(self))
        self.update_validation_status()

    def update_validation_status(self):
        """
        Method to update validation status
        """
        curr_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        update_status_qry_str_1 = """
            UPDATE {v_project_id}.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_VALIDATIONS t
            SET t.validation_status = '{v_validation_status}',
            t.batch_run_id = {v_batch_run_id},
            t.last_validation_datetime = '{v_validation_datetime}',
            t.target_update_date  = '{v_validation_datetime}',
        """
        if self.validation_status == "SUCCESS":
            update_status_qry_str_2 = """
            t.last_success_datetime = '{v_validation_datetime}',
            t.last_success_batch_run_id = {v_batch_run_id}
        """
        else:
            update_status_qry_str_2 = """
            t.last_failure_datetime = '{v_validation_datetime}',
            t.last_failure_batch_run_id = {v_batch_run_id}
        """

        update_status_qry_str_3 = "WHERE t.validation_id = '{v_validation_id}'"

        update_status_qry_str = update_status_qry_str_1 \
                                + update_status_qry_str_2 \
                                + update_status_qry_str_3

        update_status_qry = (update_status_qry_str).format(
            v_project_id=self.project_id,
            v_validation_status=self.validation_status,
            v_validation_datetime=curr_time,
            v_batch_run_id=self.batch_run_id,
            v_validation_id=self.validation_id)

        self.plp_util.execute_op_query(update_status_qry)