Example #1
 def get_config_details(dataset_name, table_name, etl_project_id):
     """
     Getting the configuration details from control table
     """
     incr_query = ConnectBqCtlTable.get_incr_query(dataset_name, table_name,
                                                   etl_project_id)
     query_job = (incr_query).format(dataset_name, table_name,
                                     etl_project_id)
     job = ExecutePipeline.client.query(query_job)
     result = job.result()
     param_dataframe = ExecutePipeline.client.query(
         query_job).to_dataframe()
     batch_name = dataset_name
     table_name = param_dataframe['table_name'].values[0]
     query = param_dataframe['query'].values[0]
     gs_path = param_dataframe['gs_path'].values[0]
     gcs_folder = param_dataframe['gcs_folder'].values[0]
     gcs_file_name = param_dataframe['gcs_file_name'].values[0]
     strg_project_id = param_dataframe['strg_project_id'].values[0]
     bq_project_id = param_dataframe['bq_project_id'].values[0]
     final_schema = param_dataframe['final_schema'].values[0]
     stg_schema = param_dataframe['stg_schema'].values[0]
     prune_min = int(param_dataframe['prune_minutes'].values[0])
     bq_stg_table = param_dataframe['bq_stg_table'].values[0]
     pipeline_run_id = param_dataframe['pipeline_run_id'].values[0]
     last_extract_date = datetime.datetime.strptime(
         str(param_dataframe['last_extract_date'].values[0]),
         '%Y-%m-%dT%H:%M:%S.%f000')
     # Reuse the timestamp parsed above instead of parsing it a second time.
     start_date = last_extract_date.strftime('%Y%m%d')
     end_date = datetime.date.today().strftime('%Y%m%d')
     return (batch_name, table_name, query, start_date, end_date, gs_path,
             gcs_folder, gcs_file_name, last_extract_date, bq_stg_table,
             strg_project_id, bq_project_id, final_schema, stg_schema,
             prune_min, pipeline_run_id)
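
The 16-element return tuple has to be unpacked positionally by the caller. A minimal sketch of such a call site, assuming the function above is in scope (the argument values are illustrative, not from the source):

 # Hypothetical call site: the names must follow the documented tuple order.
 (batch_name, table_name, query, start_date, end_date, gs_path, gcs_folder,
  gcs_file_name, last_extract_date, bq_stg_table, strg_project_id,
  bq_project_id, final_schema, stg_schema, prune_min,
  pipeline_run_id) = get_config_details('sales_ds', 'orders',
                                        'my-etl-project')
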
 def load_job_run_tbl(etl_project_id):
     """
     Loading the final control table
     """
     merge_query = ConnectBqCtlTable.get_batch_merge_query(
         str(etl_project_id))
     query_job = merge_query.format(etl_project_id, ExecuteBatch.batch_id)
     job = ExecuteBatch.client.query(query_job)
     job.result()
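
All of these snippets lean on module-level state that is not shown: the `datetime` and `google.cloud.bigquery` imports, plus classes holding a shared client and the current batch id. A minimal sketch of that assumed scaffolding (the class shapes and the batch id value are guesses, not confirmed by the source):

 import datetime

 from google.cloud import bigquery


 class ExecutePipeline:
     # Assumed: one client shared by the pipeline-level helpers.
     client = bigquery.Client()


 class ExecuteBatch:
     # Assumed: one client plus the current batch id for the batch helpers.
     client = bigquery.Client()
     batch_id = '20240101_001'  # illustrative value
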
Example #3
 def load_hist_update_pipeline(self):
     param = self.extract_update_param()
     upd_query = ConnectBqCtlTable.get_agg_update_query(
         self.project_id, self.dataset)
     for job_name, pipeline_id in param[
             ['job_name', 'pipeline_id']].itertuples(index=False):
         job = upd_query.format(pipeline_id, self.project_id)
         job_load = client.query(job)  # assumes a module-level bigquery.Client
         job_load.result()
         print("updated control table for pipeline %s" % pipeline_id)
 def load_hist_job_run_tbl(etl_project_id):
     """
     Loading the history control table basedon parameters
     """
     insert_query = ConnectBqCtlTable.get_batch_insert_query(
         '{}'.format(etl_project_id))
     query_job = (insert_query).format(
         etl_project_id, ExecuteBatch.batch_id)  ##need to check
     job = ExecuteBatch.client.query(query_job)
     job.result()
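
The `ConnectBqCtlTable` getters evidently return SQL templates whose `{}` placeholders the callers fill positionally with `str.format`. To illustrate that contract, a hypothetical shape for `get_batch_insert_query` on `ConnectBqCtlTable` (the table and column names are invented):

 @staticmethod
 def get_batch_insert_query(etl_project_id):
     # Hypothetical template: the caller fills the placeholders with
     # (etl_project_id, batch_id), in that order.
     return ("insert into `{0}.CTL_METADATA.BATCH_RUN_H` (batch_id, state) "
             "values ('{1}', 'RUNNING')")
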
Example #5
 def load_hist_job_run_tbl_upd(etl_project_id, state):
     """
     Loading the history control table basedon parameters
     """
     update_query = ConnectBqCtlTable.get_batch_update_query(
         '{}'.format(etl_project_id))
     query_job = (update_query).format(etl_project_id,
                                       ExecuteBatch.batch_id, state)
     job = ExecuteBatch.client.query(query_job)
     job.result()
Example #6
 def bq_load_final(batch_name, table, bq_project_id):
     """
     Load data into the bq_final table.
     """
     final_target_merge_query = ConnectBqCtlTable.get_final_target_merge_query(
         batch_name, table, bq_project_id)
     client = bigquery.Client()
     query_job = final_target_merge_query.format(bq_project_id)
     job = client.query(query_job)
     job.result()
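
In every example the `job.result()` call blocks until the job completes and raises if it failed, so no explicit status polling is needed. If a failure should be logged with context before propagating, one possible wrapper (using only public `QueryJob` attributes) looks like this:

 from google.cloud.exceptions import GoogleCloudError


 def run_query_checked(client, sql):
     # Sketch: report a failed BigQuery job before re-raising the error.
     job = client.query(sql)
     try:
         return job.result()  # waits for completion; raises on failure
     except GoogleCloudError:
         print("query job %s failed: %s" % (job.job_id, job.error_result))
         raise
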
Example #7
 def load_job_run_tbl(job_id, batch_name, table_name, etl_project_id):
     """
     Loading the final control table
     """
     merge_query = ConnectBqCtlTable.get_merge_query(
         str(batch_name), str(table_name), str(etl_project_id))
     query_job = merge_query.format(etl_project_id, batch_name, table_name)
     job = ExecutePipeline.client.query(query_job)
     job.result()
Example #8
 def load_hist_job_run_tbl(job_id, batch_name, table_name, table,
                           batch_start_time, prune_min, pipeline_run_id,
                           etl_project_id, batch_run_id):
     """
     Load the history control table based on the given parameters.
     """
     insert_query = ConnectBqCtlTable.get_insert_query(
         str(batch_name), str(table_name), str(etl_project_id))
     query_job = insert_query.format(etl_project_id, job_id, batch_name,
                                     table, batch_start_time, prune_min,
                                     pipeline_run_id, batch_run_id)
     job = ExecutePipeline.client.query(query_job)
     job.result()
Example #9
 def load_hist_pipeline_job(self):
     """
     Insert one history row per pipeline returned by extract_ins_param.
     """
     params = self.extract_ins_param()
     query = ConnectBqCtlTable.get_agg_insert_query(self.project_id,
                                                    self.dataset)
     client = bigquery.Client()
     cols = ['job_name', 'table_name', 'pipeline_id', 'prune_min',
             'job_name_id']
     for (job_name, table_name, pipeline_id, prune_min,
          job_name_id) in params[cols].itertuples(index=False):
         job = query.format(pipeline_id, job_name_id, table_name,
                            self.project_id, self.batch_run_id)
         job_load = client.query(job)
         job_load.result()
         print("inserted history row for pipeline %s" % pipeline_id)
Example #10
 def load_hist_job_run_tbl_upd(job_id, batch_name, table_name, state,
                               last_extract_date, batch_start_time,
                               batch_end_time, etl_project_id,
                               batch_run_id):
     """
     Load the history control table based on the given parameters.
     """
     update_query = ConnectBqCtlTable.get_update_query(
         str(batch_name), str(table_name), str(etl_project_id))
     query_job = update_query.format(etl_project_id, batch_name,
                                     table_name, last_extract_date,
                                     batch_start_time, batch_end_time,
                                     state, batch_run_id)
     job = ExecutePipeline.client.query(query_job)
     job.result()
Example #11
 def load_hist_job_run_tbl(etl_project_id):
     """
     Loading the history control table basedon parameters
     """
     insert_query = ConnectBqCtlTable.get_batch_insert_query(
         '{}'.format(etl_project_id))
     query_job = (insert_query).format(
         etl_project_id, ExecuteBatch.batch_id)  ##need to check
     job = ExecuteBatch.client.query(query_job)
     job.result()
     query = 'select max(batch_run_id) as batch_run_id, batch_name  from `analytics-plp-uat.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H` where batch_name = \'BCM_DCIM\' group by batch_name'
     job = ExecuteBatch.client.query(query)
     results = job.result()
     for row in results:
         print("batch_name : ", row.batch_name)
         print("batch_run_id : ", row.batch_run_id)