import datetime

from google.cloud import bigquery


def get_config_details(dataset_name, table_name, etl_project_id):
    """Get the configuration details from the control table."""
    incr_query = ConnectBqCtlTable.get_incr_query(dataset_name, table_name,
                                                  etl_project_id)
    query_job = incr_query.format(dataset_name, table_name, etl_project_id)
    # Run the query once and load the result set into a dataframe.
    param_dataframe = ExecutePipeline.client.query(query_job).to_dataframe()
    batch_name = dataset_name
    table_name = param_dataframe['table_name'].values[0]
    query = param_dataframe['query'].values[0]
    gs_path = param_dataframe['gs_path'].values[0]
    gcs_folder = param_dataframe['gcs_folder'].values[0]
    gcs_file_name = param_dataframe['gcs_file_name'].values[0]
    strg_project_id = param_dataframe['strg_project_id'].values[0]
    bq_project_id = param_dataframe['bq_project_id'].values[0]
    final_schema = param_dataframe['final_schema'].values[0]
    stg_schema = param_dataframe['stg_schema'].values[0]
    prune_min = int(param_dataframe['prune_minutes'].values[0])
    bq_stg_table = param_dataframe['bq_stg_table'].values[0]
    pipeline_run_id = param_dataframe['pipeline_run_id'].values[0]
    last_extract_date = datetime.datetime.strptime(
        str(param_dataframe['last_extract_date'].values[0]),
        '%Y-%m-%dT%H:%M:%S.%f000')
    start_date = last_extract_date.strftime('%Y%m%d')
    end_date = datetime.date.today().strftime('%Y%m%d')
    return (batch_name, table_name, query, start_date, end_date, gs_path,
            gcs_folder, gcs_file_name, last_extract_date, bq_stg_table,
            strg_project_id, bq_project_id, final_schema, stg_schema,
            prune_min, pipeline_run_id)

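# Hedged usage sketch (illustrative, not part of the original module): shows
# how the configuration tuple returned above might be unpacked by a caller.
# The dataset, table and project IDs are placeholder assumptions.
(batch_name, table_name, query, start_date, end_date, gs_path, gcs_folder,
 gcs_file_name, last_extract_date, bq_stg_table, strg_project_id,
 bq_project_id, final_schema, stg_schema, prune_min,
 pipeline_run_id) = get_config_details('example_dataset', 'example_table',
                                       'example-etl-project')
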
def load_job_run_tbl(etl_project_id):
    """Loading the final control table."""
    merge_query = ConnectBqCtlTable.get_batch_merge_query(etl_project_id)
    query_job = merge_query.format(etl_project_id, ExecuteBatch.batch_id)
    job = ExecuteBatch.client.query(query_job)
    job.result()

def load_hist_update_pipeline(self):
    param = self.extract_update_param()
    upd_query = ConnectBqCtlTable.get_agg_update_query(self.project_id,
                                                       self.dataset)
    client = bigquery.Client()
    for job_name, pipeline_id in param[
            ['job_name', 'pipeline_id']].itertuples(index=False):
        job = upd_query.format(pipeline_id, self.project_id)
        job_load = client.query(job)
        job_load.result()
        print("updated history table for pipeline_id %s" % pipeline_id)

def load_hist_job_run_tbl(etl_project_id):
    """Loading the history control table based on the given parameters."""
    insert_query = ConnectBqCtlTable.get_batch_insert_query(etl_project_id)
    query_job = insert_query.format(etl_project_id,
                                    ExecuteBatch.batch_id)  # need to check
    job = ExecuteBatch.client.query(query_job)
    job.result()

def load_hist_job_run_tbl_upd(etl_project_id, state):
    """Loading the history control table based on the given parameters."""
    update_query = ConnectBqCtlTable.get_batch_update_query(etl_project_id)
    query_job = update_query.format(etl_project_id, ExecuteBatch.batch_id,
                                    state)
    job = ExecuteBatch.client.query(query_job)
    job.result()

def bq_load_final(batch_name, table, bq_project_id):
    """Load data into the BigQuery final table via the target merge query."""
    final_target_merge_query = ConnectBqCtlTable.get_final_target_merge_query(
        batch_name, table, bq_project_id)
    client = bigquery.Client()
    query_job = final_target_merge_query.format(bq_project_id)
    job = client.query(query_job)
    job.result()

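# Hedged usage sketch (illustrative): the batch, table and project values are
# placeholder assumptions; the call simply executes the templated MERGE
# returned by ConnectBqCtlTable.get_final_target_merge_query.
bq_load_final('example_batch', 'example_table', 'example-bq-project')
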
def load_job_run_tbl(job_id, batch_name, table_name, etl_project_id):
    """Loading the final control table."""
    merge_query = ConnectBqCtlTable.get_merge_query(batch_name, table_name,
                                                    etl_project_id)
    query_job = merge_query.format(etl_project_id, batch_name, table_name)
    job = ExecutePipeline.client.query(query_job)
    job.result()

def load_hist_job_run_tbl(job_id, batch_name, table_name, table,
                          batch_start_time, prune_min, pipeline_run_id,
                          etl_project_id, batch_run_id):
    """Loading the history control table based on the given parameters."""
    insert_query = ConnectBqCtlTable.get_insert_query(batch_name, table_name,
                                                      etl_project_id)
    query_job = insert_query.format(etl_project_id, job_id, batch_name, table,
                                    batch_start_time, prune_min,
                                    pipeline_run_id, batch_run_id)
    job = ExecutePipeline.client.query(query_job)
    job.result()

def load_hist_pipeline_job(self):
    params = self.extract_ins_param()
    query = ConnectBqCtlTable.get_agg_insert_query(self.project_id,
                                                   self.dataset)
    client = bigquery.Client()
    for (job_name, table_name, pipeline_id, prune_min,
         job_name_id) in params[['job_name', 'table_name', 'pipeline_id',
                                 'prune_min',
                                 'job_name_id']].itertuples(index=False):
        job = query.format(pipeline_id, job_name_id, table_name,
                           self.project_id, self.batch_run_id)
        job_load = client.query(job)
        job_load.result()
        print("inserted history row for pipeline_id %s" % pipeline_id)

def load_hist_job_run_tbl_upd(job_id, batch_name, table_name, state,
                              last_extract_date, batch_start_time,
                              batch_end_time, etl_project_id, batch_run_id):
    """Loading the history control table based on the given parameters."""
    update_query = ConnectBqCtlTable.get_update_query(batch_name, table_name,
                                                      etl_project_id)
    query_job = update_query.format(etl_project_id, batch_name, table_name,
                                    last_extract_date, batch_start_time,
                                    batch_end_time, state, batch_run_id)
    job = ExecutePipeline.client.query(query_job)
    job.result()

def load_hist_job_run_tbl(etl_project_id):
    """Loading the history control table based on the given parameters."""
    insert_query = ConnectBqCtlTable.get_batch_insert_query(etl_project_id)
    query_job = insert_query.format(etl_project_id,
                                    ExecuteBatch.batch_id)  # need to check
    job = ExecuteBatch.client.query(query_job)
    job.result()
    # Look up the latest batch_run_id; the project, dataset and batch name
    # below are hardcoded in this query.
    query = ("select max(batch_run_id) as batch_run_id, batch_name "
             "from `analytics-plp-uat.PLP_BQ_CTL_METADATA.PLP_BQ_CTL_BATCH_RUN_H` "
             "where batch_name = 'BCM_DCIM' group by batch_name")
    job = ExecuteBatch.client.query(query)
    results = job.result()
    for row in results:
        print("batch_name : ", row.batch_name)
        print("batch_run_id : ", row.batch_run_id)

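# Hedged orchestration sketch (illustrative, not part of the original module):
# one possible per-table flow stitched together from the pipeline-level helpers
# above (the variants of load_hist_job_run_tbl / load_hist_job_run_tbl_upd that
# take job_id and batch_name). The job_id format, the 'SUCCESS' state value and
# the timestamps are placeholder assumptions.
def run_table_load(dataset_name, table_name, etl_project_id, batch_run_id):
    (batch_name, table, query, start_date, end_date, gs_path, gcs_folder,
     gcs_file_name, last_extract_date, bq_stg_table, strg_project_id,
     bq_project_id, final_schema, stg_schema, prune_min,
     pipeline_run_id) = get_config_details(dataset_name, table_name,
                                           etl_project_id)
    batch_start_time = datetime.datetime.now()
    job_id = '{}_{}'.format(batch_name, table)  # placeholder job identifier
    load_hist_job_run_tbl(job_id, batch_name, table_name, table,
                          batch_start_time, prune_min, pipeline_run_id,
                          etl_project_id, batch_run_id)
    bq_load_final(batch_name, table, bq_project_id)
    load_hist_job_run_tbl_upd(job_id, batch_name, table_name, 'SUCCESS',
                              last_extract_date, batch_start_time,
                              datetime.datetime.now(), etl_project_id,
                              batch_run_id)
    load_job_run_tbl(job_id, batch_name, table_name, etl_project_id)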