def _transfer_bigquery_to_gcs(dag, task_id) -> models.BaseOperator:
    """Builds an operator that transfers the final pipeline output to GCS.

    Args:
        dag: The DAG to add this operator to.
        task_id: ID for this specific task within the DAG.

    Returns:
        Operator to use within a DAG to run the pipeline step that moves
        records to GCS.
    """
    storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
        blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)
    final_output_uri = '{path}/result-{timestamp}-*.json'.format(
        path=storage_vars['gcs_output_path'], timestamp=int(time.time()))
    final_output_table = '{project}.{dataset}.final_output'.format(
        project=storage_vars['bq_working_project'],
        dataset=storage_vars['bq_working_dataset'])
    return bigquery_to_gcs.BigQueryToGCSOperator(
        task_id=task_id,
        source_project_dataset_table=final_output_table,
        destination_cloud_storage_uris=[final_output_uri],
        export_format='NEWLINE_DELIMITED_JSON',
        dag=dag)
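
# Illustrative usage sketch (not part of the original module): assumes an
# airflow.models.DAG instance named `dag` is already defined; the task_id
# value below is a hypothetical example.
transfer_final_output = _transfer_bigquery_to_gcs(
    dag=dag, task_id='transfer_final_output_to_gcs')
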
# Get the table list from master file
all_records = read_table_list(table_list_file_path)

# Loop over each record in the 'all_records' python list to build up
# Airflow tasks
for record in all_records:
    logger.info('Generating tasks to transfer table: {}'.format(record))

    table_source = record['table_source']
    table_dest = record['table_dest']

    BQ_to_GCS = bigquery_to_gcs.BigQueryToGCSOperator(
        # Replace ":" with valid character for Airflow task
        task_id='{}_BQ_to_GCS'.format(table_source.replace(":", "_")),
        source_project_dataset_table=table_source,
        destination_cloud_storage_uris=[
            '{}-*.avro'.format('gs://' + source_bucket + '/' + table_source)
        ],
        export_format='AVRO')

    GCS_to_GCS = gcs_to_gcs.GCSToGCSOperator(
        # Replace ":" with valid character for Airflow task
        task_id='{}_GCS_to_GCS'.format(table_source.replace(":", "_")),
        source_bucket=source_bucket,
        source_object='{}-*.avro'.format(table_source),
        destination_bucket=dest_bucket,
        # destination_object='{}-*.avro'.format(table_dest)
    )

    GCS_to_BQ = gcs_to_bigquery.GCSToBigQueryOperator(
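        # The excerpt is truncated here; the remaining arguments and the task
        # ordering below are a plausible completion sketch, not the original
        # code. Parameter values (write_disposition, autodetect) are
        # assumptions.
        # Replace ":" with valid character for Airflow task
        task_id='{}_GCS_to_BQ'.format(table_dest.replace(":", "_")),
        bucket=dest_bucket,
        source_objects=['{}-*.avro'.format(table_source)],
        destination_project_dataset_table=table_dest,
        source_format='AVRO',
        write_disposition='WRITE_EMPTY',
        autodetect=True)

    # Chain the per-table tasks: export to GCS, copy across buckets, then
    # load into the destination dataset.
    BQ_to_GCS >> GCS_to_GCS >> GCS_to_BQ
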
"useLegacySql": False, "destinationTable": { "projectId": project_id, "datasetId": bq_dataset_name, "tableId": bq_recent_questions_table_id } } }, location=location, ) # [END composer_bigquery] # Export query result to Cloud Storage. export_questions_to_gcs = bigquery_to_gcs.BigQueryToGCSOperator( task_id='export_recent_questions_to_gcs', source_project_dataset_table= f"{project_id}.{bq_dataset_name}.{bq_recent_questions_table_id}", destination_cloud_storage_uris=[output_file], export_format='CSV') # Perform most popular question query. bq_most_popular_query = bigquery.BigQueryInsertJobOperator( task_id="bq_most_popular_question_query", configuration={ "query": { "query": MOST_POPULAR_QUERY, "useLegacySql": False, "destinationTable": { "projectId": project_id, "datasetId": bq_dataset_name, "tableId": bq_most_popular_table_id }