Example #1
def _transfer_bigquery_to_gcs(dag: models.DAG,
                              task_id: str) -> models.BaseOperator:
  """Pipeline to transfer the final output table to GCS.

  Args:
    dag: The DAG to add this operator to.
    task_id: ID for this specific task within the DAG.

  Returns:
    Operator to use within a DAG to run the Pipeline for moving records to GCS.
  """
  storage_vars = airflow_utils.retrieve_airflow_variable_as_dict(
      blockbuster_constants.BLOCKBUSTER_STORAGE_CONFIG)

  final_output_uri = '{path}/result-{timestamp}-*.json'.format(
      path=storage_vars['gcs_output_path'],
      timestamp=int(time.time()))

  final_output_table = '{project}.{dataset}.final_output'.format(
      project=storage_vars['bq_working_project'],
      dataset=storage_vars['bq_working_dataset'])

  return bigquery_to_gcs.BigQueryToGCSOperator(
      task_id=task_id,
      source_project_dataset_table=final_output_table,
      destination_cloud_storage_uris=[final_output_uri],
      export_format='NEWLINE_DELIMITED_JSON',
      dag=dag)
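The helper above only builds the export operator. A minimal usage sketch, assuming the helper is importable and using illustrative DAG and task ids that are not part of the original module:

import datetime

from airflow import models

with models.DAG(
    dag_id='blockbuster_export_example',  # illustrative DAG id
    start_date=datetime.datetime(2024, 1, 1),
    schedule_interval=None) as dag:
  transfer_final_output = _transfer_bigquery_to_gcs(
      dag=dag, task_id='transfer_final_output_to_gcs')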
Example #2
    # Get the table list from the master file
    all_records = read_table_list(table_list_file_path)

    # Loop over each record in the 'all_records' python list to build up
    # Airflow tasks
    for record in all_records:
        logger.info('Generating tasks to transfer table: {}'.format(record))

        table_source = record['table_source']
        table_dest = record['table_dest']

        BQ_to_GCS = bigquery_to_gcs.BigQueryToGCSOperator(
            # Replace ":" with valid character for Airflow task
            task_id='{}_BQ_to_GCS'.format(table_source.replace(":", "_")),
            source_project_dataset_table=table_source,
            destination_cloud_storage_uris=[
                '{}-*.avro'.format('gs://' + source_bucket + '/' +
                                   table_source)
            ],
            export_format='AVRO')

        GCS_to_GCS = gcs_to_gcs.GCSToGCSOperator(
            # Replace ":" with valid character for Airflow task
            task_id='{}_GCS_to_GCS'.format(table_source.replace(":", "_")),
            source_bucket=source_bucket,
            source_object='{}-*.avro'.format(table_source),
            destination_bucket=dest_bucket,
            # destination_object='{}-*.avro'.format(table_dest)
        )

        GCS_to_BQ = gcs_to_bigquery.GCSToBigQueryOperator(
            # The original snippet is truncated at this call; the arguments
            # below are a plausible completion that loads the copied Avro
            # files from the destination bucket into the destination table.
            # Replace ":" with valid character for Airflow task
            task_id='{}_GCS_to_BQ'.format(table_dest.replace(":", "_")),
            bucket=dest_bucket,
            source_objects=['{}-*.avro'.format(table_source)],
            destination_project_dataset_table=table_dest,
            source_format='AVRO',
            autodetect=True,
            write_disposition='WRITE_TRUNCATE')
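In this copy-across-locations pattern, the three tasks built for each table are typically chained so that the export, bucket-to-bucket copy, and load run in order; a minimal sketch of that ordering inside the loop:

        # Export -> copy -> load for the current table.
        BQ_to_GCS >> GCS_to_GCS >> GCS_to_BQ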
Example #3
                "useLegacySql": False,
                "destinationTable": {
                    "projectId": project_id,
                    "datasetId": bq_dataset_name,
                    "tableId": bq_recent_questions_table_id
                }
            }
        },
        location=location,
    )
    # [END composer_bigquery]

    # Export query result to Cloud Storage.
    export_questions_to_gcs = bigquery_to_gcs.BigQueryToGCSOperator(
        task_id='export_recent_questions_to_gcs',
        source_project_dataset_table=(
            f"{project_id}.{bq_dataset_name}.{bq_recent_questions_table_id}"),
        destination_cloud_storage_uris=[output_file],
        export_format='CSV')
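The output_file above is just a GCS destination URI, and in the full DAG the export normally runs only after the query task whose assignment is cut off at the top of this excerpt. A hedged sketch, with bucket_name and bq_recent_questions_query as assumed names (the latter chosen by analogy with bq_most_popular_query below):

    # Assumed earlier in the DAG file; bucket_name is an illustrative variable.
    output_file = f'gs://{bucket_name}/recent_questions.csv'

    # Export only after the query that writes the recent-questions table.
    bq_recent_questions_query >> export_questions_to_gcs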

    # Perform most popular question query.
    bq_most_popular_query = bigquery.BigQueryInsertJobOperator(
        task_id="bq_most_popular_question_query",
        configuration={
            "query": {
                "query": MOST_POPULAR_QUERY,
                "useLegacySql": False,
                "destinationTable": {
                    "projectId": project_id,
                    "datasetId": bq_dataset_name,
                    "tableId": bq_most_popular_table_id
                }