def test_execute(self, mock_hook):
    source_project_dataset_table = '{}.{}'.format(TEST_DATASET, TEST_TABLE_ID)
    destination_cloud_storage_uris = ['gs://some-bucket/some-file.txt']
    compression = 'NONE'
    export_format = 'CSV'
    field_delimiter = ','
    print_header = True
    labels = {'k1': 'v1'}

    operator = BigQueryToCloudStorageOperator(
        task_id=TASK_ID,
        source_project_dataset_table=source_project_dataset_table,
        destination_cloud_storage_uris=destination_cloud_storage_uris,
        compression=compression,
        export_format=export_format,
        field_delimiter=field_delimiter,
        print_header=print_header,
        labels=labels
    )

    operator.execute(None)

    mock_hook.return_value \
        .get_conn.return_value \
        .cursor.return_value \
        .run_extract \
        .assert_called_once_with(
            source_project_dataset_table=source_project_dataset_table,
            destination_cloud_storage_uris=destination_cloud_storage_uris,
            compression=compression,
            export_format=export_format,
            field_delimiter=field_delimiter,
            print_header=print_header,
            labels=labels
        )
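The test above receives a patched hook as mock_hook, but the decorator and fixtures sit outside the snippet. A minimal sketch of how the surrounding harness might look, assuming the Airflow 1.10 contrib module layout; the patch target, TASK_ID, TEST_DATASET, and TEST_TABLE_ID values are illustrative.

from unittest import TestCase, mock

from airflow.contrib.operators.bigquery_to_gcs import BigQueryToCloudStorageOperator

# Illustrative constants; the real values live elsewhere in the test module.
TASK_ID = 'test-bq-to-gcs'
TEST_DATASET = 'test-dataset'
TEST_TABLE_ID = 'test-table-id'


class TestBigQueryToCloudStorageOperator(TestCase):
    # Patch the hook where the operator module imports it, so the operator
    # under test receives the mock instead of a real BigQueryHook.
    @mock.patch('airflow.contrib.operators.bigquery_to_gcs.BigQueryHook')
    def test_execute(self, mock_hook):
        ...  # body as in the example above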
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_extract_firefox_{}_v1".format(channel)
    etl_query = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace",),
        sql_file_path="sql/moz-fx-data-shared-prod/{}/glam_client_probe_counts_extract_v1/query.sql".format(dataset_id),
        parameters=("channel:STRING:{}".format(channel),),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="aggs-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{bucket}/aggs-desktop-{channel}-*.csv".format(
        bucket=glam_bucket, channel=channel)

    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(
            project_id, dataset_id, bq_extract_table),
        # the operator expects a list of destination URIs
        destination_cloud_storage_uris=[gcs_destination],
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
def extract_channel_subdag(
    parent_dag_name,
    child_dag_name,
    default_args,
    schedule_interval,
    dataset_id,
    channel,
):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_client_probe_counts_{}_extract_v1".format(channel)
    glam_client_probe_counts_extract = bigquery_etl_query(
        task_id="glam_client_probe_counts_{}_extract".format(channel),
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace",),
        dag=dag,
    )

    glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_old_{}_extracts".format(channel),
        bucket_name=glam_bucket,
        prefix="extract-desktop-{}".format(channel),
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/extract-desktop-{}-*.csv".format(
        glam_bucket, channel)

    glam_extract_to_csv = BigQueryToCloudStorageOperator(
        task_id="glam_extract_{}_to_csv".format(channel),
        source_project_dataset_table="{}.{}.{}".format(
            project_id, dataset_id, bq_extract_table),
        # the operator expects a list of destination URIs
        destination_cloud_storage_uris=[gcs_destination],
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    glam_client_probe_counts_extract >> glam_gcs_delete_old_extracts >> glam_extract_to_csv

    return dag
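These builders return a DAG that is meant to be mounted as a subdag of a parent DAG. A hedged sketch of how extract_channel_subdag might be registered; the parent dag, default_args, dataset_id, and channel values are placeholders assumed to exist in the parent DAG file.

from airflow.operators.subdag_operator import SubDagOperator

extract_release = SubDagOperator(
    subdag=extract_channel_subdag(
        parent_dag_name=dag.dag_id,          # parent DAG defined elsewhere
        child_dag_name="extract_release",    # must match the task_id below
        default_args=default_args,
        schedule_interval=dag.schedule_interval,
        dataset_id="telemetry",              # illustrative dataset
        channel="release",                   # illustrative channel
    ),
    task_id="extract_release",
    dag=dag,
)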
def extract_user_counts(parent_dag_name, child_dag_name, default_args,
                        schedule_interval, dataset_id):
    dag = DAG(
        dag_id="{}.{}".format(parent_dag_name, child_dag_name),
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    bq_extract_table = "glam_user_counts_extract_v1"
    etl_query = bigquery_etl_query(
        task_id="glam_user_counts_extract",
        destination_table=bq_extract_table,
        dataset_id=dataset_id,
        project_id=project_id,
        owner="*****@*****.**",
        email=[
            "*****@*****.**",
            "*****@*****.**",
            "*****@*****.**",
        ],
        date_partition_parameter=None,
        arguments=("--replace",),
        dag=dag,
    )

    gcs_delete = GoogleCloudStorageDeleteOperator(
        task_id="glam_gcs_delete_count_extracts",
        bucket_name=glam_bucket,
        prefix="glam-extract-firefox-counts",
        google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
        dag=dag,
    )

    gcs_destination = "gs://{}/glam-extract-firefox-counts.csv".format(glam_bucket)

    bq2gcs = BigQueryToCloudStorageOperator(
        task_id="glam_extract_user_counts_to_csv",
        source_project_dataset_table="{}.{}.{}".format(
            project_id, dataset_id, bq_extract_table),
        # the operator expects a list of destination URIs
        destination_cloud_storage_uris=[gcs_destination],
        bigquery_conn_id=gcp_conn.gcp_conn_id,
        export_format="CSV",
        print_header=False,
        dag=dag,
    )

    etl_query >> gcs_delete >> bq2gcs

    return dag
def get_export_table_operator(table_name, dag=None):
    """Get templated BigQueryToCloudStorageOperator.

    Args:
        table_name (string): Name of the table to export.
        dag (airflow.models.DAG): DAG used by context_manager.
            e.g. `with get_dag() as dag: get_export_table_operator(..., dag=dag)`.
            Defaults to None.

    Returns:
        airflow.contrib.operators.bigquery_to_gcs.BigQueryToCloudStorageOperator
    """
    if dag is None:
        logger.warning(
            'No DAG context was found. The operator may not be associated '
            'with any DAG and may not appear in the Web UI')

    date_descriptor = '{{ ds_nodash }}'
    table_name_with_date_descriptor = \
        '{table_name}{date_descriptor}'.format(
            table_name=table_name,
            date_descriptor=date_descriptor)
    return BigQueryToCloudStorageOperator(
        dag=dag or models._CONTEXT_MANAGER_DAG,
        task_id='{experiment_name}.{table_name}.export'.format(
            experiment_name=get_config('experiment_name'),
            table_name=table_name),
        source_project_dataset_table=
        '{gcp_project_name}.{database_name}.{table_name}'.format(
            gcp_project_name=get_config('gcp_project_name'),
            database_name='%s_database' % get_config('experiment_name'),
            table_name=table_name_with_date_descriptor),
        # TODO: support exports larger than 1 GB
        # https://cloud.google.com/bigquery/exporting-data-from-bigquery#exportingmultiple
        destination_cloud_storage_uris=[
            'gs://{bucket_name}/{experiment_name}/exported_tables/'
            '{table_name}/{date_descriptor}/'
            'out.csv.gzip'.format(
                bucket_name=get_config('bucket_name'),
                experiment_name=get_config('experiment_name'),
                date_descriptor=date_descriptor,
                table_name=table_name)
        ],
        compression="GZIP")
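Per the docstring, the helper is meant to be called inside a DAG context manager. A minimal usage sketch, where get_dag() and the 'events' table name are placeholders taken from the docstring's example.

with get_dag() as dag:
    # One export task per table; the task_id and destination URI are
    # derived from the config and the templated {{ ds_nodash }} date.
    export_events = get_export_table_operator('events', dag=dag)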
    sql=queries.write_weight_data_to_bq,
    destination_dataset_table=cfg.BQ_TABLE_WEIGHT,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    use_legacy_sql=False)

t4 = GoogleCloudStorageToBigQueryOperator(
    task_id='write_route_data_to_bq',
    bucket=cfg.BUCKET,
    source_objects=[cfg.SOURCE_OBJECT],
    field_delimiter=';',
    destination_project_dataset_table=cfg.BQ_TABLE_ROUTE,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    skip_leading_rows=1)

t5 = BigQueryOperator(task_id='prepare_and_merge_data',
                      sql=queries.prepare_and_merge_data,
                      use_legacy_sql=False,
                      destination_dataset_table=cfg.BQ_TABLE_MERGE,
                      create_disposition='CREATE_IF_NEEDED',
                      write_disposition='WRITE_TRUNCATE')

t6 = BigQueryToCloudStorageOperator(
    task_id='export_results_to_gcs',
    source_project_dataset_table=cfg.BQ_TABLE_MERGE,
    destination_cloud_storage_uris=cfg.DESTINATION_URI,
    export_format='CSV')

t1 >> t2 >> [t3, t4] >> t5 >> t6
def export_to_amplitude(
        parent_dag_name,
        dag_name,
        default_args,
        project,
        dataset,
        table_or_view,
        s3_prefix,
        gcs_bucket='moz-fx-data-derived-datasets-amplitude-export',
        gcp_conn_id='google_cloud_derived_datasets',
        amplitude_s3_conn='amplitude_s3_conn',
        amplitude_s3_bucket='com-amplitude-vacuum-mozilla-vacuum-wup'):
    """Export a bigquery table or view to Amplitude.

    This uses the BigQueryToCloudStorage operator to export the
    partition to GCS, then pushes that data to S3. It operates on a
    temporary table that is dropped after the job is finished.

    :param str parent_dag_name: Parent dag name
    :param str dag_name: This dag's name (appended to parent_dag_name)
    :param str default_args: DAG configuration
    :param str project: BigQuery project containing the table to be exported
    :param str dataset: BigQuery dataset
    :param str table_or_view: Table or view name
    :param str gcs_bucket: The bucket the data will be exported to
    :param str gcp_conn_id: GCP connection ID
    :param str amplitude_s3_conn: S3 connection ID
    :param str amplitude_s3_bucket: The bucket to export data to
    :param str s3_prefix: The prefix for the s3 objects
    """
    environment = environ['DEPLOY_ENVIRONMENT']
    _dag_name = '{}.{}'.format(parent_dag_name, dag_name)

    with models.DAG(_dag_name, default_args=default_args) as dag:
        # For now, we assume the view is already updated
        # See https://github.com/mozilla/bigquery-etl/issues/218
        exec_date = '{{ ds }}'

        # Check that we have data for this date
        check_sql = (
            'SELECT COUNT(*) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        wait_for_data = BigQuerySQLSensorOperator(
            task_id='wait_for_data',
            sql=check_sql,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Create the table with yesterday's data
        project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id
        temp_table_name = table_or_view + '_{{ ds_nodash }}'
        fully_qualified_table_name = '{}.{}.{}'.format(
            project_id, dataset, temp_table_name)

        sql = (
            'SELECT * EXCEPT (submission_timestamp) '
            'FROM `{}.{}.{}` '
            'WHERE DATE(submission_timestamp) = "{}"'
        ).format(project, dataset, table_or_view, exec_date)

        create_table = BigQueryOperator(
            task_id='create_temporary_table',
            sql=sql,
            destination_dataset_table=fully_qualified_table_name,
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        directory = '/'.join((environment, s3_prefix, '{{ ds_nodash }}'))
        extension = '.tsv.gz'

        # Export from bq to gcs
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
        gcs_uri = 'gs://{}/{}/*{}'.format(gcs_bucket, directory, extension)
        table_extract = BigQueryToCloudStorageOperator(
            task_id='bq_to_gcs',
            source_project_dataset_table=fully_qualified_table_name,
            destination_cloud_storage_uris=[gcs_uri],
            bigquery_conn_id=gcp_conn_id,
            compression='GZIP',
            export_format='CSV',
            field_delimiter='\t',
            print_header=True
        )

        # Push the data to S3
        # Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29 # noqa: E501
        s3_push = GoogleCloudStorageToS3Operator(
            task_id='gcs_to_s3',
            bucket=gcs_bucket,
            prefix=directory,
            delimiter=extension,
            google_cloud_storage_conn_id=gcp_conn_id,
            dest_aws_conn_id=amplitude_s3_conn,
            dest_s3_key='s3://{}/'.format(amplitude_s3_bucket),
            replace=True
        )

        # Drop the temporary table
        table_drop = BigQueryOperator(
            task_id='drop_temp_table',
            sql='DROP TABLE `{}`'.format(fully_qualified_table_name),
            bigquery_conn_id=gcp_conn_id,
            use_legacy_sql=False
        )

        # Delete the GCS data
        data_delete = GoogleCloudStorageDeleteOperator(
            task_id='delete_gcs_data',
            bucket_name=gcs_bucket,
            prefix=directory,
            gcp_conn_id=gcp_conn_id
        )

        wait_for_data >> create_table >> table_extract >> s3_push
        s3_push >> table_drop
        s3_push >> data_delete

        return dag
project_id = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id).project_id

gcs_bucket = 'moz-fx-data-derived-datasets-amplitude-export'
directory = environment + '/fenix/{{ ds_nodash }}/'
extension = '.tsv.gz'

# Export from bq to gcs
# Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28
table_extract = BigQueryToCloudStorageOperator(
    task_id='bq_to_gcs',
    source_project_dataset_table='{project}.telemetry.{table}'.format(
        project=project_id, table=table_name),
    destination_cloud_storage_uris=[
        'gs://{bucket}/{dir}*{ext}'.format(
            bucket=gcs_bucket, dir=directory, ext=extension)
    ],
    bigquery_conn_id=gcp_conn_id,
    compression='GZIP',
    export_format='CSV',
    field_delimiter='\t',
    print_header=True)

# Push the data to S3
# Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/gcs_to_s3.py#L29
s3_push = GoogleCloudStorageToS3Operator(
    task_id='gcs_to_s3',
    bucket=gcs_bucket,
    prefix=directory,
    delimiter=extension,
    google_cloud_storage_conn_id=gcp_conn_id,
write_disposition="WRITE_TRUNCATE", # specify to truncate on writes #ADD YOUR CODE HERE ) train_files = BUCKET + "/chicago_taxi/data/train/" valid_files = BUCKET + "/chicago_taxi/data/valid/" # Tasks to export the results of the previous BigQueryOperators to # Cloud Storage to stage for later AI Platform Training job. bq_export_train_csv_op = BigQueryToCloudStorageOperator( task_id="bq_export_gcs_train_csv_task", source_project_dataset_table="{}.{}_train_data" .format(DESTINATION_DATASET, model.replace(".", "_")), destination_cloud_storage_uris=[train_files + "{}/train-*.csv" .format(model.replace(".", "_"))], export_format="CSV", print_header=False, dag=dag ) bq_export_valid_csv_op = BigQueryToCloudStorageOperator( task_id="bq_export_gcs_valid_csv_task", source_project_dataset_table="{}.{}_valid_data" .format(DESTINATION_DATASET, model.replace(".", "_")), destination_cloud_storage_uris=[valid_files + "{}/valid-*.csv" .format(model.replace(".", "_"))], export_format="CSV", print_header=False,
# Reference: https://airflow.apache.org/integration.html#bigqueryoperator
t1 = BigQueryOperator(
    task_id='bq_rec_training_data',
    bql=bql,
    destination_dataset_table='%s.recommendation_events' % DATASET,
    write_disposition='WRITE_TRUNCATE',  # specify to truncate on writes
    dag=dag)

# BigQuery training data export to GCS
# TODO: Fill in the missing operator name for task #2 which
# takes a BigQuery dataset and table as input and exports it to GCS as a CSV
training_file = BUCKET + '/data/recommendation_events.csv'
t2 = BigQueryToCloudStorageOperator(
    task_id='bq_export_op',
    source_project_dataset_table='%s.recommendation_events' % DATASET,
    destination_cloud_storage_uris=[training_file],
    export_format='CSV',
    dag=dag)

# ML Engine training job
job_id = 'recserve_{0}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M'))
job_dir = BUCKET + '/jobs/' + job_id
output_dir = BUCKET
training_args = [
    '--job-dir', job_dir,
    '--train-files', training_file,
    '--output-dir', output_dir,
    '--data-type', 'web_views',
    '--use-optimized'
]

# TODO: Fill in the missing operator name for task #3 which will
# start a new training job to Cloud ML Engine
with TemporaryDirectory(prefix='airflowtmp') as tmp_dir:
    avro_dir = tmp_dir + "/avro"
    data_path = tmp_dir + "/data.dat"

    convert_to_array = BigQueryOperator(
        task_id="convert_to_array",
        sql=bq_sql,
        destination_dataset_table=bq_destination_table,
        write_disposition="WRITE_TRUNCATE",
        use_legacy_sql=False,
        dag=dag)

    cleanup_gcs = BashOperator(task_id="cleanup_gcs",
                               bash_command=f"gsutil -m rm {output_uri}")

    export_questions_to_gcs = BigQueryToCloudStorageOperator(
        task_id="export_array_to_gcs",
        source_project_dataset_table=bq_destination_table,
        destination_cloud_storage_uris=[output_uri],
        export_format="Avro")

    download_avro = BashOperator(
        task_id="download_avro",
        # copy into the local avro directory (interpolated, not the literal "avro_dir")
        bash_command=f"gsutil -m cp {output_uri} {avro_dir}")

    count = BigQueryCountOperator(task_id="get_count",
                                  dataset="consumption_test",
                                  table="tmp_out_table",
                                  dag=dag)

    cmd = (venv_python + " " + dag_dir + "/src/convert_to_memmap.py" +
           ' --rows {{ ti.xcom_pull(task_ids="get_count") }}' +
           " --target-path " + avro_dir + " --target-attr values" +
           " --output-path " + data_path)
    convert = BashOperator(task_id="convert_avro", bash_command=cmd)
        WHERE DATE(timestamp) <= DATE({{ ds }})
        GROUP BY movieid
        ORDER BY avg_rating DESC
    """,
    write_disposition="WRITE_TRUNCATE",
    create_disposition="CREATE_IF_NEEDED",
    bigquery_conn_id="gcp",
    dag=dag,
)

extract_top_ratings = BigQueryToCloudStorageOperator(
    task_id="extract_top_ratings",
    source_project_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                                  os.environ["BIGQUERY_DATASET"] + "." +
                                  "rating_results_{{ ds_nodash }}"),
    # the operator expects a list of destination URIs
    destination_cloud_storage_uris=["gs://" + os.environ["RESULT_BUCKET"] +
                                    "/{{ ds_nodash }}.csv"],
    export_format="CSV",
    bigquery_conn_id="gcp",
    dag=dag,
)

delete_result_table = BigQueryTableDeleteOperator(
    task_id="delete_result_table",
    deletion_dataset_table=(os.environ["GCP_PROJECT"] + ":" +
                            os.environ["BIGQUERY_DATASET"] + "." +
                            "rating_results_{{ ds_nodash }}"),
    bigquery_conn_id="gcp",
    dag=dag,
)
        schedule_interval=timedelta(days=1),
        default_args=default_args) as dag:

    bq_extract_one_day = BigQueryOperator(
        task_id='bq_extract_one_day',
        bql='gcp_smoke/gsob_extract_day.sql',
        destination_dataset_table='{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}',
        write_disposition='WRITE_TRUNCATE',
        bigquery_conn_id='gcp_smoke',
        use_legacy_sql=False)

    bq2gcp_avro = BigQueryToCloudStorageOperator(
        task_id='bq2gcp_avro',
        source_project_dataset_table='{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}',
        destination_cloud_storage_uris=[
            'gs://{{var.value.gcs_bucket}}/{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/{{ ds_nodash }}/part-*.avro'
        ],
        export_format='AVRO',
        bigquery_conn_id='gcp_smoke',
    )

    bq2gcp_override = BigQueryToCloudStorageOperator(
        task_id='bq2gcp_override',
        source_project_dataset_table='{{var.value.gcq_dataset}}.gsod_partition{{ ds_nodash }}',
        destination_cloud_storage_uris=[
            'gs://{{var.value.gcs_bucket}}/{{var.value.gcs_root}}/gcp_smoke_bq/bq_to_gcp_avro/99999999/part-*.avro'
        ],
        export_format='AVRO',
        bigquery_conn_id='gcp_smoke',
    )
    use_legacy_sql=False,
    write_disposition="WRITE_TRUNCATE",
)

directory = "/".join((GCS_PREFIX, "{{ ds_nodash }}"))
extension = ".csv"

# Export from bq to gcs
# Docs: https://github.com/apache/airflow/blob/master/airflow/contrib/operators/bigquery_to_gcs.py#L28 # noqa: E501
gcs_uri = "gs://{}/{}/*{}".format(GCS_BUCKET, directory, extension)
table_extract = BigQueryToCloudStorageOperator(
    task_id="bq_to_gcs",
    source_project_dataset_table=fully_qualified_table_name,
    destination_cloud_storage_uris=[gcs_uri],
    bigquery_conn_id=gcp_conn_id,
    compression="GZIP",
    export_format="CSV",
    field_delimiter=",",
    print_header=False,
)

# Drop the temporary table
table_drop = BigQueryOperator(
    task_id="drop_temp_table",
    sql="DROP TABLE `{}`".format(fully_qualified_table_name),
    bigquery_conn_id=gcp_conn_id,
    use_legacy_sql=False,
)

create_table >> table_extract >> table_drop
    date_partition_parameter=None,
    arguments=('--replace',),
    dag=dag)

glam_gcs_delete_old_extracts = GoogleCloudStorageDeleteOperator(
    task_id="glam_gcs_delete_old_extracts",
    bucket_name=glam_bucket,
    prefix="extract-",
    google_cloud_storage_conn_id=gcp_conn.gcp_conn_id,
    dag=dag)

gcs_destination = "{}/extract-*.csv".format(glam_bucket)

glam_extract_to_csv = BigQueryToCloudStorageOperator(
    task_id="glam_extract_to_csv",
    source_project_dataset_table="glam_client_probe_counts_extract_v1",
    # the operator expects a list of destination URIs
    destination_cloud_storage_uris=[gcs_destination],
    export_format="CSV",
    print_header=False,
    dag=dag)

wait_for_main_ping >> latest_versions

latest_versions >> clients_daily_scalar_aggregates
clients_daily_scalar_aggregates >> clients_daily_keyed_scalar_aggregates
clients_daily_scalar_aggregates >> clients_daily_keyed_boolean_aggregates
clients_daily_keyed_boolean_aggregates >> clients_scalar_aggregates
clients_daily_keyed_scalar_aggregates >> clients_scalar_aggregates
clients_scalar_aggregates >> clients_scalar_bucket_counts
clients_scalar_aggregates >> scalar_percentiles

latest_versions >> clients_daily_histogram_aggregates
def build_dag():
    """Build DAG."""
    dag = DAG('btc_to_neo4j',
              schedule_interval='@daily',
              default_args=DEFAULT_ARGS,
              catchup=True)

    # NOTE: It is important to keep the elements of this list in this order,
    # since it is required later when loading data
    blockchain_elements = [
        'blocks', 'txns', 'outputs', 'output_addresses', 'inputs'
    ]
    load_dependency = None
    for element in blockchain_elements:
        table = 'crypto_bitcoin.{element}'.format(element=element) + '_{{ds_nodash}}'
        bigquery_to_daily_table_task = BigQueryOperator(
            task_id='{element}_to_daily_table'.format(element=element),
            sql='bigquery/{element}.sql'.format(element=element),
            destination_dataset_table=table,
            write_disposition='WRITE_TRUNCATE',
            use_legacy_sql=False,
            dag=dag)

        filename = '{element}/{element}-*.csv'.format(element=element)
        destination_pattern = 'gs://{bucket}'.format(bucket=BUCKET) + \
            '/neo4j_import/{{macros.ds_format(ds, "%Y-%m-%d", "%Y/%m/%d")}}/' + filename

        table_to_bucket_task = BigQueryToCloudStorageOperator(
            task_id='{element}_table_to_bucket'.format(element=element),
            source_project_dataset_table=table,
            destination_cloud_storage_uris=[destination_pattern],
            export_format='csv',
            field_delimiter=',',
            print_header=True,
            dag=dag)

        load_into_neo4j_task = PythonOperator(
            task_id="load_{element}_into_neo4j".format(element=element),
            python_callable=load_into_neo4j,
            provide_context=True,
            op_kwargs={'element': element},
            pool='neo4j_slot',
            dag=dag)

        # NOTE: timestamps in blocks are not strictly incremental and since we
        # query by dates it could happen that we need to backfill some relations.
        # See: https://bitcoin.stackexchange.com/questions/67618/difference-between-time-and-mediantime-in-getblock
        if element == 'blocks':
            backfill_blocks_in_neo4j_task = PythonOperator(
                task_id="backfill_blocks_in_neo4j",
                python_callable=backfill_blocks_in_neo4j,
                provide_context=True,
                pool='neo4j_slot',
                dag=dag)
            load_into_neo4j_task >> backfill_blocks_in_neo4j_task

        delete_aux_table = BigQueryTableDeleteOperator(
            task_id='delete_{element}_table'.format(element=element),
            deletion_dataset_table=table,
            dag=dag)

        bigquery_to_daily_table_task >> table_to_bucket_task >> load_into_neo4j_task
        table_to_bucket_task >> delete_aux_table

        # Make sure that we load data into Neo4j in the right order
        if load_dependency is not None:
            load_dependency >> load_into_neo4j_task
        load_dependency = load_into_neo4j_task

    return dag
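The Airflow scheduler only discovers DAG objects bound at module level of a file in the DAGs folder, so a builder like this is typically invoked once at import time. A minimal sketch:

# Module-level assignment so the scheduler picks up the DAG.
dag = build_dag()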
create_tmp_table = BigQueryOperator(
    task_id='create_tmp_table',
    sql=QUERY,
    use_legacy_sql=False,
    destination_dataset_table='{}.Temporal.cloudSQLexport_tmp'.format(
        cfg.PROJECT_ID),
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    bigquery_conn_id=cfg.bigquery_conn_id)

create_tmp_csv = BigQueryToCloudStorageOperator(
    task_id='create_tmp_csv',
    source_project_dataset_table='{}.Temporal.cloudSQLexport_tmp'.format(
        BQ_PROJECT),
    # the operator expects a list of destination URIs
    destination_cloud_storage_uris=['{}/cloudSQLexport_temp.csv'.format(DIR_TMP)],
    export_format='CSV',
    print_header=False,
    bigquery_conn_id=cfg.bigquery_conn_id)

import_to_csql = CloudSqlInstanceImportOperator(
    task_id='import_to_csql',
    project_id=SQL_PROJECT,
    instance='servicedat-cal-mysql',
    body={
        "importContext": {
            "kind": "sql#importContext",
            "fileType": 'CSV',
            "uri": '{}/cloudSQLexport_temp.csv'.format(DIR_TMP),
            "database": DATABASE,
yesterday = datetime.combine(datetime.today() - timedelta(1),
                             datetime.min.time())

default_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5)
}


def get_composer_gcs_bucket():
    return os.environ['GCS_BUCKET']


with DAG(dag_id='21_bigquery_operators',
         catchup=False,
         schedule_interval=timedelta(days=1),
         default_args=default_args,
         user_defined_macros={
             'get_composer_gcs_bucket': get_composer_gcs_bucket,
         }) as dag:

    bq_to_gcs = BigQueryToCloudStorageOperator(
        task_id='bq_to_gcs',
        source_project_dataset_table='bigquery-public-data:samples.gsod',
        destination_cloud_storage_uris=[
            'gs://{{ get_composer_gcs_bucket() }}/data/21_bigquery_operators/{{ execution_date }}/gsod-*.csv'
        ])
    SELECT
        user
        , store_id
        , CASE WHEN store_id IS NOT NULL THEN 1 ELSE NULL END AS rating
    FROM transactions
    JOIN stores ON transactions.store_id = stores.id
    WHERE user IN (SELECT user FROM users)
        AND store_id IN (SELECT id FROM stores)
    GROUP BY user, store_id
    ''')

t2 = BigQueryToCloudStorageOperator(
    task_id='bq_to_gcs_store_recs_train',
    source_project_dataset_table='data_science.store_recs_train',
    destination_cloud_storage_uris=['gs://store_recs/development/training/{{ ds }}/store_recs_train_*.json'],
    export_format='NEWLINE_DELIMITED_JSON',
    delegate_to='*****@*****.**',
    dag=dag
)

t3 = DataprocClusterCreateOperator(
    task_id='create_cluster',
    project_id='my_project',
    cluster_name='cluster-1',
    num_workers=25,
    storage_bucket='data-science',
    init_actions_uris=['gs://initialization-actions/create-my-cluster.sh'],
    properties={"spark:spark.executor.instances": "74",
                "spark:spark.yarn.executor.memoryOverhead": "2048",
                "spark:spark.yarn.driver.memoryOverhead": "3072",
                "spark:spark.executor.cores": "5",
    # detected in the Cloud Storage bucket.
    'start_date': yesterday,
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 0,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with DAG(
        dag_id='table_snapshot',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    bq_table_snapshot = BigQueryToCloudStorageOperator(
        task_id='bq_table_snapshot',
        source_project_dataset_table='MyProjectID.DATA_LAKE.table_name',
        destination_cloud_storage_uris=[destination_cloud_storage_uris1],
        compression='Snappy',
        export_format='Avro',
        bigquery_conn_id='bigquery_default')

    start >> bq_table_snapshot >> end
        'project': project_id,
        'region': 'europe-west1',
        'staging_location': 'gs://' + project_id + '-dataflow/staging/',
        'temp_location': 'gs://' + project_id + '-dataflow/temp/'
    }
}

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
        'hourly_data_transfer',
        schedule_interval='0 * * * *',
        default_args=default_dag_args) as dag:

    run_dataflow = DataFlowPythonOperator(
        py_file='gs://' + project_id + '-dataflow/pipelines/batch_pipeline.py',
        task_id='run_Dataflow_from_BQ_to_Datastore',
        dataflow_default_options=default_dag_args['dataflow_default_options']
    )

    bq_to_gcs = BigQueryToCloudStorageOperator(
        task_id='export_stream_data_from_BQ',
        source_project_dataset_table='my_dataset.stream_data',
        destination_cloud_storage_uris=['gs://' + project_id + '-data-export/stream_data.csv'],
        export_format='CSV')

    # Define DAG dependencies.
    run_dataflow
    bq_to_gcs
def export_to_parquet(
    table,
    destination_table=None,
    static_partitions=[],
    arguments=[],
    use_storage_api=False,
    dag_name="export_to_parquet",
    parent_dag_name=None,
    default_args=None,
    aws_conn_id="aws_dev_iam_s3",
    gcp_conn_id="google_cloud_derived_datasets",
    dataproc_zone="us-central1-a",
    dataproc_storage_bucket="moz-fx-data-derived-datasets-parquet",
    num_workers=2,
    num_preemptible_workers=0,
    gcs_output_bucket="moz-fx-data-derived-datasets-parquet",
    s3_output_bucket="telemetry-parquet",
):
    """
    Export a BigQuery table to Parquet.

    https://github.com/mozilla/bigquery-etl/blob/master/script/pyspark/export_to_parquet.py

    :param str table: [Required] BigQuery table name
    :param Optional[str] destination_table: Output table name, defaults to table,
        will have r'_v[0-9]+$' replaced with r'/v[0-9]+'
    :param List[str] arguments: Additional pyspark arguments
    :param bool use_storage_api: Whether to read from the BigQuery Storage API or an AVRO export
    :param str dag_name: Name of DAG
    :param Optional[str] parent_dag_name: Parent DAG name
    :param Optional[Dict[str, Any]] default_args: DAG configuration
    :param str gcp_conn_id: Airflow connection id for GCP access
    :param str dataproc_storage_bucket: Dataproc staging GCS bucket
    :param str dataproc_zone: GCP zone to launch dataproc clusters
    :param int num_preemptible_workers: Number of Dataproc preemptible workers

    :return: airflow.models.DAG
    """
    # remove the dataset prefix and partition suffix from table
    table_id = table.rsplit(".", 1)[-1]
    unqualified_table, _, partition_id = table_id.partition("$")

    # limit cluster name to 35 characters plus suffix of -export-YYYYMMDD (51 total)
    cluster_name = unqualified_table.replace("_", "-")
    if len(cluster_name) > 35:
        # preserve version when truncating cluster name to 42 characters
        prefix, version = re.match(r"(.*?)(-v[0-9]+)?$", cluster_name).groups("")
        cluster_name = prefix[:35 - len(version)] + version
    cluster_name += "-export-{{ ds_nodash }}"

    dag_prefix = parent_dag_name + "." if parent_dag_name else ""
    connection = GoogleCloudBaseHook(gcp_conn_id=gcp_conn_id)

    if destination_table is None:
        destination_table = unqualified_table
    # separate version using "/" instead of "_"
    export_prefix = re.sub(r"_(v[0-9]+)$", r"/\1", destination_table) + "/"
    if static_partitions:
        export_prefix += "/".join(static_partitions) + "/"
    avro_prefix = "avro/" + export_prefix
    if not static_partitions and partition_id:
        avro_prefix += "partition_id=" + partition_id + "/"
    avro_path = "gs://" + gcs_output_bucket + "/" + avro_prefix + "*.avro"

    with models.DAG(dag_id=dag_prefix + dag_name, default_args=default_args) as dag:

        create_dataproc_cluster = DataprocClusterCreateOperator(
            task_id="create_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            num_workers=num_workers,
            image_version="1.4",
            storage_bucket=dataproc_storage_bucket,
            zone=dataproc_zone,
            master_machine_type="n1-standard-8",
            worker_machine_type="n1-standard-8",
            num_preemptible_workers=num_preemptible_workers,
            init_actions_uris=[
                "gs://dataproc-initialization-actions/python/pip-install.sh",
            ],
            metadata={"PIP_PACKAGES": "google-cloud-bigquery==1.20.0"},
        )

        run_dataproc_pyspark = DataProcPySparkOperator(
            task_id="run_dataproc_pyspark",
            cluster_name=cluster_name,
            dataproc_pyspark_jars=[
                "gs://spark-lib/bigquery/spark-bigquery-latest.jar"
            ],
            dataproc_pyspark_properties={
                "spark.jars.packages": "org.apache.spark:spark-avro_2.11:2.4.4",
            },
            main="https://raw.githubusercontent.com/mozilla/bigquery-etl/master"
            "/script/pyspark/export_to_parquet.py",
            arguments=[table]
            + [
                "--" + key + "=" + value
                for key, value in {
                    "avro-path": (not use_storage_api) and avro_path,
                    "destination": "gs://" + gcs_output_bucket,
                    "destination-table": destination_table,
                }.items()
                if value
            ]
            + (["--static-partitions"] if static_partitions else [])
            + static_partitions
            + arguments,
            gcp_conn_id=gcp_conn_id,
        )

        gcs_to_s3 = DataProcHadoopOperatorWithAws(
            task_id="gcs_to_s3",
            main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-distcp.jar",
            arguments=[
                "-update",
                "-delete",
                "gs://{}/{}".format(gcs_output_bucket, export_prefix),
                "s3a://{}/{}".format(s3_output_bucket, export_prefix),
            ],
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            aws_conn_id=aws_conn_id,
        )

        delete_dataproc_cluster = DataprocClusterDeleteOperator(
            task_id="delete_dataproc_cluster",
            cluster_name=cluster_name,
            gcp_conn_id=gcp_conn_id,
            project_id=connection.project_id,
            trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
        )

        if not use_storage_api:
            avro_export = BigQueryToCloudStorageOperator(
                task_id="avro_export",
                source_project_dataset_table=table,
                # the operator expects a list of destination URIs
                destination_cloud_storage_uris=[avro_path],
                compression=None,
                export_format="AVRO",
                bigquery_conn_id=gcp_conn_id,
            )
            avro_delete = GoogleCloudStorageDeleteOperator(
                task_id="avro_delete",
                bucket_name=gcs_output_bucket,
                prefix=avro_prefix,
                gcp_conn_id=gcp_conn_id,
                trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
            )
            avro_export >> run_dataproc_pyspark >> avro_delete

        create_dataproc_cluster >> run_dataproc_pyspark >> gcs_to_s3
        gcs_to_s3 >> delete_dataproc_cluster

        return dag
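A hedged sketch of invoking this builder from a parent DAG for a date-partitioned table; the table name, static partition column, task id, and the dag/default_args objects are illustrative assumptions, not taken from the snippet above.

from airflow.operators.subdag_operator import SubDagOperator

clients_daily_export = SubDagOperator(
    subdag=export_to_parquet(
        # "$" selects a single partition; "{{ds_nodash}}" is rendered by Airflow.
        table="moz-fx-data-shared-prod.telemetry_derived.clients_daily_v6${{ds_nodash}}",
        static_partitions=["submission_date_s3={{ds_nodash}}"],
        parent_dag_name=dag.dag_id,
        dag_name="clients_daily_export",
        default_args=default_args,
    ),
    task_id="clients_daily_export",
    dag=dag,
)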
def preprocess_tasks(model, dag, PROJECT_ID, BUCKET, DATA_DIR):
    # Constants
    # Specify your source BigQuery project, dataset, and table names
    SOURCE_BQ_PROJECT = "nyc-tlc"
    SOURCE_DATASET_TABLE_NAMES = "yellow.trips,green.trips_2014,green.trips_2015".split(",")

    # Specify your destination BigQuery dataset
    DESTINATION_DATASET = "taxifare"

    # BigQuery data query
    bql = """
    SELECT
        (tolls_amount + fare_amount) AS fare_amount,
        EXTRACT(DAYOFWEEK FROM pickup_datetime) * 1.0 AS dayofweek,
        EXTRACT(HOUR FROM pickup_datetime) * 1.0 AS hourofday,
        pickup_longitude AS pickuplon,
        pickup_latitude AS pickuplat,
        dropoff_longitude AS dropofflon,
        dropoff_latitude AS dropofflat,
        passenger_count*1.0 AS passengers,
        CONCAT(CAST(pickup_datetime AS STRING), CAST(pickup_longitude AS STRING),
               CAST(pickup_latitude AS STRING), CAST(dropoff_latitude AS STRING),
               CAST(dropoff_longitude AS STRING)) AS key
    FROM
        `{0}.{1}`
    WHERE
        trip_distance > 0
        AND fare_amount >= 2.5
        AND pickup_longitude > -78
        AND pickup_longitude < -70
        AND dropoff_longitude > -78
        AND dropoff_longitude < -70
        AND pickup_latitude > 37
        AND pickup_latitude < 45
        AND dropoff_latitude > 37
        AND dropoff_latitude < 45
        AND passenger_count > 0
        AND rand() < 0.00001
    """

    bql = bql.format(SOURCE_BQ_PROJECT, model)

    bql_train = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) < 4".format(bql)
    bql_eval = "SELECT * EXCEPT (key) FROM({0}) WHERE MOD(ABS(FARM_FINGERPRINT(key)), 5) = 4".format(bql)

    # Complete the BigQueryOperator task to truncate the table if it already exists before writing
    # Reference: https://airflow.apache.org/integration.html#bigqueryoperator
    bq_train_data_op = BigQueryOperator(
        task_id="bq_train_data_{}_task".format(model.replace(".", "_")),
        bql=bql_train,
        destination_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag
    )

    bq_eval_data_op = BigQueryOperator(
        task_id="bq_eval_data_{}_task".format(model.replace(".", "_")),
        bql=bql_eval,
        destination_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        write_disposition="WRITE_TRUNCATE",  # specify to truncate on writes
        use_legacy_sql=False,
        dag=dag
    )

    sql = """
    SELECT
        COUNT(*)
    FROM
        [{0}:{1}.{2}]
    """

    # Check to make sure that the data tables won't be empty
    bq_check_train_data_op = BigQueryCheckOperator(
        task_id="bq_check_train_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".", "_") + "_train_data"),
        dag=dag
    )

    bq_check_eval_data_op = BigQueryCheckOperator(
        task_id="bq_check_eval_data_{}_task".format(model.replace(".", "_")),
        sql=sql.format(PROJECT_ID, DESTINATION_DATASET, model.replace(".", "_") + "_eval_data"),
        dag=dag
    )

    # BigQuery training data export to GCS
    bash_remove_old_data_op = BashOperator(
        task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
        bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi".format(BUCKET, model.replace(".", "_")),
        dag=dag
    )

    # Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
    bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_train_csv_{}_task".format(model.replace(".", "_")),
        source_project_dataset_table="{}.{}_train_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[DATA_DIR + "{}/train-*.csv".format(model.replace(".", "_"))],
        export_format="CSV",
        print_header=False,
        dag=dag
    )

    bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
        task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(".", "_")),
        source_project_dataset_table="{}.{}_eval_data".format(DESTINATION_DATASET, model.replace(".", "_")),
        destination_cloud_storage_uris=[DATA_DIR + "{}/eval-*.csv".format(model.replace(".", "_"))],
        export_format="CSV",
        print_header=False,
        dag=dag
    )

    return (bq_train_data_op, bq_eval_data_op,
            bq_check_train_data_op, bq_check_eval_data_op,
            bash_remove_old_data_op,
            bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op)
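The function returns a tuple of seven operators that the caller is expected to unpack and wire together. One plausible wiring, assuming model, dag, PROJECT_ID, BUCKET, and DATA_DIR are defined in the caller; the exact dependency order here is illustrative.

(bq_train_data_op, bq_eval_data_op,
 bq_check_train_data_op, bq_check_eval_data_op,
 bash_remove_old_data_op,
 bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op) = preprocess_tasks(
    model, dag, PROJECT_ID, BUCKET, DATA_DIR)

# Query, then check the result, then clear old exports, then export to GCS.
bq_train_data_op >> bq_check_train_data_op >> bash_remove_old_data_op
bq_eval_data_op >> bq_check_eval_data_op >> bash_remove_old_data_op
bash_remove_old_data_op >> [bq_export_gcs_train_csv_op, bq_export_gcs_eval_csv_op]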
import pendulum

from function import consts, utils

# DAG settings
DAG_NAME = 'dag_bq_to_gcs'

default_args = {
    'owner': 'terunrun',
    'depends_on_past': False,
    'start_date': datetime(2021, 2, 22, 0, 0, tzinfo=pendulum.timezone('Asia/Tokyo')),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
}

dag = DAG(DAG_NAME, schedule_interval=None, default_args=default_args, catchup=False)

# Task settings
bq_to_gcs = BigQueryToCloudStorageOperator(
    task_id='bq_to_gcs',
    # dataset.table to export
    source_project_dataset_table=consts.WARM_DATASET + '.' + consts.DATA_NAME + '_'
        + '{{ (execution_date + macros.timedelta(hours=+9)).strftime("%Y%m%d") }}',
    # destination object name (a URI, so written in gs:// form); the operator expects a list
    destination_cloud_storage_uris=['gs://' + consts.TABLE_EXPORT_BUCKET + '/' + consts.FOLDER_NAME + '/'
        + consts.DATA_NAME + '_' + '{{ (execution_date + macros.timedelta(hours=+9)).strftime("%Y%m%d") }}'],
    # compression format
    compression='GZIP',
    # export format
    export_format='CSV',
    # whether to include a header row in the exported file
    print_header=False,
    dag=dag,
)
        and sessions.clientId = userId.clientId)
    select userId as distinct_id, clientId, visitId, visitStartTime, date, fullVisitorId,
        hit.hitNumber, hit.page.pagePath, hit.appInfo.screenName,
        hit.page.hostname, hit.page.pageTitle, hit.type as hit_type
    from data cross join unnest(hits) as hit"""

prepare_ga360 = BigQueryOperator(
    dag=dag,
    task_id='bq_unnest_table',
    bigquery_conn_id=google_conn_id,
    use_legacy_sql=False,
    sql=bq_extract_query.format(source_table=source_table),
    destination_dataset_table=target_table)

extract_ga360_to_gcs = BigQueryToCloudStorageOperator(
    dag=dag,
    task_id='export_recent_yesterday_to_gcs',
    bigquery_conn_id=google_conn_id,
    source_project_dataset_table=target_table,
    destination_cloud_storage_uris=[gcs_output_file],
    compression='GZIP',
    export_format='CSV')

export_gcs_to_s3 = GoogleCloudStorageToS3Operator(
    dag=dag,
    task_id="cp_gcs_to_s3",
    dest_verify=True,
    google_cloud_storage_conn_id=google_conn_id,
    bucket=gcs_bucket,
    dest_aws_conn_id='local_s3',
    dest_s3_key=redshift_s3_bucket)

load_redshift = S3ToRedshiftTransfer(dag=dag,
                                     task_id="redshift_load",
bash_remove_old_data_op = BashOperator(
    task_id="bash_remove_old_data_{}_task".format(model.replace(".", "_")),
    bash_command="if gsutil ls {0}/taxifare/data/{1} 2> /dev/null; then gsutil -m rm -rf {0}/taxifare/data/{1}/*; else true; fi".format(BUCKET, model.replace(".", "_")),
    dag=dag)

# Takes a BigQuery dataset and table as input and exports it to GCS as a CSV
train_files = BUCKET + "/taxifare/data/"
bq_export_gcs_train_csv_op = BigQueryToCloudStorageOperator(
    task_id="bq_export_gcs_train_csv_{}_task".format(model.replace(".", "_")),
    source_project_dataset_table="{}.{}_train_data".format(
        DESTINATION_DATASET, model.replace(".", "_")),
    destination_cloud_storage_uris=[
        train_files + "{}/train-*.csv".format(model.replace(".", "_"))
    ],
    export_format="CSV",
    print_header=False,
    dag=dag)

eval_files = BUCKET + "/taxifare/data/"
bq_export_gcs_eval_csv_op = BigQueryToCloudStorageOperator(
    task_id="bq_export_gcs_eval_csv_{}_task".format(model.replace(".", "_")),
    source_project_dataset_table="{}.{}_eval_data".format(
        DESTINATION_DATASET, model.replace(".", "_")),
    destination_cloud_storage_uris=[
        eval_files + "{}/eval-*.csv".format(model.replace(".", "_"))
    task_id='create_temporary_table',
    sql=sql,
    destination_dataset_table=fully_qualified_tmp_table,
    bigquery_conn_id=gcp_conn_id,
    use_legacy_sql=False
)

fully_qualified_table_name = '{}.{}.{}'.format(project, dataset, table)
gcs_bucket = 'moz-fx-data-prod-analysis'
incline_prefix = 'incline/executive_dash/{{ds}}/data.ndjson'
gcs_uri = 'gs://{bucket}/{prefix}'.format(bucket=gcs_bucket, prefix=incline_prefix)

table_extract = BigQueryToCloudStorageOperator(
    task_id='extract_as_latest',
    source_project_dataset_table=fully_qualified_table_name,
    destination_cloud_storage_uris=[gcs_uri],
    bigquery_conn_id=gcp_conn_id,
    export_format='JSON',
    compression='GZIP'
)

# Drop the temporary table
table_drop = BigQueryOperator(
    task_id='drop_temp_table',
    sql='DROP TABLE `{}`'.format(fully_qualified_tmp_table),
    bigquery_conn_id=gcp_conn_id,
    use_legacy_sql=False
)

wait_for_copy_deduplicate >> \
    migrated_clients >> \
    exec_dash >> \
        DATASET_NAME),
    use_legacy_sql=False,
)

copy_from_selected_data = BigQueryToBigQueryOperator(
    task_id="copy-from-selected-data",
    source_project_dataset_tables="{}.selected_data_from_external_table".format(DATASET_NAME),
    destination_project_dataset_table="{}.copy_of_selected_data_from_external_table".format(DATASET_NAME),
)

bigquery_to_gcs = BigQueryToCloudStorageOperator(
    task_id="bigquery-to-gcs",
    source_project_dataset_table="{}.selected_data_from_external_table".format(DATASET_NAME),
    destination_cloud_storage_uris=[
        "gs://{}/export-bigquery.csv".format(DATA_EXPORT_BUCKET_NAME)
    ],
)

create_dataset = BigQueryCreateEmptyDatasetOperator(
    task_id="create-dataset", dataset_id=DATASET_NAME)

create_dataset_with_location = BigQueryCreateEmptyDatasetOperator(
    task_id="create_dataset_with_location",
    dataset_id=LOCATION_DATASET_NAME,
    location=BQ_LOCATION)

create_table = BigQueryCreateEmptyTableOperator(
    task_id="create-table",
    dataset_id=DATASET_NAME,
    'end_date': datetime(2020, 9, 30),
    'depends_on_past': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=10)
}

dag = DAG(dag_id="my-bq-task", default_args=default_args, schedule_interval='@daily')

filepath = "gs://backup-data-2020/" + 'out_' + datetime.now().strftime(
    "%d_%b_%Y:%H:%M:%S.%f") + '.csv'

t1 = BigQueryCheckOperator(
    bigquery_conn_id="my-bq-conn",
    task_id="check_for_table",
    dag=dag,
    use_legacy_sql=False,
    sql="""select count(*) from `tech-289406.sample.covid`""")

t2 = BigQueryToCloudStorageOperator(
    task_id="dataset_to_gcs",
    source_project_dataset_table="sample.covid",
    destination_cloud_storage_uris=[filepath],
    export_format="CSV",
    field_delimiter=',',
    bigquery_conn_id="my-bq-conn",
    dag=dag,
)

t1 >> t2
execute_bq_sql = BigQueryOperator(
    task_id='execute_bq_sql',
    sql=query_sql,
    use_legacy_sql=False,
    destination_dataset_table=bq_recent_questions_table_id,
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_TRUNCATE',
    dag=dag
)

# Export the result of the temporary table to GCS
export_data_groupby = BigQueryToCloudStorageOperator(
    task_id='export_table_temp_to_gcs',
    source_project_dataset_table=bq_recent_questions_table_id,
    # the operator expects a list of destination URIs
    destination_cloud_storage_uris=['gs://{}/data/archivo_final_agrupado.csv'.format(params['BUCKET_ID'])],
    export_format='CSV',
    dag=dag
)

# The table created with the previous dataset is deleted since it is temporary
delete_bq_dataset = BashOperator(
    task_id='delete_table_temp',
    bash_command='bq rm -f %s' % bq_recent_questions_table_id,
    trigger_rule=trigger_rule.TriggerRule.ALL_DONE,
    dag=dag
)

# Upload the files from the final grouping, together with the original files
t1 = BigQueryOperator(
    task_id='bq_rec_training_data',
    bql=bql,
    destination_dataset_table='%s.recommendation_events' % DATASET,
    write_disposition='WRITE_TRUNCATE',  # specify to truncate on writes
    dag=dag)

# BigQuery training data export to GCS
# TODO: Fill in the missing operator name for task #2 which
# takes a BigQuery dataset and table as input and exports it to GCS as a CSV
training_file = BUCKET + '/data/recommendation_events.csv'
t2 = BigQueryToCloudStorageOperator(
    task_id='bq_export_op',
    source_project_dataset_table='%s.recommendation_events' % DATASET,
    destination_cloud_storage_uris=[training_file],
    export_format='CSV',
    dag=dag
)

# ML Engine training job
job_id = 'recserve_{0}'.format(datetime.datetime.now().strftime('%Y%m%d%H%M'))
job_dir = BUCKET + '/jobs/' + job_id
output_dir = BUCKET
training_args = ['--job-dir', job_dir,
                 '--train-files', training_file,
                 '--output-dir', output_dir,
                 '--data-type', 'web_views',
                 '--use-optimized']