# For every MySQL connection and table, extract the data to GCS as
# newline-delimited JSON, then load it into BigQuery. The dag object, the
# sakila_connections/sakila_tables lists and the slack_notify task are
# defined elsewhere in the DAG file.
for connection in sakila_connections:
    for table in sakila_tables:
        extract = MySqlToGoogleCloudStorageOperator(
            task_id="extract_mysql_%s_%s" % (connection, table),
            mysql_conn_id=connection,
            google_cloud_storage_conn_id='gcp_test',
            # Tag each row with its source connection so the origin survives
            # the merge into BigQuery.
            sql="SELECT *, '%s' as source FROM sakila.%s" % (connection, table),
            bucket='ghen-airflow',
            # The literal {} is filled in by the operator when the export is
            # split across multiple files.
            filename="%s/%s/%s{}.json" % (connection, table, table),
            schema_filename="%s/schemas/%s.json" % (connection, table),
            dag=dag)

        load = GoogleCloudStorageToBigQueryOperator(
            task_id="load_bq_%s_%s" % (connection, table),
            bigquery_conn_id='gcp_test',
            google_cloud_storage_conn_id='gcp_test',
            bucket='ghen-airflow',
            destination_project_dataset_table="spark-test-173322.%s.%s" % (connection, table),
            source_objects=["%s/%s/%s*.json" % (connection, table, table)],
            schema_object="%s/schemas/%s.json" % (connection, table),
            source_format='NEWLINE_DELIMITED_JSON',
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_TRUNCATE',
            project_id='spark-test-173322',
            dag=dag)

        # extract -> load -> Slack notification
        load.set_upstream(extract)
        slack_notify.set_upstream(load)
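The loop above assumes that the dag object, the sakila_connections and sakila_tables lists, and the slack_notify task already exist. A minimal sketch of that setup, where every concrete value (DAG id, connection IDs, table names, Slack channel, schedule) is an assumption rather than the original author's configuration; only the import paths are the standard Airflow 1.x contrib locations:

from datetime import datetime

from airflow import DAG
from airflow.contrib.operators.mysql_to_gcs import MySqlToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.operators.slack_operator import SlackAPIPostOperator

dag = DAG(
    'sakila_mysql_to_bigquery',  # assumed DAG id
    default_args={'owner': 'airflow', 'start_date': datetime(2017, 1, 1)},
    schedule_interval='@daily')

sakila_connections = ['sakila_east', 'sakila_west']  # assumed MySQL connection IDs
sakila_tables = ['actor', 'film', 'rental']          # assumed tables to replicate

slack_notify = SlackAPIPostOperator(
    task_id='slack_notify',
    token='slack-api-token-placeholder',  # assumed placeholder token
    channel='#data-loads',                # assumed channel
    text='Sakila MySQL to BigQuery load finished',
    dag=dag)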
Example #2
try:
    # Verify that the destination table already exists and contains rows.
    t5_prime_tableCheck = BigQueryCheckOperator(
        task_id='checkForTable',
        sql="SELECT COUNT(*) FROM `{}.{}.{}`".format(project_id, dataset_id, table_id),
        bigquery_conn_id=bigquery_conn_id,
        use_legacy_sql=False,
        dag=dag)
    t5_prime_tableCheck.set_upstream(writeToGCS_task)

    # Load the staged GCS files into the existing BigQuery table.
    storageToBQ_task = GoogleCloudStorageToBigQueryOperator(
        task_id='uploadtoBQ_{}'.format(datetime.now().strftime('%Y%m%d_%H%M')),
        bucket=bucket_name,
        source_objects=tempGCS_dir_paths,
        destination_project_dataset_table="{}.{}.{}".format(project_id, dataset_id, table_id),
        schema_fields=schema,
        write_disposition='WRITE_TRUNCATE',
        dag=dag)
    storageToBQ_task.set_upstream(t5_prime_tableCheck)
    dummy_task.set_upstream(storageToBQ_task)
except Exception as e:
    print("BigQueryCheck error = {}".format(e))
    # Fall back to defining an external table over the staged GCS files.
    t5_gamme_tableCreate = BigQueryCreateExternalTableOperator(
        task_id='CreateBQtable',
        bucket=bucket_name,
        source_objects=tempGCS_dir_paths,
        destination_project_dataset_table="{}.{}.{}".format(project_id, dataset_id, table_id),
        schema_fields=schema,
        dag=dag)
    dummy_task.set_upstream(t5_gamme_tableCreate)
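Both operators above take the schema through schema_fields as a list of BigQuery field definitions (the same structure as the BigQuery REST API's schema.fields). A sketch of what the schema variable is expected to hold; the field names here are hypothetical:

schema = [
    {'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
    {'name': 'payload', 'type': 'STRING', 'mode': 'NULLABLE'},
    {'name': 'created_at', 'type': 'TIMESTAMP', 'mode': 'NULLABLE'},
]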


'''
Read in queries stored in a GCS bucket.
'''
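As an illustration of that step only (not the original code), reading .sql files out of a GCS bucket with the Airflow 1.x contrib hook could look like the sketch below; the bucket name, the queries/ prefix and the 'gcp_test' connection are assumptions.

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

def read_queries(bucket='example-queries-bucket', prefix='queries/'):
    """Return a dict of {object_name: SQL text} for every .sql object under prefix."""
    hook = GoogleCloudStorageHook(google_cloud_storage_conn_id='gcp_test')
    queries = {}
    for object_name in hook.list(bucket, prefix=prefix):
        if object_name.endswith('.sql'):
            queries[object_name] = hook.download(bucket, object_name).decode('utf-8')
    return queries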
Example #3
    end = DummyOperator(
        task_id='end',
        trigger_rule='one_success'
    )

    # Task to copy files from S3 to GCS
    s3_email_to_gcs = S3ToGoogleCloudStorageOperator(
        task_id='s3_to_gcs',
        bucket=S3_BUCKET,
        aws_conn_id=AWS_CONNECTION,
        dest_gcs_conn_id=GCS_BUCKET_CONNECTION,
        dest_gcs=GCS_BUCKET_NAME_S3_TASK
    )
    s3_email_to_gcs.set_upstream(start)

    # Task to load files from GCS into BigQuery
    gcs_to_bq_task = GoogleCloudStorageToBigQueryOperator(
        task_id='gcs_to_bq',
        bucket=GCS_BUCKET_NAME_BQ_TASK,
        source_objects=['*.parquet'],  # Assumes Parquet files; other formats are supported
        destination_project_dataset_table=DESTINATION_TABLE,
        source_format='PARQUET',
        skip_leading_rows=1,  # Only applies to CSV sources
        max_bad_records=10,
        bigquery_conn_id=BQ_CONNECTION,
        create_disposition='CREATE_IF_NEEDED',  # Create the table in BigQuery if it does not exist
        write_disposition='WRITE_APPEND'
    )
    gcs_to_bq_task.set_upstream(s3_email_to_gcs)
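The four-space indentation and the reference to a start task suggest these tasks live inside a with DAG(...) block with the configuration constants defined above it. A sketch of that surrounding context, where every constant value, the DAG id and the schedule are assumptions made for illustration:

from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.contrib.operators.s3_to_gcs_operator import S3ToGoogleCloudStorageOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

# All of these values are assumptions, not the original configuration.
S3_BUCKET = 'example-email-bucket'
AWS_CONNECTION = 'aws_default'
GCS_BUCKET_CONNECTION = 'google_cloud_default'
GCS_BUCKET_NAME_S3_TASK = 'gs://example-landing-bucket/emails'
GCS_BUCKET_NAME_BQ_TASK = 'example-landing-bucket'
BQ_CONNECTION = 'google_cloud_default'
DESTINATION_TABLE = 'example-project.email_dataset.emails'

with DAG('s3_to_bq_email_load',
         start_date=datetime(2020, 1, 1),
         schedule_interval='@daily',
         catchup=False) as dag:

    start = DummyOperator(task_id='start')

    # ... the end, s3_email_to_gcs and gcs_to_bq_task tasks shown above go here ...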