task2 = CsvToStorage(task_id='save_imoveis')

task3 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='cloud_to_bigquery',
    bucket='aluguel-data-scraper',
    source_format='CSV',
    source_objects=['alugueis.csv'],
    destination_project_dataset_table='alugueis.alugueis',
    schema_fields=[
        {'name': 'Valor', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'TipoAluguel', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'Iptu', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Area', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Quartos', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Banheiros', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Vagas', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'Cidade', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'Endereco', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'TipoNegociacao', 'type': 'STRING', 'mode': 'NULLABLE'},
    ],
    write_disposition='WRITE_TRUNCATE',
    skip_leading_rows=1,
    autodetect=True,
    dag=dag)

# execution flow
task1
task2.set_upstream([task1])
##notice trigger_rule="all_done"
bash_run_report_remotly_cmd = 'gcloud beta compute --project myProjectName ssh scheduler2 --internal-ip --zone us-central1-a --command "sudo -u omid python3 /home/omid/gam_data_transfer/report_example_using_service_account_with_date_range.py --start ' + temp_date + " --end " + temp_date + '"'

run_report_remotly = BashOperator(
    task_id='run_report_remotly_' + temp_date,
    retries=0,
    bash_command=bash_run_report_remotly_cmd,
    trigger_rule="all_done")

start.set_downstream(bash_cleanup)
bash_cleanup.set_downstream(run_report_remotly)
run_report_remotly.set_downstream(wait)

##notice trigger_rule="all_done"
run_gsutil_mv = BashOperator(
    task_id='bash_gsutil_mv_cmd',
    retries=0,
    bash_command=bash_gsutil_mv_cmd,
    trigger_rule="all_done")

load_to_bq_from_gcs = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='load_to_bq_from_gcs',
    source_objects='*',
    skip_leading_rows=1,
    create_disposition='CREATE_NEVER',
    write_disposition='WRITE_TRUNCATE',  # overwrite?
    bucket='myBucket/google/gam/example_report',
    destination_project_dataset_table='DATA_LAKE_GOOGLE_US.example_report_partitioned')

wait >> run_gsutil_mv >> load_to_bq_from_gcs >> end
project_dm = 'dmgcp-ingestion-poc'
location = 'US'
bq_connection_id = 'bigquery_default'

default_dag_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
    'project_id': project_dm,
    'schedule_interval': None
}

dag = DAG(dag_id='test_wrt_3', default_args=default_dag_args)

load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket='ebcidc-to-bq-testing-us-central',
    source_objects=['csv/wrt/wrt_output.csv'],
    destination_project_dataset_table='airflow_test.wrt_3',
    # schema_object is the operator's parameter name; it should point at a
    # schema JSON object inside `bucket`.
    schema_object='gs://kubernetes-staging-85897c950b/Dags/json/wrt-00000-of-00001.json',
    write_disposition='WRITE_TRUNCATE',
    dag=dag,
    bigquery_conn_id=bq_connection_id,
    google_cloud_storage_conn_id=bq_connection_id)

load_csv
table_dest = record['table_dest']

BQ_to_GCS = bigquery_to_gcs.BigQueryToCloudStorageOperator(
    # Replace ":" with valid character for Airflow task
    task_id='{}_BQ_to_GCS'.format(table_source.replace(":", "_")),
    source_project_dataset_table=table_source,
    destination_cloud_storage_uris=[
        '{}-*.avro'.format('gs://' + gcs_bucket + '/' + table_source)
    ],
    export_format='AVRO')

# GCS_to_GCS = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
#     # Replace ":" with valid character for Airflow task
#     task_id='{}_GCS_to_GCS'.format(table_source.replace(":", "_")),
#     source_bucket=source_bucket,
#     source_object='{}-*.avro'.format(table_source),
#     destination_bucket=dest_bucket,
#     # destination_object='{}-*.avro'.format(table_dest)
# )

GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    # Replace ":" with valid character for Airflow task
    task_id='{}_GCS_to_BQ'.format(table_dest.replace(":", "_")),
    bucket=gcs_bucket,
    source_objects=['{}-*.avro'.format(table_source)],
    destination_project_dataset_table=table_dest,
    source_format='AVRO',
    write_disposition='WRITE_TRUNCATE')

start >> BQ_to_GCS >> GCS_to_BQ >> end
}

dag = models.DAG('build_train_deploy', default_args=default_dag_args)
# [END dag_build_train_deploy]

# Instantiate a Google Cloud base hook to get credentials and create AutoML clients.
gcp_hook = GoogleCloudBaseHook(conn_id='google_cloud_default')
automl_client = AutoMlClient(credentials=gcp_hook._get_credentials())

# Load the database dump from Cloud Storage into BigQuery.
t1 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id="db_dump_to_bigquery",
    bucket=COMPOSER_BUCKET_NAME,
    source_objects=[DB_DUMP_FILENAME],
    schema_object="schema_source.json",
    source_format="CSV",
    skip_leading_rows=1,
    destination_project_dataset_table="{}.{}.{}".format(
        PROJECT, DATASET, 'data_source'),
    create_disposition="CREATE_IF_NEEDED",
    write_disposition="WRITE_TRUNCATE",
    dag=dag)

# Clean the data from BigQuery to BigQuery.
t2 = bigquery_operator.BigQueryOperator(
    task_id='bq_from_source_to_clean',
    bql='{}/common/clean.sql'.format(GCS_SQL),
    use_legacy_sql=False,
    allow_large_results=True,
    destination_dataset_table="{}.{}.{}".format(PROJECT, DATASET, 'data_cleaned'),
    create_disposition="CREATE_IF_NEEDED",
default_dag_args = {
    'start_date': datetime.datetime(2018, 1, 1),
}

with models.DAG('Airflow', default_args=default_dag_args) as dag:

    def cleaning():
        df = pd.read_csv("gs://heena_dawani/P9-ConsoleGames.csv")
        df["Platform"].fillna("N/A", inplace=True)
        df["NA_Sales"].fillna(0, inplace=True)
        df["EU_Sales"].fillna(0, inplace=True)
        df["JP_Sales"].fillna(0, inplace=True)
        df["Other_Sales"].fillna(0, inplace=True)
        new_data = df.dropna(axis=0, how='any')
        new_data.to_csv("gs://heena_dawani/games.csv")

    Cleaning = python_operator.PythonOperator(
        task_id='Cleaning',
        python_callable=cleaning)

    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='GCS_to_BQ',
        bucket='heena_dawani',
        skip_leading_rows=1,
        autodetect=True,
        source_objects=["games.csv"],
        destination_project_dataset_table='heena_dawani.heena',
        source_format='CSV',
        write_disposition='WRITE_TRUNCATE')

    Cleaning >> GCS_to_BQ
    'retry_delay': datetime.timedelta(minutes=5),
}

# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------
# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG('comoser_sample_bq_copy_across_locations',
                default_args=default_args,
                schedule_interval=None) as dag:

    start = dummy_operator.DummyOperator(task_id='start', trigger_rule='all_success')
    end = dummy_operator.DummyOperator(task_id='end', trigger_rule='all_success')

    GCS_to_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        # Replace ":" with valid character for Airflow task
        task_id='copy_csv_from_storage_to_bq',
        bucket='baketto1',
        # source_objects takes a list of object names, not a bare string.
        source_objects=['avocado.csv'],
        destination_project_dataset_table='micro-store-218714.avocadoDataset.avocado5',
        source_format='CSV',
        write_disposition='WRITE_TRUNCATE')

    start >> GCS_to_BQ >> end
    dst='data/spinnup/fraud_detection_v01/year={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y\') }}/month={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m\') }}/day={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m%d\') }}/v01_results{{ds_nodash}}.csv',
    bucket='{{ var.value.project_bucket }}',
    conn_id='google_cloud_default',
    src='%s/v01_results.csv' % (DIR),
    dag=dag)

# Load results from GCS to BQ
load_to_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='load_to_bq',
    bucket='{{ var.value.project_bucket }}',
    source_objects=[
        'data/spinnup/fraud_detection_v01/year={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y\') }}/month={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m\') }}/day={{ macros.ds_format(ds, \'%Y-%m-%d\', \'%Y%m%d\') }}/v01_results{{ds_nodash}}.csv'
    ],
    destination_project_dataset_table='umg-data-science.detect_fraud_spinnup.v01_results${{ ds_nodash }}',
    schema_object='builds/flow/schemas/spinnup/spinnup_fraud_detection_v01_schema.json',
    source_format='CSV',
    field_delimiter=',',
    bigquery_conn_id='bigquery_default',
    skip_leading_rows=0,
    write_disposition='WRITE_TRUNCATE',
    dag=dag)

# Delete directory from the Airflow instance
delete_directory = BashOperator(
    task_id='delete_directory_from_instance',
    bash_command='rm -rf {}'.format(DIR),
    dag=dag)

# DAG structure
spinnup_partition_sensor >> generate_dataset >> mkdir >> download_script >> generate_results >> upload_to_gcs >> load_to_bq >> delete_directory
MOVE_SFTP_FILE_TO_GCS = sftp_operator.SFTPOperator(
    task_id="move_sftp_file_to_gcs",
    ssh_conn_id=SSH_CONN_ID,
    google_cloud_storage_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
    local_filepath=LOCAL_FILEPATH,
    remote_filepath=REMOTE_FILEPATH,
    operation=OPERATION,
    confirm=True)

MOVE_GCS_FILE_TO_BQ = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id="move_gcs_file_to_bq",
    bucket=BUCKET,
    source_objects=[SOURCE_OBJECTS],
    # Append the files into a "<main table>_temp" staging table.
    destination_project_dataset_table="{}_temp".format(
        DESTINATION_PROJECT_DATASET_TABLE_MAIN),
    skip_leading_rows=1,
    autodetect=True,
    source_format=SOURCE_FORMAT,
    create_disposition=CREATE_DISPOSITION,
    write_disposition='WRITE_APPEND',
    google_cloud_storage_conn_id=GOOGLE_CLOUD_STORAGE_CONN_ID,
    bigquery_conn_id=BIGQUERY_CONN_ID)

BQ_TO_BQ_MERGING = bigquery_operator.BigQueryOperator(
    task_id="gbq_merging_tables",
    sql=SQL,
    use_legacy_sql=False,
    destination_project_dataset_table=DESTINATION_PROJECT_DATASET_TABLE_MAIN,
    write_disposition='WRITE_EMPTY',
    create_disposition=CREATE_DISPOSITION,
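The last operator above is cut off, but together the tasks outline a common pattern: append new files into a "<table>_temp" staging table, then merge the staging rows into the main table with a BigQuery job. A minimal sketch of that flow follows, assuming the contrib-era imports used throughout these examples; the DAG id, bucket, table names, and MERGE statement are purely illustrative and do not come from the snippet.

import datetime

from airflow import models
from airflow.contrib.operators import bigquery_operator, gcs_to_bq

with models.DAG('gcs_staging_then_merge_sketch',  # hypothetical DAG id
                schedule_interval=None,
                default_args={'start_date': datetime.datetime(2020, 1, 1)}) as dag:

    # 1) Append the new files into a staging table (names are placeholders).
    load_staging = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_staging',
        bucket='my-bucket',
        source_objects=['incoming/users.csv'],
        destination_project_dataset_table='my_dataset.users_temp',
        source_format='CSV',
        skip_leading_rows=1,
        autodetect=True,
        write_disposition='WRITE_APPEND')

    # 2) Merge the staging rows into the main table (illustrative MERGE statement).
    merge_staging = bigquery_operator.BigQueryOperator(
        task_id='merge_staging_into_main',
        use_legacy_sql=False,
        sql="""
        MERGE `my_dataset.users` AS main
        USING `my_dataset.users_temp` AS staging
        ON main.id = staging.id
        WHEN MATCHED THEN UPDATE SET main.email = staging.email
        WHEN NOT MATCHED THEN INSERT ROW
        """)

    load_staging >> merge_staging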
jsonl_file_names_format = src_nodes_ways_relations_gcs_dir + '{}_{}.jsonl'

for element_and_schema in elements_and_schemas:
    element, schema = element_and_schema
    task_id = element + '_json_to_bq_{}_{}'.format(
        index + 1, addt_mn_gke_pool_num_nodes)
    source_object = jsonl_file_names_format.format(element, index)
    destination_dataset_table = '{}.history_{}'.format(
        bq_dataset_to_export, element)

    json_to_bq_task = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id=task_id,
        bucket=src_nodes_ways_relations_gcs_bucket,
        source_objects=[source_object],
        source_format='NEWLINE_DELIMITED_JSON',
        destination_project_dataset_table=destination_dataset_table,
        schema_fields=schema,
        write_disposition='WRITE_APPEND',
        max_bad_records=max_bad_records_for_bq_export,
        retries=5,
        dag=dag)
    json_to_bq_tasks.append(json_to_bq_task)

    remove_json_task = gcs_delete_operator.GoogleCloudStorageDeleteOperator(
        task_id='remove_json-{}-{}-{}'.format(
            element, index + 1, addt_mn_gke_pool_num_nodes),
        bucket_name=src_nodes_ways_relations_gcs_bucket,
        objects=[source_object])
    json_to_bq_task.set_downstream(remove_json_task)

    update_result_shard_timestamp_task = python_operator.PythonOperator(
        task_id='update-result-shard-timestamp-{}-{}-{}'.format(
    bucket=source_bucket,
    ts_func=context['dag'].context['execution_date'],
    object="{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}",
    provide_context=True)

fork = BranchPythonOperator(
    task_id='fork',
    python_callable=branch,
    provide_context=True)

sink_bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='sink_bq',
    bucket=source_bucket,
    source_objects=[
        "{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}"
    ],  # pulls file_name
    destination_project_dataset_table='omniture.omniture_raw_2017_2027',
    schema_fields=bq_schema,
    compression=compression_type,
    field_delimiter=field_delimiter_type,
    write_disposition=write_disposition_type,
    provide_context=True)

move_sink_file = gcs_to_gcs.GoogleCloudStorageToGoogleCloudStorageOperator(
    task_id='move_sink_file',
    source_bucket=source_bucket,
    source_object="{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}",
    destination_bucket=del_bucket,
    destination_object='processed_sink_files/' +
        "{{ task_instance.xcom_pull(task_ids='get_file', key='file_name') }}",
    move_object=True,
from builtins import range
from datetime import timedelta

from airflow.models import DAG
from airflow.utils.dates import days_ago
from airflow.contrib.operators import gcs_to_bq

args = {
    'owner': 'Airflow',
    'start_date': days_ago(2),
}

dag = DAG(
    dag_id='Big_Data_Demystified_gcs_to_bq_example',
    default_args=args,
    schedule_interval='0 0 * * *',
    dagrun_timeout=timedelta(minutes=60),
    tags=['example'])

gcs2bq = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='Big_Data_Demystified_gcs2bq',
    bucket='MyBucket',
    source_objects='*',
    destination_project_dataset_table='MyDataSet.MyTable',
    skip_leading_rows=1,
    dag=dag,
)
    # To email on failure or retry set 'email' arg to your email and enable
    # emailing here.
    'email_on_failure': False,
    'email_on_retry': False,
    # If a task fails, retry it once after waiting at least 5 minutes
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
    'project_id': models.Variable.get('gcp_project')
}

with models.DAG(
        'android_reviews_load_to_bigQuery',
        # Continue to run DAG once per day
        schedule_interval=datetime.timedelta(days=1),
        default_args=default_dag_args) as dag:

    # Load the Android review files from the GCS bucket into a BigQuery table.
    logging.error('trying to GCS_TO_BQ: ')
    load_to_bq_from_gcs = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_to_bq_from_gcs',
        source_objects='*',
        skip_leading_rows=1,
        write_disposition='WRITE_TRUNCATE',  # overwrite?
        bucket=DST_BUCKET_UTF8,
        destination_project_dataset_table='DATA.Replica_android_review')

    # Define DAG dependencies.
    # create_dataproc_cluster >> run_dataproc_hadoop >> delete_dataproc_cluster
    load_to_bq_from_gcs
    provide_context=True,
    dag=dag)

t3 = file_to_gcs.FileToGoogleCloudStorageOperator(
    task_id="upload_raw_data",
    src="user.csv",
    dst="data/user.csv",
    retry_delay=5,
    retries=1,
    bucket="asia-northeast1-example-env-c50e72d7-bucket",
    dag=dag)

t4 = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id="load_into_bq",
    bucket="asia-northeast1-example-env-c50e72d7-bucket",
    source_objects=["data/user.csv"],
    schema_fields=[
        {'name': 'active', 'type': 'BOOL'},
        {'name': 'email', 'type': 'STRING'},
        {'name': 'employee_number', 'type': 'STRING'},
        {'name': 'name', 'type': 'STRING'},
        {'name': 'location', 'type': 'STRING'},
        {'name': 'sys_tags', 'type': 'STRING'}
    ],
    destination_project_dataset_table="composer_demo.user",
    write_disposition='WRITE_TRUNCATE',
    autodetect=True,
    dag=dag)

t1 >> t2 >> t3 >> t4
bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(
    task_id='bash_api_call_GET_DESKTOP_TRAFFIC' + single_date.strftime("%Y%m%d"),
    bash_command=bash_cmd)

bash_cmd2 = """gsutil mv /tmp/file_""" + single_date.strftime(
    "%Y%m%d") + '.json gs://data_lake/similar_web_desktop_traffic/'

bash_gsutil_mv_files_to_ingestion = BashOperator(
    task_id='bash_gsutil_mv_files_to_ingestion' + single_date.strftime("%Y%m%d"),
    bash_command=bash_cmd2)

# bash_cmd = """ls"""
# bash_api_call_GET_DESKTOP_TRAFFIC = BashOperator(task_id='bash_opr_' + str(item), bash_command=bash_cmd)

start.set_downstream(bash_api_call_GET_DESKTOP_TRAFFIC)
bash_api_call_GET_DESKTOP_TRAFFIC.set_downstream(
    bash_gsutil_mv_files_to_ingestion)
bash_gsutil_mv_files_to_ingestion.set_downstream(wait)

load_to_bg_GET_DESKTOP_TRAFFIC = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='load_to_bg_GET_DESKTOP_TRAFFIC',
    source_objects=['*'],
    write_disposition='WRITE_TRUNCATE',  # overwrite?
    create_disposition='CREATE_IF_NEEDED',
    bucket=DST_BUCKET,
    destination_project_dataset_table=dst_table,
    autodetect=True)

end = DummyOperator(task_id='end')

wait >> load_to_bg_GET_DESKTOP_TRAFFIC >> end
create_test_dataset = bash_operator.BashOperator(
    task_id='create_airflow_test_dataset',
    bash_command='bq mk airflow_test',
    dag=dag)

# [START howto_operator_gcs_to_bq]
load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='gcs_to_bq_example',
    bucket='cloud-samples-data',
    source_objects=['bigquery/us-states/us-states.csv'],
    destination_project_dataset_table='airflow_test.gcs_to_bq_table',
    schema_fields=[
        {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'post_abbr', 'type': 'STRING', 'mode': 'NULLABLE'},
    ],
    write_disposition='WRITE_TRUNCATE',
    dag=dag)
# [END howto_operator_gcs_to_bq]

delete_test_dataset = bash_operator.BashOperator(
    task_id='delete_airflow_test_dataset',
    bash_command='bq rm -rf airflow_test',
    dag=dag)
move_users_data_to_staging = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
    task_id='move_users_data_to_staging',
    bucket='demo-sample',
    source_objects=['so_users.json'],
    write_disposition='WRITE_TRUNCATE',
    schema_fields=[
        {'name': 'id', 'type': 'INTEGER', 'mode': 'REQUIRED'},
        {'name': 'display_name', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'age', 'type': 'STRING', 'mode': 'NULLABLE'},
        {'name': 'reputation', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'up_votes', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'down_votes', 'type': 'INTEGER', 'mode': 'NULLABLE'},
        {'name': 'views', 'type': 'INTEGER', 'mode': 'NULLABLE'}
    ],
    skip_leading_rows=1,
    ignore_unknown_values=True,
    allow_jagged_rows=True,
    create_disposition='CREATE_IF_NEEDED',
    source_format='NEWLINE_DELIMITED_JSON',
    retries=2,
    retry_delay=timedelta(seconds=15),
    destination_project_dataset_table='demo_temp.stackoverflow_users',
    dag=dag)
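Most of the snippets above omit their imports and DAG setup, so here is a minimal, self-contained sketch of the shared pattern using the contrib-era import path they all rely on. The DAG id, bucket, object, and table names are placeholders for illustration, not values taken from any of the examples.

import datetime

from airflow import models
from airflow.contrib.operators import gcs_to_bq

default_dag_args = {
    'start_date': datetime.datetime(2020, 1, 1),
}

with models.DAG('gcs_to_bq_minimal_sketch',  # hypothetical DAG id
                schedule_interval=None,
                default_args=default_dag_args) as dag:

    # Load one CSV object from GCS into a BigQuery table, replacing its contents.
    load_csv = gcs_to_bq.GoogleCloudStorageToBigQueryOperator(
        task_id='load_csv',
        bucket='my-bucket',                                        # placeholder
        source_objects=['path/to/file.csv'],                       # placeholder
        destination_project_dataset_table='my_dataset.my_table',   # placeholder
        source_format='CSV',
        skip_leading_rows=1,
        autodetect=True,
        write_disposition='WRITE_TRUNCATE')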