f"create or replace view {v['dag'] + '.' + v['name']} as ({v['select']})" ], postgres_conn_id='etl_postgres') opr_pause.set_downstream(opr_make_view) if 'id' in v.keys() and v['id'] != None: opr_socrata_upload = PythonOperator( task_id=f"upload_{v['name']}", python_callable=destinations.upload_to_socrata, provide_context=True, op_kwargs={ "id": v['id'], "method": v['method'], "table": v['dag'] + '.' + v['name'] }) opr_make_view.set_downstream(opr_socrata_upload) # opr_csv_dump = PostgresOperator( # task_id=f"dump_to_csv_{v['name']}", # sql=f"""COPY (select distinct * from {v['dag'] + '.' + v['name']}) TO '/tmp/{v['name']}.csv' WITH (FORMAT CSV, FORCE_QUOTE *, QUOTE '"', HEADER);""", # postgres_conn_id='etl_postgres' # ) # opr_make_view.set_downstream(opr_csv_dump) # opr_fix_dupes = BashOperator( # task_id=f"dedupe_rows_{v['name']}", # bash_command=f"cat /tmp/{v['name']}.csv | awk '!seen[$0]++' >| /tmp/accela_extract/{v['name']}_deduped.csv", # ) # opr_csv_dump.set_downstream(opr_fix_dupes)
for d, v in od_config['views'].items():
    v['name'] = f"{od_name}_{d}"
    v['dag'] = dag.dag_id

    # Make view & set downstream of opr_pause
    opr_make_view = PostgresOperator(
        task_id=f"make_view_{v['name']}",
        sql=f"create or replace view {v['dag'] + '.' + v['name']} as ({v['select']})",
        postgres_conn_id='etl_postgres')
    opr_pause.set_downstream(opr_make_view)

    # Get appropriate Operator & set downstream of make_view
    opr_dump_file = destinations.pg_to_file(v)
    opr_make_view.set_downstream(opr_dump_file)

    if v['export'] == 'shapefile':
        filepath = f"/tmp/{v['name']}.zip"
    elif v['export'] == 'geojson':
        filepath = f"/tmp/{v['name']}.json"

    if 'id' in v.keys() and len(v['id']) > 0 and v['destination'] == 'ago':
        # Upload to AGO and set downstream of dump_file
        opr_upload = PythonOperator(
            task_id=f"upload_{v['name']}",
            python_callable=destinations.upload_to_ago,
            op_kwargs={
                "id": v['id'],
                "filepath": filepath
            })
        opr_dump_file.set_downstream(opr_upload)
# create dag and schedule a load interval every day at midnight (7am UTC)
dag = DAG('extract_and_load',
          catchup=False,
          default_args=default_args,
          schedule_interval=timedelta(days=1),
          max_active_runs=1)

# task to create the table if it does not exist
task_create_table = PostgresOperator(
    task_id='task_create_table',
    sql='./extract_load_pipeline/sql/create_postgres_table.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

# extracts bq to a gcs bucket as csv
task_bq_to_gcs = PythonOperator(
    task_id='task_bq_to_gcs',
    python_callable=bq_to_gcs,
    provide_context=True,
    op_kwargs={'start_date': default_args['start_date']},
    dag=dag)

# loads postgres table from csv
task_gcs_to_postgres = PythonOperator(
    task_id='task_gcs_to_postgres',
    python_callable=load_table,
    provide_context=True,
    dag=dag)

task_create_table.set_downstream(task_bq_to_gcs)
task_bq_to_gcs.set_downstream(task_gcs_to_postgres)
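# Note (sketch, not part of this snippet): bq_to_gcs and load_table are
# defined elsewhere. Because both PythonOperators above set
# provide_context=True, Airflow passes the template context (ds, ts,
# execution_date, ...) to the callable as keyword arguments, merged with
# op_kwargs. A minimal signature compatible with the calls above, with
# hypothetical bodies, would look like this:
def bq_to_gcs(start_date, **context):
    # hypothetical: export the BigQuery table to a GCS bucket as CSV
    ...


def load_table(**context):
    # hypothetical: COPY the exported CSV into the Postgres table
    ...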
        DROP VIEW IF EXISTS model_draft.final_result_hoes CASCADE;
        DROP MATERIALIZED VIEW IF EXISTS model_draft.substations_to_drop_hoes CASCADE;
        DROP MATERIALIZED VIEW IF EXISTS model_draft.buffer_75_hoes CASCADE;
        DROP MATERIALIZED VIEW IF EXISTS model_draft.buffer_75_a_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.summary_de_hoes CASCADE;
        DROP MATERIALIZED VIEW IF EXISTS model_draft.summary_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.summary_total_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.substation_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.relation_substations_with_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.node_substations_with_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.way_substations_with_hoes CASCADE;
        DROP VIEW IF EXISTS model_draft.way_substations CASCADE;
        ''',
)

create_table_ego_grid_ehv_substation.set_downstream(create_view_way_substations)
create_view_way_substations.set_downstream(create_view_way_substations_with_hoes)
create_view_way_substations_with_hoes.set_downstream(create_view_node_substations_with_hoes)
create_view_node_substations_with_hoes.set_downstream(create_view_relation_substations_with_hoes)
create_view_relation_substations_with_hoes.set_downstream(create_view_substation_hoes)
create_view_substation_hoes.set_downstream(create_view_summary_total_hoes)
create_view_summary_total_hoes.set_downstream(create_view_summary_hoes)
create_view_summary_hoes.set_downstream(create_index_gist)
create_index_gist.set_downstream(create_view_summary_de_hoes)
create_view_summary_de_hoes.set_downstream(create_view_buffer_75_hoes)
create_view_buffer_75_hoes.set_downstream(create_view_buffer_75_a_hoes)
create_view_buffer_75_a_hoes.set_downstream(create_view_substations_to_drop_hoes)
create_view_substations_to_drop_hoes.set_downstream(create_view_final_result_hoes)
create_view_final_result_hoes.set_downstream(insert_into_ego_grid_ehv_substation)
insert_into_ego_grid_ehv_substation.set_downstream(drop)
"create index demos_pid_idx on graphql.demos using btree(parcelno)", "alter table graphql.demos add constraint demos_to_pid foreign key (parcelno) references graphql.parcels(parcelno)" ], "property_sales": [ "drop table if exists graphql.sales cascade", "create table graphql.sales as (select * from assessor.sales s where s.pnum in (select parcelno from graphql.parcels))", "alter table graphql.sales rename column pnum to parcelno", "create index sales_pid_idx on graphql.sales using btree(parcelno)", "alter table graphql.sales add constraint sales_to_pid foreign key (parcelno) references graphql.parcels(parcelno)" ], "ownership": [ "drop table if exists graphql.ownership cascade", "create table graphql.ownership as (select * from assessor.parcelmaster pm where pm.pnum in (select parcelno from graphql.parcels))", "alter table graphql.ownership rename column pnum to parcelno", "create index ownership_pid_idx on graphql.ownership using btree(parcelno)", "alter table graphql.ownership add constraint ownership_to_pid foreign key (parcelno) references graphql.parcels(parcelno)" ] } for key in views: value = views[key] opr_make_views = PostgresOperator(task_id=f"make_{key}", sql=value, postgres_conn_id='etl_postgres') opr_make_parcels.set_downstream(opr_make_views) opr_make_views.set_downstream(opr_make_pgfunctions) opr_make_pgfunctions >> opr_dump_schema >> opr_transfer_schema >> opr_transfer_script >> opr_kill_postgraphile >> opr_create_db >> opr_restart_postgraphile
t.set_upstream(dummy_op)
t.set_downstream(create_temp_scores_table_op)

archive_trained_models_op = BashOperator(
    task_id='archive_trained_models',
    bash_command='scripts/bash/archive_trained_models.sh',
    dag=dag
)

notify_processing_completion_op = SlackAPIPostOperator(
    task_id='notify_processing_completion',
    token=Variable.get('slack_token'),
    channel='#engineering-commits',
    username='******',
    icon_url=Variable.get('tia_slack_icon_url'),
    text='*user_work_experience_job_posting_similarity_scores* has been refreshed on {{ts}}',
    dag=dag
)

create_temp_scores_table_op.set_downstream(copy_scores_to_temp_table_op)
copy_scores_to_temp_table_op.set_downstream(remove_scores_op)
copy_scores_to_temp_table_op.set_downstream(update_scores_op)
delete_temp_scores_table_op.set_upstream(remove_scores_op)
delete_temp_scores_table_op.set_upstream(update_scores_op)
delete_temp_scores_table_op.set_downstream(notify_processing_completion_op)
dummy_op.set_upstream(compute_title_feature_op)
dummy_op.set_upstream(compute_skill_feature_op)
dummy_op.set_upstream(compute_description_feature_op)
dummy_op.set_downstream(archive_trained_models_op)
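# Note (equivalent form, not part of the original snippet): set_upstream and
# set_downstream also accept lists, and the bit-shift operators work with
# lists too, so the fan-in of the three compute-feature tasks into dummy_op
# above could be collapsed to a single statement:
[compute_title_feature_op, compute_skill_feature_op,
 compute_description_feature_op] >> dummy_op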
dag = DAG('transfer_data',
          catchup=False,
          default_args=default_args,
          schedule_interval=None,
          max_active_runs=1)

task_create_tables = PostgresOperator(
    task_id='task_create_tables',
    sql=SQL_PATH + 'create_transfer_and_aggregate_tables.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

task_create_views = PostgresOperator(
    task_id='task_create_views',
    sql=SQL_PATH + 'create_views.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

task_load_transfer_table = PythonOperator(
    task_id='task_load_transfer_table',
    python_callable=load_transfer,
    provide_context=True,
    dag=dag)

task_transfer_to_aggregate_table = PostgresOperator(
    task_id='task_transfer_to_aggregate_table',
    sql=SQL_PATH + 'insert_into_aggregate.sql',
    postgres_conn_id='my_local_db',
    dag=dag)

task_create_tables.set_downstream(task_create_views)
task_create_views.set_downstream(task_load_transfer_table)
task_load_transfer_table.set_downstream(task_transfer_to_aggregate_table)
    send_email(
        to=[
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
            '*****@*****.**',
        ],
        subject=f"Dockless Stats for {kwargs['yesterday_ds']}",
        html_content=email_template)
    return True


email_task = PythonOperator(task_id='scoot_stat_email',
                            python_callable=email_callback,
                            provide_context=True,
                            dag=dag)

set_xcom = PythonOperator(task_id="computing_stats",
                          provide_context=True,
                          python_callable=set_xcom_variables,
                          dag=dag)

task1.set_downstream(set_xcom)
task2.set_downstream(set_xcom)
email_task.set_upstream(set_xcom)
bash_command='echo "works" ', dag=dag, ) # t7 = BashOperator( # task_id='delete_previousdays_records', # depends_on_past=False, # bash_command='echo "works" ', # dag=dag, # ) #Setting Up Dependencies # This means that t2 will depend on t1 # running successfully to run. t1.set_downstream(t3) # similar to above where t3 will depend on t1 t3.set_upstream(t2) t3.set_downstream(t4) t4.set_downstream(t5) t3.set_downstream(t4) # The bit shift operator can also be # used to chain operations: # t1 >> t2 # And the upstream dependency with the # bit shift operator: # t2 << t1
task_id=f"truncate_{tables[t]}_update", sql=f"truncate table accela.{tables[t]}_update", postgres_conn_id='etl_postgres') opr_table_pause = BashOperator(task_id=f"pause_{tables[t]}", bash_command="echo 'Fake'") for f in txtfile: opr_copy_table = PostgresOperator( task_id=f"copy_{tables[t]}_{f.split('_')[0]}", sql=[ f"""COPY accela.{tables[t]}_update FROM '/tmp/accela_extract/{f}' (FORMAT CSV, HEADER TRUE, QUOTE '"', DELIMITER '|', ENCODING 'latin-1')""" ], postgres_conn_id='etl_postgres') opr_truncate_table.set_downstream(opr_copy_table) opr_copy_table.set_downstream(opr_table_pause) opr_insert_update = PostgresOperator( task_id=f"insert_{tables[t]}_{f.split('_')[0]}", sql= f"insert into accela.{tables[t]} select * from accela.{tables[t]}_update", postgres_conn_id='etl_postgres') opr_table_pause.set_downstream(opr_insert_update) # # Create Postgres tables # for t in tables: # opr_make_pgtables = PythonOperator( # task_id='csv_to_pg_'+t.lower().replace('_',''), # provide_context=True,