True, "approved": True, "inProg": False, "done": False, "approvedBy": "karakuri", "workflow": workflow_id }) print("TASKS: ", tasks) dag = DAG('sfsc_review_new_airflow_process_tasks', default_args=default_args, schedule_interval=None) start = DummyOperator(task_id='start', default_args=default_args, dag=dag) process = SubDagOperator( task_id='process', subdag=subdag_tasks('sfsc_review_new_airflow_process_tasks', 'process', tasks, default_args), default_args=default_args, dag=dag, ) start >> process
gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)

email_links = EmailOperator(task_id='email_best_links',
                            to='*****@*****.**',
                            subject='Latest popular links',
                            html_content='Check out the latest!!',
                            files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
                            dag=dag)

sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)

clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
"export_data_hql": export_table2_to_gcp_hql, "add_dataproc_partition_hql": add_dataproc_table2_partition_hql, "drop_tmp_table_hql": drop_tmp_table2_hql } } } export_table_params = { 'gcp_warehouse_path': persist_cfg['gcp_warehouse']['casesci_prd'], 'gcp_keyfile': persist_cfg['gcp_keyfile']['casesci_prd'] } export_table_dataproc_config = { "dataproc_cluster": persist_cfg["gcp_prod_project"]["dataproc_cluster"], "region": persist_cfg["gcp_prod_project"]["region"], "gcp_conn_id": persist_cfg["gcp_prod_project"]["gcp_conn_id"] } for op_name in export_table_dict: export_table_dict[op_name]["operator"] = SubDagOperator( subdag=export_to_gcp_dag("crmdata_bids_bounds." + op_name, sync_data2GCP_dag.schedule_interval, persist_cfg['queue'], sync_data2GCP_dag.default_args, export_table_dict[op_name]["hql_dict"], export_table_params, export_table_dataproc_config), task_id=op_name, queue=persist_cfg['queue'], dag=sync_data2GCP_dag)
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)
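# For context: the `subdag(...)` factory used by section-1 and section-2 above
# is not shown here. A minimal sketch in the style of Airflow's
# example_subdag_operator (the exact body in the source may differ):
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator


def subdag(parent_dag_name, child_dag_name, args):
    # The child dag_id must be '<parent>.<child>' for SubDagOperator to pick
    # it up.
    dag_subdag = DAG(
        dag_id='{}.{}'.format(parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval='@once',
    )
    for i in range(3):
        DummyOperator(
            task_id='{}-task-{}'.format(child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag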
# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
dag8 = DAG(
    dag_id='test_scheduled_queued_tasks',
    start_date=DEFAULT_DATE,
    end_date=DEFAULT_DATE,
    default_args=default_args)
dag8_task1 = PythonOperator(
    # use delayed_fail because otherwise LocalExecutor will have a chance to
    # complete the task
    python_callable=delayed_fail,
    task_id='test_queued_task',
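# Note: the `fail` and `delayed_fail` callables referenced by the test DAGs
# above are not shown in this fragment. Plausible minimal stand-ins (an
# assumption, not the exact helpers from the source test suite) could be:
import time


def fail():
    # Unconditional failure, used to drive the deadlocked-subdag case.
    raise ValueError('Expected failure.')


def delayed_fail():
    # Sleep first so the executor has time to pick the task up before the
    # failure is raised, then fail.
    time.sleep(5)
    raise ValueError('Expected failure.')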
    start_task >> dt_s3
    dt_s3 >> dt_sf
    dt_sf >> end

    return one_dag


#############################################################################
# Defining the main DAG structure
#############################################################################

main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once',
    # schedule_interval=timedelta(minutes=5),
    # max_active_runs=1,
    concurrency=10)

database_list = get_database_list(database_include_patterns)

# Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, i, '@once'),
                             task_id=i,
                             dag=main_dag,
                             pool='Pool_max_parallel_5',
                             executor=LocalExecutor())