def test_subdag_pools(self):
    """
    Subdags and subdag tasks can't both have a pool with 1 slot
    """
    dag = DAG('parent', default_args=default_args)
    subdag = DAG('parent.child', default_args=default_args)

    session = airflow.settings.Session()
    pool_1 = airflow.models.Pool(pool='test_pool_1', slots=1)
    pool_10 = airflow.models.Pool(pool='test_pool_10', slots=10)
    session.add(pool_1)
    session.add(pool_10)
    session.commit()

    dummy_1 = DummyOperator(task_id='dummy', dag=subdag, pool='test_pool_1')

    self.assertRaises(AirflowException,
                      SubDagOperator,
                      task_id='child',
                      dag=dag,
                      subdag=subdag,
                      pool='test_pool_1')

    # recreate dag because failed subdagoperator was already added
    dag = DAG('parent', default_args=default_args)
    SubDagOperator(task_id='child',
                   dag=dag,
                   subdag=subdag,
                   pool='test_pool_10')

    session.delete(pool_1)
    session.delete(pool_10)
    session.commit()
def test_subdag_name(self):
    """
    Subdag names must be {parent_dag}.{subdag task}
    """
    dag = DAG('parent', default_args=default_args)
    subdag_good = DAG('parent.test', default_args=default_args)
    subdag_bad1 = DAG('parent.bad', default_args=default_args)
    subdag_bad2 = DAG('bad.test', default_args=default_args)
    subdag_bad3 = DAG('bad.bad', default_args=default_args)

    SubDagOperator(task_id='test', dag=dag, subdag=subdag_good)
    self.assertRaises(AirflowException,
                      SubDagOperator,
                      task_id='test',
                      dag=dag,
                      subdag=subdag_bad1)
    self.assertRaises(AirflowException,
                      SubDagOperator,
                      task_id='test',
                      dag=dag,
                      subdag=subdag_bad2)
    self.assertRaises(AirflowException,
                      SubDagOperator,
                      task_id='test',
                      dag=dag,
                      subdag=subdag_bad3)
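# The two tests above pin down the SubDagOperator contract: the child DAG's dag_id
# must be exactly '<parent_dag_id>.<task_id>', and pool settings that could deadlock
# are rejected. A minimal sketch of a factory that satisfies the naming rule follows;
# 'build_subdag' and its placeholder task are hypothetical names, not part of the
# test suite above.
def build_subdag(parent_dag_id, task_id, args):
    child = DAG(dag_id='{}.{}'.format(parent_dag_id, task_id), default_args=args)
    DummyOperator(task_id='placeholder', dag=child)
    return child


parent = DAG('parent', default_args=default_args)
SubDagOperator(task_id='child',
               dag=parent,
               subdag=build_subdag('parent', 'child', default_args))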
def subdag_task(database):
    sub_dag = SubDagOperator(
        subdag=database_sub_dag(parent_dag_name, database, '@once'),
        task_id=database,
        dag=main_dag,
        pool='Pool_max_parallel_500',
        executor=LocalExecutor())
    return sub_dag
    task_id='test_depends_on_past_2',
    depends_on_past=True,
    dag=dag6,
)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(task_id='test_subdag_dummy_2', dag=subdag7)
dag7_subdag1 = SubDagOperator(task_id='subdag', dag=dag7, subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
dag8 = DAG(
    dag_id='test_scheduled_queued_tasks',
    start_date=DEFAULT_DATE,
    end_date=DEFAULT_DATE,
    default_args=default_args)
dag8_task1 = PythonOperator(
    python_callable=fail,
    task_id='test_queued_task',
    dag=dag8,
    pool='test_queued_pool')
def __new__(
    cls,
    parent_id,
    gcs_dirs_xcom,
    dst_dir,
    filename,
    schema_fields,
    table_name,
    task_id,
    dag,
):
    from airflow.utils.dates import days_ago

    args = {
        "start_date": days_ago(2),
    }

    bucket = get_bucket().replace("gs://", "", 1)
    full_table_name = format_table_name(table_name, is_staging=True)

    subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

    column_names = [schema["name"] for schema in schema_fields]

    # by convention, preface task names with dag_id
    op_col_select = PythonTaskflowOperator(
        task_id="select_cols",
        python_callable=_keep_columns,
        # note that this input should have form schedule/{execution_date}/...
        taskflow={
            "gcs_dirs": {
                "dag_id": parent_id,
                "task_ids": gcs_dirs_xcom
            }
        },
        op_kwargs={
            "dst_dir": dst_dir,
            "filename": filename,
            "required_cols": [],
            "optional_cols": column_names,
        },
        dag=subdag,
    )

    op_stage_bq = GoogleCloudStorageToBigQueryOperator(
        task_id="stage_bigquery",
        bucket=bucket,
        # note that we can't really pull a list out of xcom without subclassing
        # operators, so we rely on knowing that the task passing in
        # gcs_dirs_xcom data is using schedule/{execution_date}
        source_objects=[
            "schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)
        ],
        schema_fields=schema_fields,
        destination_project_dataset_table=full_table_name,
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        # _keep_columns function includes headers in output
        skip_leading_rows=1,
        dag=subdag,
    )

    op_col_select >> op_stage_bq

    return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
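# Usage sketch for the __new__-based builder above. The enclosing class name is not
# shown in this excerpt, so 'StageToBigQuerySubDag' and every argument value below
# are hypothetical. Because __new__ returns a SubDagOperator, instantiating the
# class yields a ready-to-wire task in the parent DAG:
stage_routes = StageToBigQuerySubDag(
    parent_id=dag.dag_id,
    gcs_dirs_xcom="list_gcs_dirs",   # task_id whose XCom names the GCS dirs (assumed)
    dst_dir="processed",
    filename="routes.csv",
    schema_fields=[{"name": "route_id", "type": "STRING", "mode": "NULLABLE"}],
    table_name="routes",
    task_id="stage_routes",
    dag=dag,
)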
    start_task >> dt_s3
    dt_s3 >> dt_sf
    dt_sf >> end

    return one_dag


#############################################################################
# Defining Main Dag structure
#############################################################################
main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once',
    # schedule_interval=timedelta(minutes=5),
    # max_active_runs=1,
    concurrency=35)

database_list = ['database']

# Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(
        subdag=database_sub_dag(parent_dag_name, i, '@once'),
        task_id=i,
        dag=main_dag,
        pool='Pool_max_parallel_5',
        executor=LocalExecutor())
    bucket='s3://udacity-dend/song_data',
    table='staging_songs',
    queries=SqlQueries,
    json_format='auto',
    timestamped=False,
    provide_context=True)

load_songplays_table = LoadFactOperator(
    task_id='Load_songplays_fact_table',
    dag=dag,
    redshift_conn_id='redshift',
    queries=SqlQueries)

load_dimension_tables = SubDagOperator(
    task_id='Load_dimension_tables',
    subdag=load_dimension_table_subdag('final_dag', 'Load_dimension_tables',
                                       default_args,
                                       ['users', 'songs', 'artists', 'time']),
    default_args=default_args,
    dag=dag,
)

run_quality_checks = SubDagOperator(
    task_id='Run_data_quality_checks',
    subdag=run_data_quality_subdag(
        'final_dag', 'Run_data_quality_checks', default_args,
        [{
            'table': 'songplays',
            'tests': [{
                'check': 'SQL_COUNT',
                'operator': '>',
                'result': 0
            }]
#############################################################################
# Defining Main Dag structure
#############################################################################
main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once'
    # schedule_interval=timedelta(minutes=5),
    # max_active_runs=1
)

database_list = get_database_list(database_include_patterns)

# Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(
        subdag=database_sub_dag(parent_dag_name, i, '@once'),
        task_id=i,
        dag=main_dag,
    )
rescheduling loop.
"""
from datetime import datetime

from airflow.models import DAG
from airflow.operators import SubDagOperator
from airflow.example_dags.subdags.subdag import subdag

args = {
    'owner': 'airflow',
    'start_date': datetime(2016, 1, 1),
}

dag = DAG(
    dag_id='test_raise_executor_error',
    default_args=args,
    schedule_interval="@daily",
)

section_1 = SubDagOperator(
    task_id='subdag_op',
    subdag=subdag('test_raise_executor_error', 'subdag_op', args),
    default_args=args,
    dag=dag,
)

# change the subdag name -- this creates an error because the subdag
# won't be found, but it'll do it in a way that causes the executor to report
# success
section_1.subdag.dag_id = 'bad_id'
def make_task_group(dagname, name, path, pdag, trigger_rule):
    args = {
        'owner': 'deploy',
        'start_date': datetime.strptime(
            (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d'),
            '%Y-%m-%d')
    }
    dag = DAG(dag_id=dagname,
              default_args=args,
              schedule_interval=timedelta(hours=24))
    sdOp = SubDagOperator(task_id=name,
                          subdag=dag,
                          dag=pdag,
                          trigger_rule=trigger_rule)
    start = DummyOperator(task_id=name + '_start', dag=dag)
    end = DummyOperator(task_id=name + '_end', dag=dag)
    task_dict = {}
    task_dict[name + '_start'] = start
    task_dict[name + '_end'] = end

    config_str = open(path, 'r').read()
    # 'a -> b' means data flows from a to b; not valid ini syntax, but easier to read
    config_arr = config_str.split('[dependency]')
    config_fp = StringIO.StringIO(config_arr[0])
    config = ConfigParser.RawConfigParser()
    config.readfp(config_fp)
    sections = config.sections()
    # print sections
    for section in sections:
        # print section
        if section != 'dependency':
            options = config.options(section)
            # print options
            if 'type' not in options or 'cmd' not in options:
                continue
            operator_type = config.get(section, 'type').strip()
            operator_cmd = config.get(section, 'cmd').strip()
            if operator_type == 'PostgresOperator':
                task = PostgresOperator(task_id=section.strip(),
                                        depends_on_past=False,
                                        postgres_conn_id='postgres_sha2dw03',
                                        sql=operator_cmd,
                                        dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            elif operator_type == 'BashOperator':
                task = BashOperator(task_id=section.strip(),
                                    depends_on_past=False,
                                    bash_command=operator_cmd,
                                    dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            else:
                print "Error: operator type %s is not currently supported" % operator_type

    if len(config_arr) == 1:
        return (start, end, dag, sdOp, task_dict)

    for line in config_arr[1].split('\n'):
        arr = line.split('->')
        if len(arr) != 2:
            continue
        left_side = arr[0].strip()
        right_side = arr[1].strip()
        if left_side in task_dict.keys() and right_side in task_dict.keys():
            if end in task_dict[left_side].downstream_list:
                task_dict[left_side].downstream_list.remove(end)
            task_dict[left_side].set_downstream(task_dict[right_side])
            if start in task_dict[right_side].upstream_list:
                task_dict[right_side].upstream_list.remove(start)
            if task_dict[right_side] in start.downstream_list:
                start.downstream_list.remove(task_dict[right_side])
            if task_dict[left_side] in end.upstream_list:
                end.upstream_list.remove(task_dict[left_side])

    return (start, end, dag, sdOp, task_dict)
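# A sketch of the ini-style file make_task_group() parses (file contents and section
# names below are hypothetical). Each section declares a task via 'type' and 'cmd';
# the text after the '[dependency]' marker lists 'a -> b' edges that replace the
# default start -> task -> end wiring:
EXAMPLE_TASK_GROUP_CONFIG = """
[load_users]
type = PostgresOperator
cmd = INSERT INTO dw.users SELECT * FROM staging.users;

[notify]
type = BashOperator
cmd = echo "load finished"

[dependency]
load_users -> notify
"""

# Assumed invocation, given a parent DAG object:
# start, end, dag, sd_op, task_dict = make_task_group(
#     'example_group', 'example_group', '/path/to/example_group.cfg',
#     parent_dag, trigger_rule='all_success')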
        dbt_task = BashOperator(
            task_id=model_name,
            bash_command='cd ~/gospel && dbt run --profile=warehouse --target=prod --non-destructive --models {simple_model_name}'
            .format(simple_model_name=simple_model_name),
            dag=temp_dag)
        return dbt_task

    dbt_tasks = {}
    for node_name in set(G.nodes()):
        dbt_task = make_dbt_task(node_name)
        dbt_tasks[node_name] = dbt_task

    for edge in G.edges():
        dbt_tasks[edge[0]].set_downstream(dbt_tasks[edge[1]])

    return temp_dag


dbt_sub_dag = SubDagOperator(
    subdag=dbt_dag(dag.start_date, dag.schedule_interval, default_args=default_args),
    task_id='dbt_sub_dag',
    dag=dag,
    trigger_rule='all_done')
dbt_sub_dag.set_upstream(copy_gpickle_file)

dbt_test = BashOperator(
    task_id='dbt_test',
    bash_command='cd ~/project && dbt test --profile=warehouse --target=prod',
    dag=dag)
dbt_test.set_upstream(dbt_sub_dag)
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)
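# The subdag() factory called above is not shown in this excerpt. A minimal sketch,
# modeled on Airflow's bundled example_subdag_operator (the five DummyOperator tasks
# are an assumption about its contents, not taken from the source):
def subdag(parent_dag_name, child_dag_name, args):
    dag_subdag = DAG(
        dag_id='{}.{}'.format(parent_dag_name, child_dag_name),
        default_args=args,
        schedule_interval='@daily',
    )
    for i in range(5):
        DummyOperator(
            task_id='{}-task-{}'.format(child_dag_name, i + 1),
            default_args=args,
            dag=dag_subdag,
        )
    return dag_subdag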
    schedule_interval='@once')

# define "tripDataSubdag" variable
tripDataSubdag = 'tripDataSubdag'

# NOTICE a SubDag is actually a Task, an instance of "SubDagOperator"
tripsSubdagTask = SubDagOperator(
    task_id=tripDataSubdag
    #--------------------------------------------------------------------------
    # ATTENTION!
    # - "subdag" is a mandatory "SubDagOperator" parameter
    # - "subdag" receives the DAG returned by a Python factory function
    #--------------------------------------------------------------------------
    , subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id=parentDagId,
        parent_task_id=tripDataSubdag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table='trips',
        create_table_statement=sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket='udacity-dend',
        s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
        start_date=startDate),
    dag=mainDag)

# define "stationDataSubdag" variable
stationDataSubdag = 'stationDataSubdag'

# Remember the "Subdag/Task Parity"
stationDataSubdagTask = SubDagOperator(
True, "approved": True, "inProg": False, "done": False, "approvedBy": "karakuri", "workflow": workflow_id }) print("TASKS: ", tasks) dag = DAG('sfsc_review_new_airflow_process_tasks', default_args=default_args, schedule_interval=None) start = DummyOperator(task_id='start', default_args=default_args, dag=dag) process = SubDagOperator( task_id='process', subdag=subdag_tasks('sfsc_review_new_airflow_process_tasks', 'process', tasks, default_args), default_args=default_args, dag=dag, ) start >> process
"export_data_hql": export_table2_to_gcp_hql, "add_dataproc_partition_hql": add_dataproc_table2_partition_hql, "drop_tmp_table_hql": drop_tmp_table2_hql } } } export_table_params = { 'gcp_warehouse_path': persist_cfg['gcp_warehouse']['casesci_prd'], 'gcp_keyfile': persist_cfg['gcp_keyfile']['casesci_prd'] } export_table_dataproc_config = { "dataproc_cluster": persist_cfg["gcp_prod_project"]["dataproc_cluster"], "region": persist_cfg["gcp_prod_project"]["region"], "gcp_conn_id": persist_cfg["gcp_prod_project"]["gcp_conn_id"] } for op_name in export_table_dict: export_table_dict[op_name]["operator"] = SubDagOperator( subdag=export_to_gcp_dag("crmdata_bids_bounds." + op_name, sync_data2GCP_dag.schedule_interval, persist_cfg['queue'], sync_data2GCP_dag.default_args, export_table_dict[op_name]["hql_dict"], export_table_params, export_table_dataproc_config), task_id=op_name, queue=persist_cfg['queue'], dag=sync_data2GCP_dag)
gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)

email_links = EmailOperator(
    task_id='email_best_links',
    to='*****@*****.**',
    subject='Latest popular links',
    html_content='Check out the latest!!',
    files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
    dag=dag)

sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)

clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,