def apply_task_to_dag(self):
    check_cloudformation_stack_exists_task = BranchPythonOperator(
        templates_dict={'stack_name': self.stack_name},
        task_id=f'is-cloudformation-{self.task_id}-running',
        python_callable=self.__cloudformation_stack_running_branch,
        provide_context=True,
        dag=self.dag)

    create_cloudformation_stack_task = CloudFormationCreateStackOperator(
        task_id=f'create-cloudformation-{self.task_id}',
        params={**self.__reformatted_params()},
        dag=self.dag)

    create_stack_sensor_task = CloudFormationCreateStackSensor(
        task_id=f'cloudformation-watch-{self.task_id}-create',
        stack_name=self.stack_name,
        dag=self.dag)

    stack_creation_end_task = DummyOperator(
        task_id=f'creation-end-{self.task_id}',
        dag=self.dag,
        trigger_rule='all_done')

    if self.parent:
        self.parent.set_downstream(check_cloudformation_stack_exists_task)

    create_stack_sensor_task.set_downstream(stack_creation_end_task)
    create_cloudformation_stack_task.set_downstream(create_stack_sensor_task)
    check_cloudformation_stack_exists_task.set_downstream(create_cloudformation_stack_task)
    check_cloudformation_stack_exists_task.set_downstream(stack_creation_end_task)

    return stack_creation_end_task
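The branch callable referenced above, __cloudformation_stack_running_branch, is not shown in this excerpt. A minimal sketch of such a method, assuming a boto3 CloudFormation client and the same downstream task-id naming convention used in the snippet:

import boto3
from botocore.exceptions import ClientError

def __cloudformation_stack_running_branch(self, **context):
    # Hypothetical sketch: route to stack creation only when the stack is absent.
    stack_name = context['templates_dict']['stack_name']
    client = boto3.client('cloudformation')
    try:
        client.describe_stacks(StackName=stack_name)
        # The stack already exists, so skip creation and jump to the end task.
        return f'creation-end-{self.task_id}'
    except ClientError:
        # describe_stacks raises ClientError for a non-existent stack.
        return f'create-cloudformation-{self.task_id}'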
def apply_task_to_dag(self):
    check_dags_queued_task = BranchPythonOperator(
        task_id=f'{self.task_id}-is-dag-queue-empty',
        python_callable=self.__queued_dag_runs_exists,
        provide_context=True,
        trigger_rule=TriggerRule.ALL_DONE,
        dag=self.dag)

    delete_stack_task = CloudFormationDeleteStackOperator(
        task_id=f'delete-cloudformation-{self.task_id}',
        params={'StackName': self.stack_name},
        dag=self.dag)

    delete_stack_sensor = CloudFormationDeleteStackSensor(
        task_id=f'cloudformation-watch-{self.task_id}-delete',
        stack_name=self.stack_name,
        dag=self.dag)

    stack_delete_end_task = DummyOperator(
        task_id=f'delete-end-{self.task_id}',
        dag=self.dag)

    if self.parent:
        self.parent.set_downstream(check_dags_queued_task)

    check_dags_queued_task.set_downstream(stack_delete_end_task)
    check_dags_queued_task.set_downstream(delete_stack_task)
    delete_stack_task.set_downstream(delete_stack_sensor)
    delete_stack_sensor.set_downstream(stack_delete_end_task)

    return stack_delete_end_task
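The __queued_dag_runs_exists callable is not shown either. A minimal sketch, assuming the intent is to tear the stack down only when no other active runs of the same DAG still need it (DagRun.find is used here; the exact condition is an assumption):

from airflow.models import DagRun
from airflow.utils.state import State

def __queued_dag_runs_exists(self, **context):
    # Hypothetical sketch: keep the stack while other runs of this DAG are active.
    active_runs = DagRun.find(dag_id=self.dag.dag_id, state=State.RUNNING)
    if len(active_runs) > 1:
        # Another run still needs the stack; skip straight to the end task.
        return f'delete-end-{self.task_id}'
    return f'delete-cloudformation-{self.task_id}'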
    PROJECT_PATH))

# Ref: https://airflow.apache.org/macros.html for the jinja variables used below.
EXP_CRAWL_COMMAND = Template("""
cd $project_path && scrapy crawl treasury_expenditures -a start={{ ds_nodash }} -a end={{ ds_nodash }}
""")
EXP_CRAWL_TASK = BashOperator(
    task_id='crawl_expenditure',
    bash_command=EXP_CRAWL_COMMAND.substitute(project_path=PROJECT_PATH),
    trigger_rule='none_failed')

REC_CRAWL_COMMAND = Template("""
cd $project_path && scrapy crawl treasury_receipts -a start={{ ds_nodash }} -a end={{ ds_nodash }}
""")
REC_CRAWL_TASK = BashOperator(
    task_id='crawl_receipts',
    bash_command=REC_CRAWL_COMMAND.substitute(project_path=PROJECT_PATH),
    trigger_rule='none_failed')

BRANCH_OP = BranchPythonOperator(task_id='branch_task',
                                 provide_context=True,
                                 python_callable=branch_tasks,
                                 dag=dag)

CREATE_DIR.set_downstream(BRANCH_OP)
BRANCH_OP.set_downstream([CRAWL_DDO_CODES, EXP_CRAWL_TASK, REC_CRAWL_TASK])
CRAWL_DDO_CODES.set_downstream(EXP_CRAWL_TASK)
CRAWL_DDO_CODES.set_downstream(REC_CRAWL_TASK)
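The branch_tasks callable is not part of this excerpt. A minimal sketch, assuming the branch decides whether the DDO-code crawl needs to refresh before the expenditure/receipt crawls; the 'crawl_ddo_codes' task id and the first-of-month condition are assumptions, and returning a list of task ids requires Airflow 1.10.3 or later:

def branch_tasks(**context):
    # Hypothetical sketch: refresh DDO codes on the first run of the month,
    # otherwise go straight to the expenditure and receipt crawls.
    if context['execution_date'].day == 1:
        return ['crawl_ddo_codes', 'crawl_expenditure', 'crawl_receipts']
    return ['crawl_expenditure', 'crawl_receipts']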
# launch sagemaker hyperparameter tuning job and wait until it completes
tune_model_task = SageMakerTuningOperator(task_id='model_tuning',
                                          dag=dag,
                                          config=tuner_config,
                                          aws_conn_id='airflow-sagemaker',
                                          wait_for_completion=True,
                                          check_interval=30)

# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
    task_id='predicting',
    dag=dag,
    config=transform_config,
    aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30,
    trigger_rule=TriggerRule.ONE_SUCCESS)

cleanup_task = DummyOperator(task_id='cleaning_up', dag=dag)

# set the dependencies between tasks
init.set_downstream(preprocess_task)
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(branching)
branching.set_downstream(tune_model_task)
branching.set_downstream(train_model_task)
tune_model_task.set_downstream(batch_transform_task)
train_model_task.set_downstream(batch_transform_task)
batch_transform_task.set_downstream(cleanup_task)
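The branching operator that fans out to tuning or training is not shown. A minimal sketch, assuming the choice is driven by an Airflow Variable (the 'use_hpo' name and the 'model_training' task id are assumptions):

from airflow.models import Variable
from airflow.operators.python_operator import BranchPythonOperator

def choose_training_path():
    # Hypothetical sketch: run hyperparameter tuning when the 'use_hpo' Variable
    # is set to true, otherwise run the plain training job.
    if Variable.get('use_hpo', default_var='false').lower() == 'true':
        return 'model_tuning'
    return 'model_training'

branching = BranchPythonOperator(task_id='branching',
                                 python_callable=choose_training_path,
                                 dag=dag)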
    if version_path in [v['name'] for v in model_versions]:
        logging.info("Deleting the previous version of the model so it can be overwritten.")
        mle.delete_version(PROJECT, model_name, version_params['name'])
    mle.create_version(PROJECT, model_name, version_params)

# Checks if model exists using Hook instead of GCP operators due to conditional.
t10 = PythonOperator(
    task_id='list_versions', dag=dag, python_callable=do_list_versions)

# Creates model if it does not exist using Hook instead of GCP operators
t11 = PythonOperator(
    task_id='create_version', dag=dag, python_callable=do_create_version)

# Create task graph
t1.set_downstream(t2)
t2.set_downstream(t3)
t3.set_downstream(t4_train_cond)
t4_train_cond.set_downstream(t4_ml_engine)
t4_ml_engine.set_downstream([t4a, t4b, t4c])
t4_ml_engine.set_downstream(t5d)
t4a.set_downstream(t5a)
t4b.set_downstream(t5b)
t4c.set_downstream(t5c)
t6.set_upstream([t5a, t5b, t5c, t5d])
t6.set_downstream(t7)
t7.set_downstream(t8)
t9.set_upstream(t8)
t9.set_downstream(t10)
t10.set_downstream(t11)
from datetime import datetime
import calendar

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

default_args = {
    'owner': 'ankityadav',
    'depends_on_past': False,
    'retries': 1,
    'start_date': datetime(2019, 1, 14)
}

dag = DAG('batch_operator', default_args=default_args, schedule_interval='@once')

months = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def branch():
    # Return the task id matching the current month, e.g. 'task_for_January'.
    for m in months:
        if m == calendar.month_name[datetime.now().month]:
            return 'task_for_' + m

hello = BranchPythonOperator(
    task_id='branching',
    python_callable=branch,
    provide_context=False,
    dag=dag
)

for month in months:
    hello.set_downstream(DummyOperator(task_id='task_for_' + month, dag=dag))
def _print_high():
    return 'HIGH'

default_args = {
    'owner': 'pedro',
    'retries': 0,
    'start_date': datetime(2020, 4, 10)
}

with DAG('random_number_extended',
         default_args=default_args,
         schedule_interval='0 4 * * *') as dag:

    dummy_start_task = DummyOperator(task_id='dummy_start')
    generate_random_number = BashOperator(task_id='generate_random_number',
                                          bash_command=bash_cmd)

    # New branch operator
    read_num_and_square = BranchPythonOperator(
        task_id='read_number_and_square_it',
        python_callable=_read_number_and_square,
        op_args=[STORE_DIR],
        provide_context=True,  # pass task instance context to the python callable
    )

    print_high = PythonOperator(task_id='print_high', python_callable=_print_high)
    print_low = BashOperator(task_id='print_low', bash_command='echo LOW')

    # Define tasks (normal path and then each branch)
    dummy_start_task >> generate_random_number >> read_num_and_square >> print_high
    read_num_and_square.set_downstream(print_low)
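The _read_number_and_square callable is not shown in this excerpt. A minimal sketch, assuming the bash task wrote the random number to a file under STORE_DIR (the file name is an assumption):

import os

def _read_number_and_square(store_dir, **context):
    # Hypothetical sketch: read the number produced by the bash task,
    # push its square to XCom, and branch on a threshold.
    with open(os.path.join(store_dir, 'number.txt')) as f:  # file name is an assumption
        number = int(f.read().strip())
    context['ti'].xcom_push(key='squared', value=number ** 2)
    return 'print_high' if number > 50 else 'print_low'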
serve_branch = BranchPythonOperator(task_id="serve_or_not",
                                    python_callable=model_exist,
                                    dag=dag)

t3 = KubernetesPodOperator(
    namespace="default",
    name="{}-restapi".format(model_name.lower()),
    image="tensorflow/serving:latest",
    env_vars={
        'MODEL_NAME': model_name,
        'MODEL_BASE_PATH': '/root/runtime/models'
    },
    task_id="serve_model",
    port=8501,
    dag=dag,
    async=True,  # note: 'async' is a reserved keyword from Python 3.7 onward
    in_cluster=True,
    labels={'name': '{}-restapi'.format(model_name.lower())},
    volume_mounts=[volume_mount],
    volumes=[volume])

t4 = DummyOperator(task_id="update_version_or_not_serve", dag=dag)

t0.set_downstream(cleanup_branch)
cleanup_branch.set_downstream(t1)
cleanup_branch.set_downstream(t2)
t2.set_downstream(serve_branch)
serve_branch.set_downstream(t3)
serve_branch.set_downstream(t4)
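The model_exist branch callable is not shown. A minimal sketch, assuming it checks for a saved model directory under the same base path that the serving pod mounts (this requires the volume to also be visible to the Airflow worker, and relies on the surrounding model_name variable):

import os

def model_exist():
    # Hypothetical sketch: serve the model only when a saved version is present
    # under the model base path; otherwise fall through to the dummy task.
    model_base_path = '/root/runtime/models'
    if os.path.isdir(os.path.join(model_base_path, model_name)):
        return 'serve_model'
    return 'update_version_or_not_serve'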
            'type': 'STRING'
        }, {
            'name': 'predicted_monetary',
            'type': 'FLOAT'
        }, {
            'name': 'predictions',
            'type': 'FLOAT'
        }],
        source_format="NEWLINE_DELIMITED_JSON",
        skip_leading_rows=1,
        destination_project_dataset_table="{}.{}.{}".format(
            PROJECT, dataset, 'predictions'),
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        dag=dag).execute(kwargs)

t3 = PythonOperator(task_id='list_predictions_files',
                    dag=dag,
                    python_callable=do_list_predictions_files)

t4 = PythonOperator(task_id='load_to_bq', dag=dag, python_callable=do_load_to_bq)

# How to link them
t0_predict_cond.set_downstream([t1a, t1b])
t2.set_upstream([t1a, t1b])
t3.set_upstream([t1a, t1b])
t3.set_downstream(t4)
f"{BASE_PACKAGE}.transactional-tables", "OutletsByDate", dag, RETAIL_ID, schema_name, ENV_TYPE, ) items_by_date_task = bash_operator_for_spark_submit( f"{BASE_PACKAGE}.transactional-tables", "ItemsByDate", dag, RETAIL_ID, schema_name, ENV_TYPE, ) push_instruments.set_downstream(push_server_details) branch_task.set_upstream(push_server_details) branch_task.set_downstream(master_tables_load) branch_task.set_downstream(history_load_done) master_tables_load.set_downstream(create_table_structure) history_load_done.set_downstream(create_table_structure) create_table_structure.set_downstream(unix_chmod_task) unix_chmod_task.set_downstream(market_baskets_task) market_baskets_task.set_downstream( [transaction_line_item_task, outlets_by_date_task, items_by_date_task]) data_load_done.set_upstream( [transaction_line_item_task, outlets_by_date_task, items_by_date_task]) create_constraint_task.set_upstream(data_load_done)
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)

    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)

            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)

            parse_query_task = PythonOperator(
                task_id=f'parse_query_{table}',
                python_callable=parse_query,
                op_args=[table],
                provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)

            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}',
                env=env,
                table=table)

            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)

        start_task >> finish_task

    return dag
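The should_continue callable passed to the BranchPythonOperator is not shown. A minimal sketch, assuming the checkpoint operator pushes a row count (or other truthy marker) to XCom and that skipping the per-table chain means falling through to the 'finish' task; both details are assumptions:

def should_continue(table, **context):
    # Hypothetical sketch: continue into the per-table pipeline only when the
    # checkpoint task reported data, otherwise route to the finish task.
    has_data = context['ti'].xcom_pull(task_ids=f'get_checkpoint_{table}')
    return f'parse_query_{table}' if has_data else 'finish'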
    provide_context=True,
    dag=dag
)
create_source_id.set_upstream(source_data_sensor)

clean_data = HiveOperator(
    task_id='clean_data',
    hql=hql.HQL_CLEAN_DATA.format(
        source_id="{{ task_instance.xcom_pull(task_ids='create_source_id') }}",
        clean_mydata='clean_mydata',
        mydata='mydata'),
    schema='my_hive_db',
    provide_context=True,
    dag=dag
)
clean_data.set_upstream(create_source_id)

count_data_rows.set_downstream([stop_flow, clean_data])

move_data_mysql = PythonOperator(
    task_id='move_data_mysql',
    python_callable=tasks.move_data_mssql,
    templates_dict={'schema': 'my_hive_db'},
    provide_context=True,
    dag=dag
)
move_data_mysql.set_upstream(clean_data)

send_email = EmailOperator(
    task_id='send_email',
    to='*****@*****.**',
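The count_data_rows task that branches to either stop_flow or clean_data is referenced above but not defined in this excerpt. A minimal sketch of what such a branch operator could look like; the XCom source task id and the zero-row condition are assumptions:

def count_rows_branch(**context):
    # Hypothetical sketch: stop the flow when the staged table is empty,
    # otherwise continue to the cleaning step.
    row_count = context['ti'].xcom_pull(task_ids='count_rows_query')  # upstream id is an assumption
    return 'clean_data' if row_count and int(row_count) > 0 else 'stop_flow'

count_data_rows = BranchPythonOperator(
    task_id='count_data_rows',
    python_callable=count_rows_branch,
    provide_context=True,
    dag=dag
)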
    mle.create_version(PROJECT, model_name, version_params)

# Checks if model exists using Hook instead of GCP operators due to conditional.
t10 = PythonOperator(task_id='list_versions',
                     dag=dag,
                     python_callable=do_list_versions)

# Creates model if it does not exist using Hook instead of GCP operators
t11 = PythonOperator(task_id='create_version',
                     dag=dag,
                     python_callable=do_create_version)

# Create task graph
t1.set_downstream(t2)
t2.set_downstream(t3)
t3.set_downstream(t4_train_cond)
t4_train_cond.set_downstream([t4_ml_engine, t4_automl])
t4_ml_engine.set_downstream([t4a, t4b, t4c])
t4_ml_engine.set_downstream(t5d)
t4a.set_downstream(t5a)
t4b.set_downstream(t5b)
t4c.set_downstream(t5c)
t6.set_upstream([t5a, t5b, t5c, t5d])
t6.set_downstream(t7)
t7.set_downstream(t8)
t9.set_upstream(t8)
t9.set_downstream(t10)
t10.set_downstream(t11)
        return True
    else:
        return False

validate_data = BranchPythonOperator(
    task_id='Validate_Data',
    python_callable=lambda: "export_investor_report_s3"
    if is_data_valid() else "notify_validation_failure",
    dag=dag)

export_report = PostgresUnloaderOperator(
    task_id="export_investor_report_s3",
    dag=dag,
    postgres_conn_id=EDW.get_conn_id(),
    source="select * from loanreview.loan_in_review WHERE cast(update_date as DATE) = current_date - 1",
    uri=EDW.get_s3_stage_uri(path="lir.csv"),
    execution_timeout=timedelta(minutes=30))

notify_slack_validation_fail = SlackAPIPostOperator(
    message="Validation failure. Cannot export {0} to s3".format("investor report"),
    task_id="notify_validation_failure",
    dag=dag,
    execution_timeout=timedelta(minutes=30))

start.set_downstream(refresh_table)
refresh_table.set_downstream(validate_data)
validate_data.set_downstream([export_report, notify_slack_validation_fail])
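Only the tail of is_data_valid appears in this excerpt. A minimal sketch of what the full helper could look like, assuming it verifies that yesterday's slice of loan_in_review contains rows via a Postgres hook and the same EDW connection used above; the query and hook usage are assumptions:

from airflow.hooks.postgres_hook import PostgresHook

def is_data_valid():
    # Hypothetical sketch: the report is only worth exporting if yesterday's
    # partition of loan_in_review actually contains rows.
    hook = PostgresHook(postgres_conn_id=EDW.get_conn_id())
    row = hook.get_first(
        "select count(*) from loanreview.loan_in_review "
        "where cast(update_date as DATE) = current_date - 1")
    if row and row[0] > 0:
        return True
    else:
        return False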
    python_callable=get_endpoint,
    op_args=[e, SAVE_PATH, BASE_URL, API_KEYS],
)

t_branch = BranchPythonOperator(task_id=branch_task_id,
                                python_callable=row_count_branch,
                                op_args=[
                                    get_enpdpoints_task_id,
                                    file_to_gcs_task_id,
                                    zero_branch_task_id
                                ],
                                trigger_rule="all_done")

t_gcs = FileToGoogleCloudStorageOperator(
    task_id=file_to_gcs_task_id,
    google_cloud_storage_conn_id='gcs_silo',
    bucket="deanslist",
    src="{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_path') }}",
    # dst = "TEST/" + endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}",
    dst=endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}",
    dag=dag)

t_zero_row = DummyOperator(task_id=zero_branch_task_id)

t2.set_upstream(t1)
t2.set_downstream(t_branch)
t_branch.set_downstream(t_gcs)
t_branch.set_downstream(t_zero_row)
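The row_count_branch callable is not shown. A minimal sketch, assuming the endpoint task pushes a row count to XCom under a 'row_count' key and that the callable can reach the task instance via the context (which, in Airflow 1.10, would require provide_context=True on the operator); both details are assumptions:

def row_count_branch(endpoint_task_id, gcs_task_id, zero_task_id, **context):
    # Hypothetical sketch: upload to GCS only when the endpoint returned rows,
    # otherwise route to the zero-row dummy task.
    row_count = context['ti'].xcom_pull(task_ids=endpoint_task_id, key='row_count')  # key is an assumption
    if row_count and int(row_count) > 0:
        return gcs_task_id
    return zero_task_id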
# returns the week day (monday, tuesday, etc.)
def get_day(**kwargs):
    print(kwargs['ti'])
    kwargs['ti'].xcom_push(key='day', value=datetime.now().weekday())

# returns the task id to launch next (task_for_monday, task_for_tuesday, etc.)
def branch(**kwargs):
    print(kwargs)
    return 'task_for_' + tabDays[kwargs['ti'].xcom_pull(task_ids='weekday', key='day')]

# PythonOperator will compute the week day and push it to the "weekday" task's XCom
get_weekday = PythonOperator(task_id='weekday',
                             python_callable=get_day,
                             provide_context=True,
                             dag=dag)

# BranchPythonOperator will use the pushed week day to decide which task to launch next
fork = BranchPythonOperator(task_id='branching',
                            python_callable=branch,
                            provide_context=True,
                            dag=dag)

# task 1, get the week day
get_weekday.set_downstream(fork)

# One dummy operator for each week day (weekday() returns 0-6), all branched from the fork
for day in range(7):
    fork.set_downstream(
        DummyOperator(task_id='task_for_' + tabDays[day], dag=dag))
    task_id='ingesting_row_data_HIVE',
    default_args=default_args,
    dag=dag,
)

# SparkSubmitOperator(
ingesting_graph_HBASE = DummyOperator(
    task_id='ingesting_graph_HBASE',
    default_args=default_args,
    dag=dag,
)

def decide_which_path():
    if True:
        return "branch_a"
    else:
        return "branch_b"

branch_checkForChanges = BranchPythonOperator(
    task_id='check_data_availablity',
    python_callable=decide_which_path,
    trigger_rule="all_done",
    dag=dag)

branch_checkForChanges.set_downstream(ingesting_row_data_HIVE)
branch_checkForChanges.set_downstream(sendNotification)
ingesting_row_data_HIVE.set_downstream(ingesting_graph_HBASE)
ingesting_graph_HBASE.set_downstream(sendNotification)
branch_b = PythonOperator(
    task_id='branch_b',
    python_callable=print_branchb,
    dag=dag)  # specify the DAG this task belongs to

def print_branchc():
    return 'Hello branchc!'

branch_c = PythonOperator(
    task_id='branch_c',
    python_callable=print_branchc,
    dag=dag)  # specify the DAG this task belongs to

#-------------------------------------------------------------------------------
def decide_which_path():
    if 1 > 1:
        return "branch_a"
    else:
        return "branch_b"

branch_task = BranchPythonOperator(
    task_id='run_this_first',
    python_callable=decide_which_path,
    trigger_rule="all_done",
    dag=dag)

#-------------------------------------------------------------------------------
# dependencies
branch_task.set_downstream(branch_a)  # the adapter, intermediate, and application layers all depend on branch_a
branch_task.set_downstream(branch_b)
branch_a.set_downstream(branch_c)
    python_callable=WriteToFile,
    op_kwargs={
        'file_name': 'empty_log.log',
        'chk': False
    },
    provide_context=True,
    email=['*****@*****.**'],
    email_on_failure=True,
    dag=dag)

collect_data = PythonOperator(
    task_id='collect_data',
    depends_on_past=False,
    python_callable=WriteToFile,
    op_kwargs={
        'file_name': 'data_log.log',
        'chk': True
    },
    email=['*****@*****.**'],
    provide_context=True,
    email_on_failure=True,
    dag=dag)

# Forking based on the condition
fork = BranchPythonOperator(task_id='branching',
                            python_callable=branching,
                            provide_context=True,
                            dag=dag)

check_data.set_downstream(fork)
fork.set_downstream(collect_data)
fork.set_downstream(empty_table)
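The branching callable used by the fork is not shown. A minimal sketch, assuming the upstream check_data task pushes a truthy flag to XCom and that the two branch targets are the 'collect_data' task above and an 'empty_table' task (id taken from the variable name):

def branching(**kwargs):
    # Hypothetical sketch: collect data when the upstream check found something,
    # otherwise record the empty result.
    has_data = kwargs['ti'].xcom_pull(task_ids='check_data')  # upstream task id is an assumption
    return 'collect_data' if has_data else 'empty_table'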