def test_trigger_dagrun_twice(self): """Test TriggerDagRunOperator with custom execution_date.""" utc_now = timezone.utcnow() task = TriggerDagRunOperator( task_id="test_trigger_dagrun_with_execution_date", trigger_dag_id=TRIGGERED_DAG_ID, execution_date=utc_now, dag=self.dag, poke_interval=1, reset_dag_run=True, wait_for_completion=True, ) run_id = f"manual__{utc_now.isoformat()}" with create_session() as session: dag_run = DagRun( dag_id=TRIGGERED_DAG_ID, execution_date=utc_now, state=State.SUCCESS, run_type="manual", run_id=run_id, ) session.add(dag_run) session.commit() task.execute(None) dagruns = session.query(DagRun).filter( DagRun.dag_id == TRIGGERED_DAG_ID).all() self.assertEqual(len(dagruns), 1) self.assertTrue(dagruns[0].external_trigger) self.assertEqual(dagruns[0].execution_date, utc_now)
def test_trigger_dagrun(self): """Test TriggerDagRunOperator.""" task = TriggerDagRunOperator(task_id="test_task", trigger_dag_id=TRIGGERED_DAG_ID, dag=self.dag) task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) with create_session() as session: dagruns = session.query(DagRun).filter( DagRun.dag_id == TRIGGERED_DAG_ID).all() self.assertEqual(len(dagruns), 1) self.assertTrue(dagruns[0].external_trigger)
def test_trigger_dagrun_with_wait_for_completion_true_fail(self): """Test TriggerDagRunOperator with wait_for_completion but triggered dag fails.""" execution_date = DEFAULT_DATE task = TriggerDagRunOperator( task_id="test_task", trigger_dag_id=TRIGGERED_DAG_ID, execution_date=execution_date, wait_for_completion=True, poke_interval=10, failed_states=[State.RUNNING], dag=self.dag, ) with self.assertRaises(AirflowException): task.run(start_date=execution_date, end_date=execution_date)
def test_trigger_dagrun_operator_templated_conf(self):
    """Test passing a templated conf to the triggered DagRun."""
    task = TriggerDagRunOperator(
        task_id="test_trigger_dagrun_with_str_execution_date",
        trigger_dag_id=TRIGGERED_DAG_ID,
        conf={"foo": "{{ dag.dag_id }}"},
        dag=self.dag,
    )
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True)

    with create_session() as session:
        dagruns = session.query(DagRun).filter(DagRun.dag_id == TRIGGERED_DAG_ID).all()
        self.assertEqual(len(dagruns), 1)
        self.assertEqual(dagruns[0].conf, {"foo": TEST_DAG_ID})
def create_retrigger_operator(dag, task_id=None):
    if not task_id:
        task_id = f'retrigger_{dag.dag_id}'
    return TriggerDagRunOperator(
        task_id=task_id,
        trigger_dag_id=dag.dag_id,
        dag=dag,
    )
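# A minimal usage sketch for create_retrigger_operator above. The DAG name
# `self_retriggering_dag` and the terminal task `last_task` are illustrative
# assumptions, not part of the original helper.
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago

my_dag = DAG(dag_id="self_retriggering_dag", start_date=days_ago(1), schedule_interval=None)

last_task = DummyOperator(task_id="last_task", dag=my_dag)

# Defaults to task_id 'retrigger_self_retriggering_dag' and re-triggers my_dag itself
retrigger = create_retrigger_operator(my_dag)

last_task >> retrigger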
def test_trigger_dagrun_with_templated_execution_date(self): """Test TriggerDagRunOperator with templated execution_date.""" task = TriggerDagRunOperator( task_id="test_trigger_dagrun_with_str_execution_date", trigger_dag_id=TRIGGERED_DAG_ID, execution_date="{{ execution_date }}", dag=self.dag, ) task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) with create_session() as session: dagruns = session.query(DagRun).filter( DagRun.dag_id == TRIGGERED_DAG_ID).all() self.assertEqual(len(dagruns), 1) self.assertTrue(dagruns[0].external_trigger) self.assertEqual(dagruns[0].execution_date, DEFAULT_DATE)
def test_trigger_dagrun_with_wait_for_completion_true(self): """Test TriggerDagRunOperator with wait_for_completion.""" execution_date = DEFAULT_DATE task = TriggerDagRunOperator( task_id="test_task", trigger_dag_id=TRIGGERED_DAG_ID, execution_date=execution_date, wait_for_completion=True, poke_interval=10, allowed_states=[State.RUNNING], dag=self.dag, ) task.run(start_date=execution_date, end_date=execution_date) with create_session() as session: dagruns = session.query(DagRun).filter( DagRun.dag_id == TRIGGERED_DAG_ID).all() self.assertEqual(len(dagruns), 1)
def test_trigger_dagrun_with_execution_date(self): """Test TriggerDagRunOperator with custom execution_date.""" utc_now = timezone.utcnow() task = TriggerDagRunOperator( task_id="test_trigger_dagrun_with_execution_date", trigger_dag_id=TRIGGERED_DAG_ID, execution_date=utc_now, dag=self.dag, ) task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE, ignore_ti_state=True) with create_session() as session: dagruns = session.query(DagRun).filter( DagRun.dag_id == TRIGGERED_DAG_ID).all() self.assertEqual(len(dagruns), 1) self.assertTrue(dagruns[0].external_trigger) self.assertEqual(dagruns[0].execution_date, utc_now)
def test_trigger_dagrun_operator_templated_invalid_conf(self): """Test passing a conf that is not JSON Serializable raise error.""" with pytest.raises( AirflowException, match="^conf parameter should be JSON Serializable$"): TriggerDagRunOperator( task_id="test_trigger_dagrun_with_invalid_conf", trigger_dag_id=TRIGGERED_DAG_ID, conf={ "foo": "{{ dag.dag_id }}", "datetime": timezone.utcnow() }, dag=self.dag, )
def test_trigger_dagrun_with_reset_dag_run_false(self): """Test TriggerDagRunOperator with reset_dag_run.""" execution_date = DEFAULT_DATE task = TriggerDagRunOperator( task_id="test_task", trigger_dag_id=TRIGGERED_DAG_ID, execution_date=execution_date, reset_dag_run=False, dag=self.dag, ) task.run(start_date=execution_date, end_date=execution_date, ignore_ti_state=True) with self.assertRaises(DagRunAlreadyExists): task.run(start_date=execution_date, end_date=execution_date, ignore_ti_state=True)
def test_trigger_dagrun_with_reset_dag_run_true(self): """Test TriggerDagRunOperator with reset_dag_run.""" execution_date = DEFAULT_DATE task = TriggerDagRunOperator( task_id="test_task", trigger_dag_id=TRIGGERED_DAG_ID, execution_date=execution_date, reset_dag_run=True, dag=self.dag, ) task.run(start_date=execution_date, end_date=execution_date, ignore_ti_state=True) task.run(start_date=execution_date, end_date=execution_date, ignore_ti_state=True) with create_session() as session: dagruns = session.query(DagRun).filter( DagRun.dag_id == TRIGGERED_DAG_ID).all() assert len(dagruns) == 1 assert dagruns[0].external_trigger
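# A hedged sketch distilling the behaviours exercised by the tests above into a
# plain DAG: templated execution_date, reset_dag_run and wait_for_completion with
# explicit allowed/failed states. The dag ids and schedule are illustrative only.
from datetime import datetime

from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.state import State

with DAG(
    dag_id="upstream_controller",
    start_date=datetime(2021, 1, 1),
    schedule_interval="@daily",
    catchup=False,
) as controller_dag:
    trigger = TriggerDagRunOperator(
        task_id="trigger_downstream",
        trigger_dag_id="downstream_dag",
        execution_date="{{ execution_date }}",  # reuse the upstream logical date
        reset_dag_run=True,                     # clear and re-run an existing DagRun instead of failing
        wait_for_completion=True,               # block until the triggered run reaches a terminal state
        poke_interval=30,
        allowed_states=[State.SUCCESS],
        failed_states=[State.FAILED],
    )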
# ================================================ EXAMPLE 1 ================================================= example_1_dag_1 = DAG( dag_id="figure_6_17_example_1_dag_1", start_date=airflow.utils.dates.days_ago(3), schedule_interval="0 0 * * *", ) example_1_dag_2 = DAG( dag_id="figure_6_17_example_1_dag_2", start_date=airflow.utils.dates.days_ago(3), schedule_interval=None, ) DummyOperator(task_id="etl", dag=example_1_dag_1) >> TriggerDagRunOperator( task_id="trigger_dag2", trigger_dag_id="figure_6_17_example_1_dag_2", dag=example_1_dag_1, ) PythonOperator(task_id="report", dag=example_1_dag_2, python_callable=lambda: print("hello")) # ================================================ EXAMPLE 2 ================================================= example_2_dag_1 = DAG( dag_id="figure_6_17_example_2_dag_1", start_date=airflow.utils.dates.days_ago(3), schedule_interval="0 0 * * *", ) example_2_dag_2 = DAG( dag_id="figure_6_17_example_2_dag_2",
from airflow.utils.dates import days_ago with DAG( dag_id="controller_dag_to_trigger_other_dags", default_args={"owner": "airflow"}, start_date=days_ago(1), schedule_interval="@once", ) as dag: start = DummyOperator( task_id='start' ) trigger_1 = TriggerDagRunOperator( task_id="dag_1", trigger_dag_id="dag-to-trigger", # Ensure this equals the dag_id of the DAG to trigger conf={"message": "Hello World"} ) trigger_2 = TriggerDagRunOperator( task_id="dag_2", trigger_dag_id="dag-to-trigger", # Ensure this equals the dag_id of the DAG to trigger conf={"message": "Hello World"} ) some_other_task = DummyOperator( task_id='some-other-task' ) end = DummyOperator( task_id='end' )
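# A hedged sketch of the receiving side of the controller DAG above: a DAG whose
# dag_id matches trigger_dag_id ("dag-to-trigger") and which reads the "message"
# key out of dag_run.conf. The callable and task names are illustrative only.
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago


def _print_message(dag_run=None, **_):
    # conf is populated by the TriggerDagRunOperator in the controller DAG
    print(dag_run.conf.get("message", "no message passed"))


with DAG(
    dag_id="dag-to-trigger",
    default_args={"owner": "airflow"},
    start_date=days_ago(1),
    schedule_interval=None,  # only runs when triggered by the controller DAG
) as triggered_dag:
    print_message = PythonOperator(task_id="print_message", python_callable=_print_message)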
# Fragment: tail of a helper (called below as get_spot_instance_fleet_id) that
# returns the id of the "TASK" instance fleet from a cluster description response.
for instance_fleet in response['InstanceFleets']:
    if instance_fleet["InstanceFleetType"] == "TASK":
        return instance_fleet["Id"]


def dagrun_trigger(context, dag_run_obj):
    cluster_id = context['ti'].xcom_pull('create_cluster', key='return_value')
    instance_fleet_id = get_spot_instance_fleet_id(cluster_id)
    _date = datetime.strptime(context['ds'], '%Y-%m-%d') + timedelta(days=1)
    date = _date.strftime('%Y-%m-%d')
    pdate = _date.strftime('%Y/%m/%d')
    logging.info('cluster_id: %s, date: %s, pdate: %s', cluster_id, date, pdate)
    dag_run_obj.payload = {
        'cluster_id': cluster_id,
        'instance_fleet_id': instance_fleet_id,
        'date': date,
        'pdate': pdate,
    }
    return dag_run_obj


# Airflow 1.10-style usage: python_callable builds the payload handed to the triggered DAG
trigger = TriggerDagRunOperator(
    task_id='trigger',
    trigger_dag_id="subdag-id",
    python_callable=dagrun_trigger,
    sla=timedelta(minutes=20),
    email_on_failure=True,
    dag=dag,
)

create_cluster >> trigger
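# A hedged Airflow 2.x sketch of the same idea: TriggerDagRunOperator dropped
# python_callable, so the payload goes through the templated `conf` argument instead.
# The dag/task ids and the upstream 'create_cluster' XCom are assumptions carried
# over from the snippet above, not a verified drop-in replacement.
from datetime import timedelta

from airflow.operators.trigger_dagrun import TriggerDagRunOperator

trigger = TriggerDagRunOperator(
    task_id='trigger',
    trigger_dag_id='subdag-id',
    conf={
        # templated values are rendered at runtime; keeping them strings keeps conf JSON-serializable
        'cluster_id': "{{ ti.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        'date': "{{ macros.ds_add(ds, 1) }}",
        'pdate': "{{ macros.ds_format(macros.ds_add(ds, 1), '%Y-%m-%d', '%Y/%m/%d') }}",
    },
    sla=timedelta(minutes=20),
    dag=dag,
)

create_cluster >> trigger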
clean_bi_temp_tables = PostgresOperator( task_id='clean_bi_temp_tables', postgres_conn_id=f"SurfRiderDb_{env}_manager_user", sql='clean_bi_temp_tables.sql', dag=dag) logs_status_pipeline = PostgresOperator( task_id='logs_status_pipeline', postgres_conn_id=f"SurfRiderDb_{env}_manager_user", sql='logs_status_pipeline.sql', dag=dag) run_bi_postprocessing = TriggerDagRunOperator( task_id='run_bi_postprocessing', trigger_dag_id=f'bi-postprocessing-{env}', wait_for_completion=True, dag=dag) get_new_campaign_ids >> [ copy_campaign_table, copy_trash_table, copy_trajectory_point_table ] copy_trajectory_point_table >> compute_metrics_bi_temp_trajectory_point compute_metrics_bi_temp_trajectory_point >> compute_bi_temp_trajectory_point_river compute_bi_temp_trajectory_point_river >> compute_bi_temp_campaign_river copy_trash_table >> compute_metrics_bi_temp_trash >> compute_bi_temp_trash_river [ copy_campaign_table, compute_bi_temp_campaign_river, compute_bi_temp_trash_river
start_date=CONFIGS[DAG_ID]["start_date"], tags=["example"], ) with dag: """ main DAG: smart_sensor (looking for run file) -> trigger_external_dag (dag_id_DB_1) -> SubDAG (external_sensor -> print_logs -> remove_file -> print_finish_log | example TaskGroup) -> send_message (into Slack chanell) """ @task() def slack_send_message(): client = WebClient(token=SLACK_TOKEN) try: response = client.chat_postMessage(channel="airflowtask33", text="Hello from your app! :tada:") except SlackApiError as e: assert e.response["error"] # str like 'invalid_auth', 'channel_not_found' sens = SmartFileSensor(task_id="checking_file", filepath=TRIGGER_DIR, fs_conn_id='fs_default') task_trigger = TriggerDagRunOperator( task_id="trigger_database_update", trigger_dag_id="dag_id_DB_1", wait_for_completion=True, poke_interval=15, ) sub_dag = SubDagOperator(task_id='XCOM_sub_dag', subdag = sub_dag_processing(), default_args=DEFAULT_ARGS) task_slack = slack_send_message() sens >> task_trigger >> sub_dag >> task_slack
default_args = { 'owner': 'airflow', 'retries': 3, 'start_date': airflow.utils.dates.days_ago(2), 'retry_delay': timedelta(seconds=10), 'on_failure_callback': dag_failure_notification, 'on_success_callback': dag_success_notification } # 'start_date': datetime.datetime(2021, 09, 01) #airflow.utils.dates.days_ago(2), with DAG('dag_km_dependency_01_seq_01', default_args=default_args, tags=['km'], schedule_interval='* * * * *', catchup=False, dagrun_timeout=timedelta(minutes=90)) as dag: py_start_task = PythonOperator(task_id='py_start_task', python_callable=py_start_task, provide_context=True, op_kwargs={ 'to_email_address': to_email_address, }) trigger_dependent_dag = TriggerDagRunOperator( task_id="trigger_dependent_dag", trigger_dag_id="dag_km_dependency_01_seq_02", wait_for_completion=True) py_start_task >> trigger_dependent_dag
def create_dag(dag_name: str, agg_by: str, doc_type: str, cache_blob: str, path_avro_schema: str, path_local_avro_schemas: str, executor_cores: str, executor_memory: str, executor_instances: str, driver_memory: str, col_type_pk: str, extra_cols: str, max_registry_by_file: str, oracle_conn_id: str, table_ctrl: str, table_ctrl_col_control_var: str, table_ctrl_col_fk: str, table_ctrl_col_dt_ref: str, table_ctrl_col_dt_created: str, oracle_conn_blob: str, table_blob: str, table_blob_col_pk: str, table_blob_col_blob: str) -> airflow.models.dag.DAG: # ----------------- # DAG # ----------------- args = { 'owner': 'job', 'run_as_user': '******', 'start_date': datetime(2021, 8, 17), 'do_xcom_push': False, 'depends_on_past': True, 'retries': 10, 'retry_delay': timedelta(seconds=60), 'dag_name': dag_name } with DAG(dag_id=f'{step}_{dag_name}', description=f'Import data from {dag_name}', schedule_interval='00 19 * * *', catchup=False, default_args=args) as dag: dag.doc_md = __doc__ dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)""" layer = 'raw' env = Variable.get('env', default_var='dev') control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}" last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000') current_dag_name = dag_name + '_' + control_var total_pg = int( Variable.get(f'{current_dag_name}_total_pg', default_var=1)) list_all_dates = eval( Variable.get(f'{dag_name}_list_all_dates', default_var='[]')) list_current_dates = eval( Variable.get(f'{current_dag_name}_current_dates', default_var='[]')) list_dags = eval( Variable.get(f'{dag_name}_list_dags', default_var='[]')) total_rows = Variable.get('total_rows', default_var='100000') items_by_query = 1000 sql_id = f''' SELECT {table_ctrl_col_fk} id, {table_ctrl_col_control_var} control_var, to_char({table_ctrl_col_dt_ref}, 'DD-MM-YYYY') dt_ref FROM {table_ctrl} WHERE {table_ctrl_col_control_var} > :control_var AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY') < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY') ORDER BY {table_ctrl_col_control_var} ASC FETCH FIRST :total_rows ROWS ONLY''' dict_bind_sql_get_data = { 'control_var': f'{control_var}', 'total_rows': f'{total_rows}' } sql_count_id = f''' SELECT COUNT({table_ctrl_col_fk}) FROM {table_ctrl} WHERE {table_ctrl_col_control_var} > :control_var AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY') < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')''' dict_bind_sql_count_id = {'control_var': f'{control_var}'} # ----------------- # TASKS # ----------------- task_start = PythonOperator(task_id='start', python_callable=start_time, depends_on_past=False, op_kwargs={ 'dag_name': dag_name, 'execution_date': '{{ ts }}' }) task_oracle_execute_count = OracleGetResults( task_id='oracle_execute_count', current_dag_name=current_dag_name, oracle_conn_id=oracle_conn_id, sql_count_id=sql_count_id, dict_bind=dict_bind_sql_count_id) task_check_if_contains_data_in_oracle = BranchPythonOperator( task_id='check_if_contains_data_in_oracle', python_callable=AirflowMetaStoreHelper( ).check_if_contains_data_in_oracle, op_kwargs={ 'control_var': control_var, 'last_control_var': last_control_var, 'current_dag_name': current_dag_name, 'redis_conn_id': cache_blob, 'redis_key': f'{dag_name}_original', 'true_case': 'get_id', 'false_case': 'check_len_list_processed_dates' }) task_get_id = OracleToRedisTransfer( task_id='get_id', 
oracle_conn_id=oracle_conn_id, redis_conn_id=cache_blob, sql=sql_id, dict_bind=dict_bind_sql_get_data, name_redis_key=f'{dag_name}_original') task_fill_data_gap = PythonOperator( task_id='fill_data_gap', python_callable=RedisHelper(cache_blob).fill_data_gaps, op_kwargs={ 'current_dag_name': current_dag_name, 'redis_conn_id': cache_blob, 'redis_key': f'{dag_name}_original' }) task_get_dag_name = PythonOperator( task_id='get_dag_name', python_callable=AirflowMetaStoreHelper().get_dag_name, op_kwargs={ 'current_dag_name': current_dag_name, 'name_list_dags': f'{dag_name}_list_dags', 'list_dags': list_dags }) task_get_date = PythonOperator( task_id='get_date', python_callable=RedisHelper(cache_blob).get_date, op_kwargs={ 'dag_name': dag_name, 'current_dag_name': current_dag_name, 'list_columns': "['id', 'control_var', 'date']", 'redis_key': current_dag_name }) task_split_id_by_date = PythonOperator( task_id='split_id_by_date', python_callable=RedisHelper(cache_blob).split_id_by_date, op_kwargs={ 'current_dag_name': current_dag_name, 'list_current_dates': list_current_dates, 'redis_key': current_dag_name }) task_generate_pagination = PythonOperator( task_id='generate_pagination', python_callable=RedisHelper(cache_blob).generate_pagination, op_kwargs={ 'current_dag_name': current_dag_name, 'items_by_query': items_by_query, 'list_current_dates': list_current_dates, 'redis_key': current_dag_name }) task_generate_sql_by_date = PythonOperator( task_id='generate_sql_by_date', python_callable=RedisHelper(cache_blob).generate_sql_by_date, op_kwargs={ 'current_dag_name': current_dag_name, 'list_current_dates': list_current_dates, 'oracle_conn': oracle_conn_blob, 'table_ctrl': table_ctrl, 'table_ctrl_col_fk': table_ctrl_col_fk, 'table_blob': table_blob, 'table_blob_col_pk': table_blob_col_pk, 'table_blob_col_blob': table_blob_col_blob, 'items_by_query': items_by_query, 'total_pg': total_pg, 'extra_cols': extra_cols, 'redis_key': current_dag_name }) task_extract_decompress_load = OracleBlobToHdfsTransfer( task_id=f'extract_decompress_load', retries=20, dag_name=dag_name, current_dag_name=current_dag_name, oracle_conn_id=oracle_conn_id, query_id=sql_id, table_ctrl_col_fk=table_ctrl_col_fk, extra_cols=extra_cols, oracle_conn_blob=oracle_conn_blob, table_blob_col_pk=table_blob_col_pk, table_blob_col_blob=table_blob_col_blob, path_avro_schema=path_avro_schema, path_local_avro_schemas= f'{path_local_avro_schemas}/{layer}/{dag_name}.avsc', total_pg=total_pg, layer=layer, env=env, step=step, executor_cores=executor_cores, executor_memory=executor_memory, executor_instances=executor_instances, driver_memory=driver_memory, path_ojdbc=path_ojdbc, path_spark_avro=path_spark_avro, path_native_lib=path_native_lib, col_type_pk=col_type_pk, compress_type='snappy', hdfs_conn_id='webhdfs', oracle_driver='oracle.jdbc.driver.OracleDriver', list_current_dates=list_current_dates) task_update_control_var = PythonOperator( task_id='update_control_var', python_callable=AirflowMetaStoreHelper().update_control_var, trigger_rule=TriggerRule.ALL_SUCCESS, depends_on_past=True, op_kwargs={ 'control_var': control_var, 'dag_name': dag_name, 'current_dag_name': current_dag_name, 'redis_conn_id': cache_blob, 'last_control_var': last_control_var, 'list_dags': list_dags, 'total_pg': total_pg, 'list_current_dates': list_current_dates, 'list_all_dates': list_all_dates, 'redis_key': current_dag_name }) task_clear_environment = PythonOperator( task_id='clear_environment', python_callable=clear_environment, trigger_rule=TriggerRule.ALL_SUCCESS, 
op_kwargs={ 'control_var': control_var, 'dag_name': dag_name, 'redis_conn_id': cache_blob, 'airflow_conn_id': 'airflow_db', 'last_control_var': last_control_var, 'list_dags': list_dags, 'redis_key': current_dag_name }) task_check_len_list_processed_dates = BranchPythonOperator( task_id='check_len_list_processed_dates', trigger_rule=TriggerRule.ALL_SUCCESS, python_callable=check_len_list_processed_dates, op_kwargs={ 'dag_name': dag_name, 'list_all_dates': list_all_dates, 'true_case': 'prepare_execution', 'false_case': 'waiting_execution' }) task_prepare_execution = DummyOperator(task_id='prepare_execution') with TaskGroup( group_id='group_hdfs_concat_file') as group_hdfs_concat_file: task_hdfs_prepare_concat = PythonOperator( task_id='hdfs_prepare_concat', trigger_rule=TriggerRule.ALL_SUCCESS, python_callable=HdfsPrepareConcat('webhdfs').execute, op_kwargs={ 'dag_name': dag_name, 'current_dag_name': current_dag_name, 'hdfs_path': f'/data/{env}/{layer}/{dag_name}', 'agg_by': agg_by, 'layer': layer, 'env': env, 'list_all_dates': list_all_dates, 'path_avro_tools': path_avro_tools }) # TODO: refactor -> create a task list_all_dates = AirflowMetaStoreHelper().set_granularity( list_all_dates=list_all_dates, agg_by=agg_by) for date in list_all_dates: task_concat_file = HdfsConcatFiles( task_id=f'hdfs_concat_file-{date}', retries=100, dag_name=dag_name, date=date, layer=layer, env=env, col_name_control_var=table_ctrl_col_control_var, path_avro_schema=path_avro_schema, hdfs_conn_id='webhdfs', executor_cores=executor_cores, executor_memory=executor_memory, driver_memory=driver_memory, path_ojdbc=path_ojdbc, path_spark_avro=path_spark_avro, path_native_lib=path_native_lib, format_data='avro', compress_type='snappy', max_registry_by_avro=max_registry_by_file) task_hdfs_prepare_concat >> task_concat_file task_create_partitions = PythonOperator( task_id='create_partitions', trigger_rule=TriggerRule.ALL_SUCCESS, python_callable=CreatePartitions().execute, op_kwargs={ 'dag_name': dag_name, 'current_dag_name': current_dag_name, 'list_all_dates': list_all_dates, 'hive_conn_id': 'hive', 'impala_conn_id': 'impala', 'agg_by': agg_by, 'layer': layer, 'env': env }) task_save_execution_state_hdfs = PythonOperator( task_id='save_execution_state_hdfs', python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs, op_kwargs={ 'dag_name': dag_name, 'layer': layer, 'control_var': control_var }) with TaskGroup(group_id='group_generate_statistics' ) as group_generate_statistics: # TODO: refactor -> create a task list_all_dates = AirflowMetaStoreHelper().set_granularity( list_all_dates=list_all_dates, agg_by=agg_by) for date in list_all_dates: PythonOperator(task_id=f'generate_statistics-{date}', retries=50, python_callable=GenerateStatistics().execute, op_kwargs={ 'dag_name': dag_name, 'date': date, 'layer': layer, 'impala_conn_id': 'impala', 'hive_conn_id': 'hive' }) with TaskGroup(group_id='group_check_data_quality' ) as group_check_data_quality: # TODO: refactor -> create a task list_all_dates = AirflowMetaStoreHelper().set_granularity( list_all_dates=list_all_dates, agg_by=agg_by) for date in list_all_dates: CompareDataOracleImpala( task_id=f'compare_oracle_impala_{date}', retries=100, dag_name=dag_name, last_control_var=last_control_var, layer=layer, date=date, table_ctrl=table_ctrl, dt_ref=table_ctrl_col_dt_ref, agg_by=agg_by, oracle_conn_id=oracle_conn_id, hive_conn='impala', table_ctrl_col_fk=table_ctrl_col_fk, table_ctrl_col_dt_created=table_ctrl_col_dt_created) task_check_if_contains_inconsistency = 
BranchPythonOperator( task_id=f'check_if_contains_inconsistency', trigger_rule=TriggerRule.ALL_SUCCESS, wait_for_downstream=True, python_callable=AirflowMetaStoreHelper( 'airflow_db').check_if_contains_inconsistency, op_kwargs={ 'dag_name': dag_name, 'last_control_var': last_control_var, 'layer': layer, 'true_case': 'prepare_reprocessing_inconsistency_data', 'false_case': f'check_next_dag', 'redis_conn_id': cache_blob, 'redis_key': f'{dag_name}_inconsistency_date' }) task_prepare_reprocessing_inconsistency_data = PrepareReprocessingInconsistencyData( task_id=f'prepare_reprocessing_inconsistency_data', trigger_rule=TriggerRule.ALL_SUCCESS, dag_name=dag_name, current_dag_name=current_dag_name, layer=layer, last_control_var=last_control_var, list_all_dates=list_all_dates, table_ctrl=table_ctrl, table_ctrl_col_fk=table_ctrl_col_fk, table_ctrl_col_control_var=table_ctrl_col_control_var, table_ctrl_col_dt_ref=table_ctrl_col_dt_ref, table_ctrl_col_dt_created=table_ctrl_col_dt_created, hive_conn_id='impala', hdfs_conn_id='webhdfs', airflow_conn_id='airflow_db', oracle_conn_id=oracle_conn_id) task_crash_dag = PythonOperator( task_id=f'crash_dag', trigger_rule=TriggerRule.ALL_SUCCESS, python_callable=crash_dag, ) task_check_next_dag = BranchPythonOperator( task_id='check_next_dag', trigger_rule=TriggerRule.ALL_SUCCESS, python_callable=check_next_dag, op_kwargs={ 'dag_name': dag_name, 'doc_type': doc_type, 'true_case': f'trigger_pre_process_{dag_name}', 'false_case': f'trigger_parser_{dag_name}' }) task_trigger_pre_process = TriggerDagRunOperator( task_id=f'trigger_pre_process_{dag_name}', trigger_dag_id=f"pre_process_{dag_name}") task_trigger_parser = TriggerDagRunOperator( task_id=f'trigger_parser_{dag_name}', trigger_dag_id=f"parser_{dag_name}") task_trigger_import_file = TriggerDagRunOperator( task_id=f'trigger_import_file_{dag_name}', trigger_dag_id=dag.dag_id) task_waiting_execution = DummyOperator( trigger_rule=TriggerRule.ALL_DONE, task_id='waiting_execution') task_end = PythonOperator(task_id='end', python_callable=end_time, op_kwargs={ 'current_dag_name': current_dag_name, 'dag_name': dag_name, 'last_control_var_name': f'{dag_name}_last_control_var', 'list_dates': f'{current_dag_name}_list_dates', 'postgres_conn_id': 'airflow_db' }) # ----------------- # GRAPH # ----------------- # task_check_if_contains_data_in_oracle: true task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_get_id >> task_fill_data_gap >> [ task_get_date, task_get_dag_name ] >> task_split_id_by_date >> task_generate_pagination >> task_generate_sql_by_date >> task_extract_decompress_load >> task_update_control_var >> [ task_clear_environment, task_trigger_import_file ] >> task_waiting_execution >> task_end # task_check_if_contains_data_in_oracle: false # task_check_len_list_processed_dates: true task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [ group_hdfs_concat_file, task_save_execution_state_hdfs ] >> task_create_partitions >> [ group_check_data_quality, group_generate_statistics ] >> task_check_if_contains_inconsistency task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [ group_hdfs_concat_file, task_save_execution_state_hdfs ] >> task_create_partitions >> task_check_if_contains_inconsistency >> task_prepare_reprocessing_inconsistency_data >> task_crash_dag # task_check_next_dag: true 
task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_pre_process >> task_waiting_execution >> task_end # task_check_next_dag: false task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_parser >> task_waiting_execution >> task_end # task_check_if_contains_data_in_oracle: false # task_check_len_list_processed_dates: false task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_waiting_execution >> task_end return dag
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from datetime import datetime

default_args = {'start_date': datetime(2021, 1, 1)}


def _downloading():
    print('downloading')


with DAG('trigger_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    downloading = PythonOperator(task_id='downloading', python_callable=_downloading)

    trigger_target = TriggerDagRunOperator(task_id='trigger_target',
                                           trigger_dag_id='target_dag',
                                           execution_date='{{ ds }}',
                                           reset_dag_run=True)

    downloading >> trigger_target
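# A minimal sketch of the companion target_dag assumed by trigger_dag_id above;
# the single task and its callable are illustrative. schedule_interval=None so it
# only runs when triggered by trigger_dag.
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator


def _storing():
    print('storing')


with DAG('target_dag',
         schedule_interval=None,
         start_date=datetime(2021, 1, 1),
         catchup=False) as target_dag:

    storing = PythonOperator(task_id='storing', python_callable=_storing)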
def create_dag(dag_name: str, max_registry_by_file: str, db_name: str, table_name: str, col_control_var: str, col_name_dt_ref: str, oracle_conn_id: str, path_avro_schema: str, path_config_docs: str, executor_cores: str, executor_memory: str, executor_instances: str, driver_memory: str) -> airflow.models.dag.DAG: # ----------------- # DAG # ----------------- args = { 'owner': 'job', 'run_as_user': '******', 'start_date': datetime(2021, 8, 1), 'do_xcom_push': False, 'depends_on_past': True, 'retries': 10, 'retry_delay': timedelta(seconds=90), 'dag_name': dag_name } with DAG( dag_id=f'{step}_{dag_name}', description=f'Importa os dados de {dag_name}', # schedule_interval=None, schedule_interval='00 19 * * *', catchup=False, default_args=args) as dag: dag.doc_md = __doc__ dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)""" layer = 'raw' env = Variable.get('env', default_var='prod') last_control_var = Variable.get(f'{dag_name}_last_control_var', default_var='000000000000000') control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}" current_dag_name = dag_name + '_' + control_var total_rows = Variable.get('total_rows', default_var='100000') sql_get_data = f''' SELECT * FROM {db_name}.{table_name} WHERE {col_control_var} > :control_var AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY') < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY') ORDER BY {col_control_var} ASC FETCH FIRST :total_rows ROWS ONLY''' dict_bind_sql_get_data = { 'control_var': control_var, 'total_rows': total_rows } sql_count_id = f''' SELECT COUNT({col_control_var}) FROM {db_name}.{table_name} WHERE {col_control_var} > :control_var AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY') < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')''' dict_bind_sql_count_id = {'control_var': control_var} # ----------------- # TASKS # ----------------- task_start = PythonOperator(task_id='start', python_callable=start_time, depends_on_past=False, op_kwargs={ 'dag_name': dag_name, 'execution_date': '{{ ts }}' }) task_oracle_execute_count = OracleGetResults( task_id='oracle_execute_count', current_dag_name=current_dag_name, oracle_conn_id=oracle_conn_id, sql_count_id=sql_count_id, dict_bind=dict_bind_sql_count_id) task_check_if_contains_data_in_oracle = BranchPythonOperator( task_id='check_if_contains_data_in_oracle', python_callable=AirflowMetaStoreHelper( ).check_if_contains_data_in_oracle, op_kwargs={ 'control_var': control_var, 'last_control_var': last_control_var, 'current_dag_name': current_dag_name, 'true_case': 'extract_transform_load', 'false_case': 'sync_data' }) task_extract_transform_load = OracleTableToHdfsTransfer( task_id=f'extract_transform_load', retries=20, dag_name=dag_name, current_dag_name=current_dag_name, sql_get_data=sql_get_data, dict_bind=dict_bind_sql_get_data, col_control_var=col_control_var, path_avro_schema=path_avro_schema, layer=layer, env=env, step=step, executor_cores=executor_cores, executor_memory=executor_memory, executor_instances=executor_instances, driver_memory=driver_memory, path_ojdbc=path_ojdbc, path_spark_avro=path_spark_avro, path_native_lib=path_native_lib, compress_type='snappy', oracle_conn_id=oracle_conn_id, hdfs_conn_id='webhdfs', oracle_driver='oracle.jdbc.driver.OracleDriver', max_registry_by_file=max_registry_by_file) task_sync_data = SyncData( task_id='sync_data', dag_name=dag_name, db_name=db_name, table_name=table_name, col_name_control_var=col_control_var, 
col_name_dt_ref=col_name_dt_ref, path_avro_schema=path_avro_schema, layer=layer, env=env, hdfs_conn_id='webhdfs', oracle_conn_id=oracle_conn_id, oracle_driver='oracle.jdbc.driver.OracleDriver', executor_cores=executor_cores, executor_memory=executor_memory, executor_instances=executor_instances, driver_memory=driver_memory, path_ojdbc=path_ojdbc, path_spark_avro=path_spark_avro, path_native_lib=path_native_lib, compress_type='snappy', max_registry_by_file=max_registry_by_file) task_concat_file = HdfsConcatFiles( task_id=f'hdfs_concat_file', retries=100, dag_name=dag_name, layer=layer, env=env, col_name_control_var=col_control_var, path_avro_schema=path_avro_schema, hdfs_conn_id='webhdfs', executor_cores=executor_cores, executor_memory=executor_memory, driver_memory=driver_memory, path_ojdbc=path_ojdbc, path_spark_avro=path_spark_avro, path_native_lib=path_native_lib, format_data='parquet', compress_type='snappy', max_registry_by_avro=max_registry_by_file) task_save_execution_state_hdfs = PythonOperator( task_id='save_execution_state_hdfs', python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs, op_kwargs={ 'dag_name': dag_name, 'control_var': control_var, 'layer': layer }) task_update_statistics = UpdateStasByTable( task_id=f'update_statistics', retries=50, db_name=dag_name, table_name=layer, impala_conn_id='impala') task_clear_airflow_var = PythonOperator( task_id='clear_airflow_var', trigger_rule=TriggerRule.ALL_SUCCESS, python_callable=clear_airflow_var, op_kwargs={ 'airflow_conn_id': 'airflow_db', 'list_pattern_name_var': f"[" f" '{dag_name}_%_total_row_id'," f" '{dag_name}_raw_total_registry_%'," f" '{dag_name}_list_dags'," f" '{dag_name}_list_path_to_concat'," f" '{dag_name}_list_path_to_rename'," f" '{dag_name}_list_all_dates'" f"]" }) task_update_control_var = OracleUpdateControlVar( task_id='update_control_var', trigger_rule=TriggerRule.ALL_SUCCESS, depends_on_past=True, control_var=control_var, last_control_var=last_control_var, dag_name=dag_name, current_dag_name=current_dag_name, col_control_var=col_control_var, oracle_conn_id=oracle_conn_id, dict_bind=dict_bind_sql_get_data, sql=sql_get_data) task_trigger_import_file = TriggerDagRunOperator( task_id=f'trigger_import_file_{dag_name}', trigger_dag_id=dag.dag_id) task_end = PythonOperator(task_id='end', trigger_rule=TriggerRule.ALL_DONE, python_callable=end_time, op_kwargs={ 'current_dag_name': current_dag_name, 'dag_name': dag_name, 'last_control_var_name': f'{dag_name}_last_control_var', 'postgres_conn_id': 'airflow_db' }) # ----------------- # GRAPH # ----------------- task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_extract_transform_load >> task_update_control_var >> task_trigger_import_file >> task_end task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_sync_data >> [ task_concat_file, task_save_execution_state_hdfs ] >> task_update_statistics >> task_clear_airflow_var >> task_end return dag
def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    # glob() returns a generator, which is always truthy; check that it actually yields files
    return any(data_files) and success_file.exists()


for supermarket_id in range(1, 5):
    wait = PythonSensor(
        task_id=f"wait_for_supermarket_{supermarket_id}",
        python_callable=_wait_for_supermarket,
        op_kwargs={"supermarket_id_": f"supermarket{supermarket_id}"},
        dag=dag1,
    )
    copy = DummyOperator(task_id=f"copy_to_raw_supermarket_{supermarket_id}", dag=dag1)
    process = DummyOperator(task_id=f"process_supermarket_{supermarket_id}", dag=dag1)
    trigger_create_metrics_dag = TriggerDagRunOperator(
        task_id=f"trigger_create_metrics_dag_supermarket_{supermarket_id}",
        trigger_dag_id="listing_6_04_dag02",
        dag=dag1,
    )
    wait >> copy >> process >> trigger_create_metrics_dag

compute_differences = DummyOperator(task_id="compute_differences", dag=dag2)
update_dashboard = DummyOperator(task_id="update_dashboard", dag=dag2)
notify_new_data = DummyOperator(task_id="notify_new_data", dag=dag2)
compute_differences >> update_dashboard
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example usage of the TriggerDagRunOperator. This example holds 2 DAGs:
1. 1st DAG (example_trigger_controller_dag) holds a TriggerDagRunOperator,
   which will trigger the 2nd DAG
2. 2nd DAG (example_trigger_target_dag) which will be triggered by the
   TriggerDagRunOperator in the 1st DAG
"""
from datetime import datetime

from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

with DAG(
    dag_id="example_trigger_controller_dag",
    start_date=datetime(2021, 1, 1),
    catchup=False,
    schedule_interval="@once",
    tags=['example'],
) as dag:
    trigger = TriggerDagRunOperator(
        task_id="test_trigger_dagrun",
        trigger_dag_id="example_trigger_target_dag",  # Ensure this equals the dag_id of the DAG to trigger
        conf={"message": "Hello World"},
    )
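# A paraphrased sketch of the companion target DAG referenced by the controller
# above (example_trigger_target_dag). It has no schedule of its own and simply
# prints the "message" key passed via conf; task bodies here are illustrative and
# may differ slightly from the upstream example file.
from datetime import datetime

from airflow import DAG
from airflow.decorators import task
from airflow.operators.bash import BashOperator


@task(task_id="run_this")
def run_this_func(dag_run=None):
    """Print the payload "message" passed to the DagRun conf attribute."""
    print(f"Remotely received value of {dag_run.conf.get('message')} for key=message")


with DAG(
    dag_id="example_trigger_target_dag",
    start_date=datetime(2021, 1, 1),
    catchup=False,
    schedule_interval=None,
    tags=['example'],
) as target_dag:
    run_this = run_this_func()
    bash_task = BashOperator(
        task_id="bash_task",
        bash_command='echo "Here is the message: $message"',
        env={'message': '{{ dag_run.conf.get("message") if dag_run else "" }}'},
    )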
proxy_user='******', args=[ 'HOME=/user', 'USER=root', 'WF_HOME=/user/root', 'ROOT_WORKFLOW_ID=aws_test_2_729508210', 'CURRENT_WORKFLOW_ID=aws_test_2_729508210', 'SLIDE_SIZE=400', 'FETCH_SIZE=1000', 'PARTITION_NUM=1', 'FAIL_THRESHOLD=1000', 'DEBUG=true', 'MASTER=' ], num_executors=1, conf={ 'spark.shuffle.compress': 'false', }, class_name='ru.neoflex.meta.etl2.spark.aws_test_2Job', polling_interval=5, on_failure_callback=on_failure_callback) join >> transformation err4 = KillOperator(task_id='err4', message='transformation or workflow failed', trigger_rule=TriggerRule.ONE_FAILED) transformation >> err4 workflow = TriggerDagRunOperator(task_id='wf_aws_test', trigger_dag_id='wf_aws_test', wait_for_completion=True, poke_interval=5, dag=dag, on_failure_callback=on_failure_callback) transformation >> workflow workflow >> err4 finish = FinishOperator(task_id='finish', ) workflow >> finish
    create_timestamp = BashOperator(
        task_id='create_timestamp',
        bash_command='touch ~/timestamp_{{ ts_nodash }}',
    )

    task_sensor >> print_results >> remove_file >> create_timestamp
    return dag


# creates a dag
with DAG(dag_id='trigger_run', start_date=datetime(2021, 1, 26), schedule_interval='@once') as dag:
    # checks if a file exists
    check_for_file = SmartFileSensor(task_id='check_for_file', filepath=path)
    # triggers another dag
    trigger_dag = TriggerDagRunOperator(
        task_id='trigger_dag',
        trigger_dag_id=external_dag,
        execution_date='{{ execution_date }}',
    )
    # creates a sub_dag that processes results
    process_results = SubDagOperator(
        task_id='process_results_dag',
        subdag=create_sub_dag(
            dag.dag_id,
            'process_results_dag',
            start_date=datetime(2021, 1, 26),
            schedule_interval='@once',
        ),
    )
    # sends a slack message
    alert_slack = PythonOperator(task_id='alert_slack', python_callable=slack_message)

    check_for_file >> trigger_dag >> process_results >> alert_slack

globals()[dag.dag_id] = dag
aws_test2 = LivyOperator( task_id='aws_test2', dag=dag, livy_conn_id='livy_default', file= 's3a://datagram/user/root/deployments/autogenerated_tr_aws_test_2/ru.neoflex.meta.etl2.spark.aws_test_2-1.0' '-SNAPSHOT.jar', proxy_user='******', args=[ 'HOME=/user', 'USER=root', 'WF_HOME=/user/root', 'ROOT_WORKFLOW_ID=aws_test_2_729508210', 'CURRENT_WORKFLOW_ID=aws_test_2_729508210', 'SLIDE_SIZE=400', 'FETCH_SIZE=1000', 'PARTITION_NUM=1', 'FAIL_THRESHOLD=1000', 'DEBUG=true', 'MASTER=' ], num_executors=1, conf={ 'spark.shuffle.compress': 'false', }, class_name='ru.neoflex.meta.etl2.spark.aws_test_2Job', polling_interval=5, ) wf_aws_test = TriggerDagRunOperator(task_id='wf_aws_test', trigger_dag_id='wf_aws_test', wait_for_completion=True, poke_interval=5, dag=dag) aws_test2 >> wf_aws_test