Example #1
    def test_trigger_dagrun_twice(self):
        """Test TriggerDagRunOperator with custom execution_date."""
        utc_now = timezone.utcnow()
        task = TriggerDagRunOperator(
            task_id="test_trigger_dagrun_with_execution_date",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date=utc_now,
            dag=self.dag,
            poke_interval=1,
            reset_dag_run=True,
            wait_for_completion=True,
        )
        run_id = f"manual__{utc_now.isoformat()}"
        with create_session() as session:
            dag_run = DagRun(
                dag_id=TRIGGERED_DAG_ID,
                execution_date=utc_now,
                state=State.SUCCESS,
                run_type="manual",
                run_id=run_id,
            )
            session.add(dag_run)
            session.commit()
            task.execute(None)

            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            self.assertEqual(len(dagruns), 1)
            self.assertTrue(dagruns[0].external_trigger)
            self.assertEqual(dagruns[0].execution_date, utc_now)
Example #2
    def test_trigger_dagrun(self):
        """Test TriggerDagRunOperator."""
        task = TriggerDagRunOperator(task_id="test_task",
                                     trigger_dag_id=TRIGGERED_DAG_ID,
                                     dag=self.dag)
        task.run(start_date=DEFAULT_DATE,
                 end_date=DEFAULT_DATE,
                 ignore_ti_state=True)

        with create_session() as session:
            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            self.assertEqual(len(dagruns), 1)
            self.assertTrue(dagruns[0].external_trigger)
Example #3
    def test_trigger_dagrun_with_wait_for_completion_true_fail(self):
        """Test TriggerDagRunOperator with wait_for_completion but triggered dag fails."""
        execution_date = DEFAULT_DATE
        task = TriggerDagRunOperator(
            task_id="test_task",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date=execution_date,
            wait_for_completion=True,
            poke_interval=10,
            failed_states=[State.RUNNING],
            dag=self.dag,
        )
        with self.assertRaises(AirflowException):
            task.run(start_date=execution_date, end_date=execution_date)
Example #4
    def test_trigger_dagrun_operator_templated_conf(self):
        """Test passing a templated conf to the triggered DagRun."""
        task = TriggerDagRunOperator(
            task_id="test_trigger_dagrun_with_str_execution_date",
            trigger_dag_id=TRIGGERED_DAG_ID,
            conf={"foo": "{{ dag.dag_id }}"},
            dag=self.dag,
        )
        task.run(start_date=DEFAULT_DATE,
                 end_date=DEFAULT_DATE,
                 ignore_ti_state=True)

        with create_session() as session:
            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            self.assertEqual(len(dagruns), 1)
            self.assertEqual(dagruns[0].conf, {"foo": TEST_DAG_ID})
Example #5
def create_retrigger_operator(dag, task_id=None):
    if not task_id:
        task_id = f'retrigger_{dag.dag_id}'
    return TriggerDagRunOperator(
        task_id=task_id,
        trigger_dag_id=dag.dag_id,
        dag=dag
    )
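
A brief usage sketch for the helper above, assuming a DAG object and a final task that are not part of the original snippet (my_dag and last_task below are placeholder names): the returned operator is appended to the end of the DAG so that it re-triggers itself after every run.

# Hypothetical wiring; my_dag and last_task are illustrative names only.
retrigger = create_retrigger_operator(my_dag)
last_task >> retrigger  # schedules a fresh run of the same DAG once last_task succeeds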
Example #6
    def test_trigger_dagrun_with_templated_execution_date(self):
        """Test TriggerDagRunOperator with templated execution_date."""
        task = TriggerDagRunOperator(
            task_id="test_trigger_dagrun_with_str_execution_date",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date="{{ execution_date }}",
            dag=self.dag,
        )
        task.run(start_date=DEFAULT_DATE,
                 end_date=DEFAULT_DATE,
                 ignore_ti_state=True)

        with create_session() as session:
            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            self.assertEqual(len(dagruns), 1)
            self.assertTrue(dagruns[0].external_trigger)
            self.assertEqual(dagruns[0].execution_date, DEFAULT_DATE)
Example #7
    def test_trigger_dagrun_with_wait_for_completion_true(self):
        """Test TriggerDagRunOperator with wait_for_completion."""
        execution_date = DEFAULT_DATE
        task = TriggerDagRunOperator(
            task_id="test_task",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date=execution_date,
            wait_for_completion=True,
            poke_interval=10,
            allowed_states=[State.RUNNING],
            dag=self.dag,
        )
        task.run(start_date=execution_date, end_date=execution_date)

        with create_session() as session:
            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            self.assertEqual(len(dagruns), 1)
Example #8
    def test_trigger_dagrun_with_execution_date(self):
        """Test TriggerDagRunOperator with custom execution_date."""
        utc_now = timezone.utcnow()
        task = TriggerDagRunOperator(
            task_id="test_trigger_dagrun_with_execution_date",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date=utc_now,
            dag=self.dag,
        )
        task.run(start_date=DEFAULT_DATE,
                 end_date=DEFAULT_DATE,
                 ignore_ti_state=True)

        with create_session() as session:
            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            self.assertEqual(len(dagruns), 1)
            self.assertTrue(dagruns[0].external_trigger)
            self.assertEqual(dagruns[0].execution_date, utc_now)
Example #9
    def test_trigger_dagrun_operator_templated_invalid_conf(self):
        """Test passing a conf that is not JSON Serializable raise error."""

        with pytest.raises(
                AirflowException,
                match="^conf parameter should be JSON Serializable$"):
            TriggerDagRunOperator(
                task_id="test_trigger_dagrun_with_invalid_conf",
                trigger_dag_id=TRIGGERED_DAG_ID,
                conf={
                    "foo": "{{ dag.dag_id }}",
                    "datetime": timezone.utcnow()
                },
                dag=self.dag,
            )
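
The operator rejects any conf value that json.dumps cannot handle (such as a datetime) at instantiation time, which is what the test above asserts. A hedged workaround sketch, serializing the timestamp to an ISO-8601 string first; the task_id and the Airflow 2 import path are illustrative:

from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils import timezone

trigger_with_timestamp = TriggerDagRunOperator(
    task_id="trigger_with_serializable_conf",
    trigger_dag_id=TRIGGERED_DAG_ID,
    # Pass only JSON-serializable values; convert the datetime to a string first.
    conf={"foo": "{{ dag.dag_id }}", "datetime": timezone.utcnow().isoformat()},
)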
Example #10
    def test_trigger_dagrun_with_reset_dag_run_false(self):
        """Test TriggerDagRunOperator with reset_dag_run."""
        execution_date = DEFAULT_DATE
        task = TriggerDagRunOperator(
            task_id="test_task",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date=execution_date,
            reset_dag_run=False,
            dag=self.dag,
        )
        task.run(start_date=execution_date,
                 end_date=execution_date,
                 ignore_ti_state=True)

        with self.assertRaises(DagRunAlreadyExists):
            task.run(start_date=execution_date,
                     end_date=execution_date,
                     ignore_ti_state=True)
Example #11
    def test_trigger_dagrun_with_reset_dag_run_true(self):
        """Test TriggerDagRunOperator with reset_dag_run."""
        execution_date = DEFAULT_DATE
        task = TriggerDagRunOperator(
            task_id="test_task",
            trigger_dag_id=TRIGGERED_DAG_ID,
            execution_date=execution_date,
            reset_dag_run=True,
            dag=self.dag,
        )
        task.run(start_date=execution_date,
                 end_date=execution_date,
                 ignore_ti_state=True)
        task.run(start_date=execution_date,
                 end_date=execution_date,
                 ignore_ti_state=True)

        with create_session() as session:
            dagruns = session.query(DagRun).filter(
                DagRun.dag_id == TRIGGERED_DAG_ID).all()
            assert len(dagruns) == 1
            assert dagruns[0].external_trigger
Example #12
# ================================================ EXAMPLE 1 =================================================

example_1_dag_1 = DAG(
    dag_id="figure_6_17_example_1_dag_1",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="0 0 * * *",
)
example_1_dag_2 = DAG(
    dag_id="figure_6_17_example_1_dag_2",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval=None,
)

DummyOperator(task_id="etl", dag=example_1_dag_1) >> TriggerDagRunOperator(
    task_id="trigger_dag2",
    trigger_dag_id="figure_6_17_example_1_dag_2",
    dag=example_1_dag_1,
)
PythonOperator(task_id="report",
               dag=example_1_dag_2,
               python_callable=lambda: print("hello"))

# ================================================ EXAMPLE 2 =================================================

example_2_dag_1 = DAG(
    dag_id="figure_6_17_example_2_dag_1",
    start_date=airflow.utils.dates.days_ago(3),
    schedule_interval="0 0 * * *",
)
example_2_dag_2 = DAG(
    dag_id="figure_6_17_example_2_dag_2",
Example #13
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
from airflow.utils.dates import days_ago


with DAG(
    dag_id="controller_dag_to_trigger_other_dags",
    default_args={"owner": "airflow"},
    start_date=days_ago(1),
    schedule_interval="@once",
) as dag:
    start = DummyOperator(
        task_id='start'
    )

    trigger_1 = TriggerDagRunOperator(
        task_id="dag_1",
        trigger_dag_id="dag-to-trigger",  # Ensure this equals the dag_id of the DAG to trigger
        conf={"message": "Hello World"}
    )
    trigger_2 = TriggerDagRunOperator(
        task_id="dag_2",
        trigger_dag_id="dag-to-trigger",  # Ensure this equals the dag_id of the DAG to trigger
        conf={"message": "Hello World"}
    )

    some_other_task = DummyOperator(
        task_id='some-other-task'
    )

    end = DummyOperator(
        task_id='end'
    )
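
    # The snippet above stops before any dependencies are set. One plausible
    # wiring, purely illustrative and not part of the original DAG:
    start >> [trigger_1, trigger_2] >> some_other_task >> end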
Example #14
    for instance_fleet in response['InstanceFleets']:
        if instance_fleet["InstanceFleetType"] == "TASK":
            return instance_fleet["Id"]


def dagrun_trigger(context, dag_run_obj):
    cluster_id = context['ti'].xcom_pull('create_cluster', key='return_value')
    instance_fleet_id = get_spot_instance_fleet_id(cluster_id)
    _date = datetime.strptime(context['ds'], '%Y-%m-%d') + timedelta(days=1)
    date = _date.strftime('%Y-%m-%d')
    pdate = _date.strftime('%Y/%m/%d')
    logging.info('cluster_id=%s, date=%s, pdate=%s', cluster_id, date, pdate)
    dag_run_obj.payload = {
        'cluster_id': cluster_id,
        'instance_fleet_id': instance_fleet_id,
        'date': date,
        'pdate': pdate
    }
    return dag_run_obj


trigger = TriggerDagRunOperator(
    task_id='trigger',
    trigger_dag_id="subdag-id",
    python_callable=dagrun_trigger,
    sla=timedelta(minutes=20),
    email_on_failure=True,
    dag=dag
)

create_cluster >> trigger
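
The callable above follows the Airflow 1.x contract, in which python_callable received the context plus a dag_run_obj whose payload became the triggered run's conf. That parameter no longer exists in Airflow 2; a rough equivalent under that assumption is to template conf directly, and to have an upstream task push values that need Python calls (such as the instance fleet lookup) to XCom. The keys below mirror the callable but are illustrative:

# Airflow 2.x sketch: build the payload with a templated conf instead of python_callable.
trigger = TriggerDagRunOperator(
    task_id='trigger',
    trigger_dag_id="subdag-id",
    conf={
        'cluster_id': "{{ ti.xcom_pull(task_ids='create_cluster', key='return_value') }}",
        'date': "{{ macros.ds_add(ds, 1) }}",  # next day, YYYY-MM-DD
        'pdate': "{{ macros.ds_format(macros.ds_add(ds, 1), '%Y-%m-%d', '%Y/%m/%d') }}",
    },
    sla=timedelta(minutes=20),
    email_on_failure=True,
    dag=dag,
)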
Example #15
    clean_bi_temp_tables = PostgresOperator(
        task_id='clean_bi_temp_tables',
        postgres_conn_id=f"SurfRiderDb_{env}_manager_user",
        sql='clean_bi_temp_tables.sql',
        dag=dag)

    logs_status_pipeline = PostgresOperator(
        task_id='logs_status_pipeline',
        postgres_conn_id=f"SurfRiderDb_{env}_manager_user",
        sql='logs_status_pipeline.sql',
        dag=dag)

    run_bi_postprocessing = TriggerDagRunOperator(
        task_id='run_bi_postprocessing',
        trigger_dag_id=f'bi-postprocessing-{env}',
        wait_for_completion=True,
        dag=dag)

get_new_campaign_ids >> [
    copy_campaign_table, copy_trash_table, copy_trajectory_point_table
]
copy_trajectory_point_table >> compute_metrics_bi_temp_trajectory_point
compute_metrics_bi_temp_trajectory_point >> compute_bi_temp_trajectory_point_river
compute_bi_temp_trajectory_point_river >> compute_bi_temp_campaign_river

copy_trash_table >> compute_metrics_bi_temp_trash >> compute_bi_temp_trash_river

[
    copy_campaign_table, compute_bi_temp_campaign_river,
    compute_bi_temp_trash_river
Example #16
        start_date=CONFIGS[DAG_ID]["start_date"],
        tags=["example"],
    )

with dag:
    """ main DAG:
    smart_sensor (looking for run file) ->
    trigger_external_dag (dag_id_DB_1) -> 
    SubDAG (external_sensor -> print_logs -> remove_file -> print_finish_log | example TaskGroup) -> 
    send_message (into Slack channel)
    """
    @task()
    def slack_send_message():
        client = WebClient(token=SLACK_TOKEN)
        try:
            response = client.chat_postMessage(channel="airflowtask33", text="Hello from your app! :tada:")
        except SlackApiError as e:
            assert e.response["error"]  # str like 'invalid_auth', 'channel_not_found'


    sens = SmartFileSensor(task_id="checking_file", filepath=TRIGGER_DIR, fs_conn_id='fs_default')

    task_trigger = TriggerDagRunOperator(
        task_id="trigger_database_update", trigger_dag_id="dag_id_DB_1", wait_for_completion=True, poke_interval=15,
    )

    sub_dag = SubDagOperator(task_id='XCOM_sub_dag', subdag = sub_dag_processing(), default_args=DEFAULT_ARGS)

    task_slack = slack_send_message()

    sens >> task_trigger >> sub_dag >> task_slack
Example #17

default_args = {
    'owner': 'airflow',
    'retries': 3,
    'start_date': airflow.utils.dates.days_ago(2),
    'retry_delay': timedelta(seconds=10),
    'on_failure_callback': dag_failure_notification,
    'on_success_callback': dag_success_notification
}
#    'start_date': datetime.datetime(2021, 09, 01) #airflow.utils.dates.days_ago(2),

with DAG('dag_km_dependency_01_seq_01',
         default_args=default_args,
         tags=['km'],
         schedule_interval='* * * * *',
         catchup=False,
         dagrun_timeout=timedelta(minutes=90)) as dag:

    py_start_task = PythonOperator(task_id='py_start_task',
                                   python_callable=py_start_task,
                                   provide_context=True,
                                   op_kwargs={
                                       'to_email_address': to_email_address,
                                   })

    trigger_dependent_dag = TriggerDagRunOperator(
        task_id="trigger_dependent_dag",
        trigger_dag_id="dag_km_dependency_01_seq_02",
        wait_for_completion=True)
    py_start_task >> trigger_dependent_dag
Example #18
def create_dag(dag_name: str, agg_by: str, doc_type: str, cache_blob: str,
               path_avro_schema: str, path_local_avro_schemas: str,
               executor_cores: str, executor_memory: str,
               executor_instances: str, driver_memory: str, col_type_pk: str,
               extra_cols: str, max_registry_by_file: str, oracle_conn_id: str,
               table_ctrl: str, table_ctrl_col_control_var: str,
               table_ctrl_col_fk: str, table_ctrl_col_dt_ref: str,
               table_ctrl_col_dt_created: str, oracle_conn_blob: str,
               table_blob: str, table_blob_col_pk: str,
               table_blob_col_blob: str) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 17),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=60),
        'dag_name': dag_name
    }

    with DAG(dag_id=f'{step}_{dag_name}',
             description=f'Import data from {dag_name}',
             schedule_interval='00 19 * * *',
             catchup=False,
             default_args=args) as dag:
        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)"""

        layer = 'raw'
        env = Variable.get('env', default_var='dev')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        last_control_var = Variable.get(f'{dag_name}_last_control_var',
                                        default_var='000000000000000')
        current_dag_name = dag_name + '_' + control_var
        total_pg = int(
            Variable.get(f'{current_dag_name}_total_pg', default_var=1))
        list_all_dates = eval(
            Variable.get(f'{dag_name}_list_all_dates', default_var='[]'))
        list_current_dates = eval(
            Variable.get(f'{current_dag_name}_current_dates',
                         default_var='[]'))
        list_dags = eval(
            Variable.get(f'{dag_name}_list_dags', default_var='[]'))
        total_rows = Variable.get('total_rows', default_var='100000')
        items_by_query = 1000

        sql_id = f'''
        SELECT
            {table_ctrl_col_fk} id,
            {table_ctrl_col_control_var} control_var,
            to_char({table_ctrl_col_dt_ref}, 'DD-MM-YYYY') dt_ref
        FROM {table_ctrl}
        WHERE
           {table_ctrl_col_control_var} > :control_var
           AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {table_ctrl_col_control_var} ASC
        FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': f'{control_var}',
            'total_rows': f'{total_rows}'
        }

        sql_count_id = f'''
        SELECT COUNT({table_ctrl_col_fk})
        FROM {table_ctrl}
        WHERE
           {table_ctrl_col_control_var} > :control_var
           AND TO_DATE(to_char({table_ctrl_col_dt_created}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': f'{control_var}'}

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(task_id='start',
                                    python_callable=start_time,
                                    depends_on_past=False,
                                    op_kwargs={
                                        'dag_name': dag_name,
                                        'execution_date': '{{ ts }}'
                                    })

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper(
            ).check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original',
                'true_case': 'get_id',
                'false_case': 'check_len_list_processed_dates'
            })

        task_get_id = OracleToRedisTransfer(
            task_id='get_id',
            oracle_conn_id=oracle_conn_id,
            redis_conn_id=cache_blob,
            sql=sql_id,
            dict_bind=dict_bind_sql_get_data,
            name_redis_key=f'{dag_name}_original')

        task_fill_data_gap = PythonOperator(
            task_id='fill_data_gap',
            python_callable=RedisHelper(cache_blob).fill_data_gaps,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_original'
            })

        task_get_dag_name = PythonOperator(
            task_id='get_dag_name',
            python_callable=AirflowMetaStoreHelper().get_dag_name,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'name_list_dags': f'{dag_name}_list_dags',
                'list_dags': list_dags
            })

        task_get_date = PythonOperator(
            task_id='get_date',
            python_callable=RedisHelper(cache_blob).get_date,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_columns': "['id', 'control_var', 'date']",
                'redis_key': current_dag_name
            })

        task_split_id_by_date = PythonOperator(
            task_id='split_id_by_date',
            python_callable=RedisHelper(cache_blob).split_id_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_pagination = PythonOperator(
            task_id='generate_pagination',
            python_callable=RedisHelper(cache_blob).generate_pagination,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'items_by_query': items_by_query,
                'list_current_dates': list_current_dates,
                'redis_key': current_dag_name
            })

        task_generate_sql_by_date = PythonOperator(
            task_id='generate_sql_by_date',
            python_callable=RedisHelper(cache_blob).generate_sql_by_date,
            op_kwargs={
                'current_dag_name': current_dag_name,
                'list_current_dates': list_current_dates,
                'oracle_conn': oracle_conn_blob,
                'table_ctrl': table_ctrl,
                'table_ctrl_col_fk': table_ctrl_col_fk,
                'table_blob': table_blob,
                'table_blob_col_pk': table_blob_col_pk,
                'table_blob_col_blob': table_blob_col_blob,
                'items_by_query': items_by_query,
                'total_pg': total_pg,
                'extra_cols': extra_cols,
                'redis_key': current_dag_name
            })

        task_extract_decompress_load = OracleBlobToHdfsTransfer(
            task_id=f'extract_decompress_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            query_id=sql_id,
            table_ctrl_col_fk=table_ctrl_col_fk,
            extra_cols=extra_cols,
            oracle_conn_blob=oracle_conn_blob,
            table_blob_col_pk=table_blob_col_pk,
            table_blob_col_blob=table_blob_col_blob,
            path_avro_schema=path_avro_schema,
            path_local_avro_schemas=
            f'{path_local_avro_schemas}/{layer}/{dag_name}.avsc',
            total_pg=total_pg,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            col_type_pk=col_type_pk,
            compress_type='snappy',
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            list_current_dates=list_current_dates)

        task_update_control_var = PythonOperator(
            task_id='update_control_var',
            python_callable=AirflowMetaStoreHelper().update_control_var,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'redis_conn_id': cache_blob,
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'total_pg': total_pg,
                'list_current_dates': list_current_dates,
                'list_all_dates': list_all_dates,
                'redis_key': current_dag_name
            })

        task_clear_environment = PythonOperator(
            task_id='clear_environment',
            python_callable=clear_environment,
            trigger_rule=TriggerRule.ALL_SUCCESS,
            op_kwargs={
                'control_var': control_var,
                'dag_name': dag_name,
                'redis_conn_id': cache_blob,
                'airflow_conn_id': 'airflow_db',
                'last_control_var': last_control_var,
                'list_dags': list_dags,
                'redis_key': current_dag_name
            })

        task_check_len_list_processed_dates = BranchPythonOperator(
            task_id='check_len_list_processed_dates',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_len_list_processed_dates,
            op_kwargs={
                'dag_name': dag_name,
                'list_all_dates': list_all_dates,
                'true_case': 'prepare_execution',
                'false_case': 'waiting_execution'
            })

        task_prepare_execution = DummyOperator(task_id='prepare_execution')

        with TaskGroup(
                group_id='group_hdfs_concat_file') as group_hdfs_concat_file:
            task_hdfs_prepare_concat = PythonOperator(
                task_id='hdfs_prepare_concat',
                trigger_rule=TriggerRule.ALL_SUCCESS,
                python_callable=HdfsPrepareConcat('webhdfs').execute,
                op_kwargs={
                    'dag_name': dag_name,
                    'current_dag_name': current_dag_name,
                    'hdfs_path': f'/data/{env}/{layer}/{dag_name}',
                    'agg_by': agg_by,
                    'layer': layer,
                    'env': env,
                    'list_all_dates': list_all_dates,
                    'path_avro_tools': path_avro_tools
                })

            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)
            for date in list_all_dates:
                task_concat_file = HdfsConcatFiles(
                    task_id=f'hdfs_concat_file-{date}',
                    retries=100,
                    dag_name=dag_name,
                    date=date,
                    layer=layer,
                    env=env,
                    col_name_control_var=table_ctrl_col_control_var,
                    path_avro_schema=path_avro_schema,
                    hdfs_conn_id='webhdfs',
                    executor_cores=executor_cores,
                    executor_memory=executor_memory,
                    driver_memory=driver_memory,
                    path_ojdbc=path_ojdbc,
                    path_spark_avro=path_spark_avro,
                    path_native_lib=path_native_lib,
                    format_data='avro',
                    compress_type='snappy',
                    max_registry_by_avro=max_registry_by_file)
                task_hdfs_prepare_concat >> task_concat_file

        task_create_partitions = PythonOperator(
            task_id='create_partitions',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=CreatePartitions().execute,
            op_kwargs={
                'dag_name': dag_name,
                'current_dag_name': current_dag_name,
                'list_all_dates': list_all_dates,
                'hive_conn_id': 'hive',
                'impala_conn_id': 'impala',
                'agg_by': agg_by,
                'layer': layer,
                'env': env
            })

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'layer': layer,
                'control_var': control_var
            })

        with TaskGroup(group_id='group_generate_statistics'
                       ) as group_generate_statistics:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)

            for date in list_all_dates:
                PythonOperator(task_id=f'generate_statistics-{date}',
                               retries=50,
                               python_callable=GenerateStatistics().execute,
                               op_kwargs={
                                   'dag_name': dag_name,
                                   'date': date,
                                   'layer': layer,
                                   'impala_conn_id': 'impala',
                                   'hive_conn_id': 'hive'
                               })

        with TaskGroup(group_id='group_check_data_quality'
                       ) as group_check_data_quality:
            # TODO: refactor -> create a task
            list_all_dates = AirflowMetaStoreHelper().set_granularity(
                list_all_dates=list_all_dates, agg_by=agg_by)

            for date in list_all_dates:
                CompareDataOracleImpala(
                    task_id=f'compare_oracle_impala_{date}',
                    retries=100,
                    dag_name=dag_name,
                    last_control_var=last_control_var,
                    layer=layer,
                    date=date,
                    table_ctrl=table_ctrl,
                    dt_ref=table_ctrl_col_dt_ref,
                    agg_by=agg_by,
                    oracle_conn_id=oracle_conn_id,
                    hive_conn='impala',
                    table_ctrl_col_fk=table_ctrl_col_fk,
                    table_ctrl_col_dt_created=table_ctrl_col_dt_created)

        task_check_if_contains_inconsistency = BranchPythonOperator(
            task_id=f'check_if_contains_inconsistency',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            wait_for_downstream=True,
            python_callable=AirflowMetaStoreHelper(
                'airflow_db').check_if_contains_inconsistency,
            op_kwargs={
                'dag_name': dag_name,
                'last_control_var': last_control_var,
                'layer': layer,
                'true_case': 'prepare_reprocessing_inconsistency_data',
                'false_case': f'check_next_dag',
                'redis_conn_id': cache_blob,
                'redis_key': f'{dag_name}_inconsistency_date'
            })

        task_prepare_reprocessing_inconsistency_data = PrepareReprocessingInconsistencyData(
            task_id=f'prepare_reprocessing_inconsistency_data',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            layer=layer,
            last_control_var=last_control_var,
            list_all_dates=list_all_dates,
            table_ctrl=table_ctrl,
            table_ctrl_col_fk=table_ctrl_col_fk,
            table_ctrl_col_control_var=table_ctrl_col_control_var,
            table_ctrl_col_dt_ref=table_ctrl_col_dt_ref,
            table_ctrl_col_dt_created=table_ctrl_col_dt_created,
            hive_conn_id='impala',
            hdfs_conn_id='webhdfs',
            airflow_conn_id='airflow_db',
            oracle_conn_id=oracle_conn_id)

        task_crash_dag = PythonOperator(
            task_id=f'crash_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=crash_dag,
        )

        task_check_next_dag = BranchPythonOperator(
            task_id='check_next_dag',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=check_next_dag,
            op_kwargs={
                'dag_name': dag_name,
                'doc_type': doc_type,
                'true_case': f'trigger_pre_process_{dag_name}',
                'false_case': f'trigger_parser_{dag_name}'
            })

        task_trigger_pre_process = TriggerDagRunOperator(
            task_id=f'trigger_pre_process_{dag_name}',
            trigger_dag_id=f"pre_process_{dag_name}")

        task_trigger_parser = TriggerDagRunOperator(
            task_id=f'trigger_parser_{dag_name}',
            trigger_dag_id=f"parser_{dag_name}")

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_waiting_execution = DummyOperator(
            trigger_rule=TriggerRule.ALL_DONE, task_id='waiting_execution')

        task_end = PythonOperator(task_id='end',
                                  python_callable=end_time,
                                  op_kwargs={
                                      'current_dag_name': current_dag_name,
                                      'dag_name': dag_name,
                                      'last_control_var_name':
                                      f'{dag_name}_last_control_var',
                                      'list_dates':
                                      f'{current_dag_name}_list_dates',
                                      'postgres_conn_id': 'airflow_db'
                                  })

    # -----------------
    #      GRAPH
    # -----------------
    # task_check_if_contains_data_in_oracle: true
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_get_id >> task_fill_data_gap >> [
        task_get_date, task_get_dag_name
    ] >> task_split_id_by_date >> task_generate_pagination >> task_generate_sql_by_date >> task_extract_decompress_load >> task_update_control_var >> [
        task_clear_environment, task_trigger_import_file
    ] >> task_waiting_execution >> task_end

    # task_check_if_contains_data_in_oracle: false
    #   task_check_len_list_processed_dates: true
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [
        group_hdfs_concat_file, task_save_execution_state_hdfs
    ] >> task_create_partitions >> [
        group_check_data_quality, group_generate_statistics
    ] >> task_check_if_contains_inconsistency
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_prepare_execution >> [
        group_hdfs_concat_file, task_save_execution_state_hdfs
    ] >> task_create_partitions >> task_check_if_contains_inconsistency >> task_prepare_reprocessing_inconsistency_data >> task_crash_dag

    # task_check_next_dag: true
    task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_pre_process >> task_waiting_execution >> task_end
    # task_check_next_dag: false
    task_check_if_contains_inconsistency >> task_check_next_dag >> task_trigger_parser >> task_waiting_execution >> task_end

    # task_check_if_contains_data_in_oracle: false
    #   task_check_len_list_processed_dates: false
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_check_len_list_processed_dates >> task_waiting_execution >> task_end

    return dag
Example #19
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

from datetime import datetime

default_args = {'start_date': datetime(2021, 1, 1)}


def _downloading():
    print('downloading')


with DAG('trigger_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    downloading = PythonOperator(task_id='downloading',
                                 python_callable=_downloading)

    trigger_target = TriggerDagRunOperator(task_id='trigger_target',
                                           trigger_dag_id='target_dag',
                                           execution_date='{{ ds }}',
                                           reset_dag_run=True)

    downloading >> trigger_target
Example #20
def create_dag(dag_name: str, max_registry_by_file: str, db_name: str,
               table_name: str, col_control_var: str, col_name_dt_ref: str,
               oracle_conn_id: str, path_avro_schema: str,
               path_config_docs: str, executor_cores: str,
               executor_memory: str, executor_instances: str,
               driver_memory: str) -> airflow.models.dag.DAG:
    # -----------------
    #        DAG
    # -----------------
    args = {
        'owner': 'job',
        'run_as_user': '******',
        'start_date': datetime(2021, 8, 1),
        'do_xcom_push': False,
        'depends_on_past': True,
        'retries': 10,
        'retry_delay': timedelta(seconds=90),
        'dag_name': dag_name
    }

    with DAG(
            dag_id=f'{step}_{dag_name}',
            description=f'Import data from {dag_name}',
            # schedule_interval=None,
            schedule_interval='00 19 * * *',
            catchup=False,
            default_args=args) as dag:
        dag.doc_md = __doc__
        dag.doc_md = """![image alt <](../big_data.wiki/.attachments/xpto_company.png)"""
        layer = 'raw'
        env = Variable.get('env', default_var='prod')
        last_control_var = Variable.get(f'{dag_name}_last_control_var',
                                        default_var='000000000000000')
        control_var = f"{int(Variable.get(f'{dag_name}_control_var', default_var='000000000000000')):015d}"
        current_dag_name = dag_name + '_' + control_var
        total_rows = Variable.get('total_rows', default_var='100000')

        sql_get_data = f'''
        SELECT
            *
        FROM {db_name}.{table_name}
        WHERE
            {col_control_var} > :control_var
            AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')
        ORDER BY {col_control_var} ASC
        FETCH FIRST :total_rows ROWS ONLY'''
        dict_bind_sql_get_data = {
            'control_var': control_var,
            'total_rows': total_rows
        }

        sql_count_id = f'''
        SELECT COUNT({col_control_var})
        FROM {db_name}.{table_name}
        WHERE
            {col_control_var} > :control_var
            AND TO_DATE(to_char({col_name_dt_ref}, 'DD-MM-YYYY'), 'DD-MM-YYYY')
                < TO_DATE(to_char(trunc(sysdate), 'DD-MM-YYYY'), 'DD-MM-YYYY')'''
        dict_bind_sql_count_id = {'control_var': control_var}

        # -----------------
        #      TASKS
        # -----------------
        task_start = PythonOperator(task_id='start',
                                    python_callable=start_time,
                                    depends_on_past=False,
                                    op_kwargs={
                                        'dag_name': dag_name,
                                        'execution_date': '{{ ts }}'
                                    })

        task_oracle_execute_count = OracleGetResults(
            task_id='oracle_execute_count',
            current_dag_name=current_dag_name,
            oracle_conn_id=oracle_conn_id,
            sql_count_id=sql_count_id,
            dict_bind=dict_bind_sql_count_id)

        task_check_if_contains_data_in_oracle = BranchPythonOperator(
            task_id='check_if_contains_data_in_oracle',
            python_callable=AirflowMetaStoreHelper(
            ).check_if_contains_data_in_oracle,
            op_kwargs={
                'control_var': control_var,
                'last_control_var': last_control_var,
                'current_dag_name': current_dag_name,
                'true_case': 'extract_transform_load',
                'false_case': 'sync_data'
            })

        task_extract_transform_load = OracleTableToHdfsTransfer(
            task_id=f'extract_transform_load',
            retries=20,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            sql_get_data=sql_get_data,
            dict_bind=dict_bind_sql_get_data,
            col_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            step=step,
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            oracle_conn_id=oracle_conn_id,
            hdfs_conn_id='webhdfs',
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            max_registry_by_file=max_registry_by_file)

        task_sync_data = SyncData(
            task_id='sync_data',
            dag_name=dag_name,
            db_name=db_name,
            table_name=table_name,
            col_name_control_var=col_control_var,
            col_name_dt_ref=col_name_dt_ref,
            path_avro_schema=path_avro_schema,
            layer=layer,
            env=env,
            hdfs_conn_id='webhdfs',
            oracle_conn_id=oracle_conn_id,
            oracle_driver='oracle.jdbc.driver.OracleDriver',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            executor_instances=executor_instances,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            compress_type='snappy',
            max_registry_by_file=max_registry_by_file)

        task_concat_file = HdfsConcatFiles(
            task_id=f'hdfs_concat_file',
            retries=100,
            dag_name=dag_name,
            layer=layer,
            env=env,
            col_name_control_var=col_control_var,
            path_avro_schema=path_avro_schema,
            hdfs_conn_id='webhdfs',
            executor_cores=executor_cores,
            executor_memory=executor_memory,
            driver_memory=driver_memory,
            path_ojdbc=path_ojdbc,
            path_spark_avro=path_spark_avro,
            path_native_lib=path_native_lib,
            format_data='parquet',
            compress_type='snappy',
            max_registry_by_avro=max_registry_by_file)

        task_save_execution_state_hdfs = PythonOperator(
            task_id='save_execution_state_hdfs',
            python_callable=HdfsHelper('webhdfs').save_execution_state_hdfs,
            op_kwargs={
                'dag_name': dag_name,
                'control_var': control_var,
                'layer': layer
            })

        task_update_statistics = UpdateStasByTable(
            task_id=f'update_statistics',
            retries=50,
            db_name=dag_name,
            table_name=layer,
            impala_conn_id='impala')

        task_clear_airflow_var = PythonOperator(
            task_id='clear_airflow_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            python_callable=clear_airflow_var,
            op_kwargs={
                'airflow_conn_id':
                'airflow_db',
                'list_pattern_name_var':
                f"["
                f" '{dag_name}_%_total_row_id',"
                f" '{dag_name}_raw_total_registry_%',"
                f" '{dag_name}_list_dags',"
                f" '{dag_name}_list_path_to_concat',"
                f" '{dag_name}_list_path_to_rename',"
                f" '{dag_name}_list_all_dates'"
                f"]"
            })

        task_update_control_var = OracleUpdateControlVar(
            task_id='update_control_var',
            trigger_rule=TriggerRule.ALL_SUCCESS,
            depends_on_past=True,
            control_var=control_var,
            last_control_var=last_control_var,
            dag_name=dag_name,
            current_dag_name=current_dag_name,
            col_control_var=col_control_var,
            oracle_conn_id=oracle_conn_id,
            dict_bind=dict_bind_sql_get_data,
            sql=sql_get_data)

        task_trigger_import_file = TriggerDagRunOperator(
            task_id=f'trigger_import_file_{dag_name}',
            trigger_dag_id=dag.dag_id)

        task_end = PythonOperator(task_id='end',
                                  trigger_rule=TriggerRule.ALL_DONE,
                                  python_callable=end_time,
                                  op_kwargs={
                                      'current_dag_name': current_dag_name,
                                      'dag_name': dag_name,
                                      'last_control_var_name':
                                      f'{dag_name}_last_control_var',
                                      'postgres_conn_id': 'airflow_db'
                                  })

    # -----------------
    #      GRAPH
    # -----------------
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_extract_transform_load >> task_update_control_var >> task_trigger_import_file >> task_end
    task_start >> task_oracle_execute_count >> task_check_if_contains_data_in_oracle >> task_sync_data >> [
        task_concat_file, task_save_execution_state_hdfs
    ] >> task_update_statistics >> task_clear_airflow_var >> task_end

    return dag
Example #21
def _wait_for_supermarket(supermarket_id_):
    supermarket_path = Path("/data/" + supermarket_id_)
    data_files = supermarket_path.glob("data-*.csv")
    success_file = supermarket_path / "_SUCCESS"
    return data_files and success_file.exists()


for supermarket_id in range(1, 5):
    wait = PythonSensor(
        task_id=f"wait_for_supermarket_{supermarket_id}",
        python_callable=_wait_for_supermarket,
        op_kwargs={"supermarket_id_": f"supermarket{supermarket_id}"},
        dag=dag1,
    )
    copy = DummyOperator(task_id=f"copy_to_raw_supermarket_{supermarket_id}",
                         dag=dag1)
    process = DummyOperator(task_id=f"process_supermarket_{supermarket_id}",
                            dag=dag1)
    trigger_create_metrics_dag = TriggerDagRunOperator(
        task_id=f"trigger_create_metrics_dag_supermarket_{supermarket_id}",
        trigger_dag_id="listing_6_04_dag02",
        dag=dag1,
    )
    wait >> copy >> process >> trigger_create_metrics_dag

compute_differences = DummyOperator(task_id="compute_differences", dag=dag2)
update_dashboard = DummyOperator(task_id="update_dashboard", dag=dag2)
notify_new_data = DummyOperator(task_id="notify_new_data", dag=dag2)
compute_differences >> update_dashboard
Example #22
"""
Example usage of the TriggerDagRunOperator. This example holds 2 DAGs:
1. 1st DAG (example_trigger_controller_dag) holds a TriggerDagRunOperator, which will trigger the 2nd DAG
2. 2nd DAG (example_trigger_target_dag) which will be triggered by the TriggerDagRunOperator in the 1st DAG
"""
from datetime import datetime

from airflow import DAG
from airflow.operators.trigger_dagrun import TriggerDagRunOperator

with DAG(
    dag_id="example_trigger_controller_dag",
    start_date=datetime(2021, 1, 1),
    catchup=False,
    schedule_interval="@once",
    tags=['example'],
) as dag:
    trigger = TriggerDagRunOperator(
        task_id="test_trigger_dagrun",
        trigger_dag_id="example_trigger_target_dag",  # Ensure this equals the dag_id of the DAG to trigger
        conf={"message": "Hello World"},
    )
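
The module docstring above also describes example_trigger_target_dag, which this snippet cuts off before showing. A minimal sketch of such a target DAG, reading the conf payload from the triggering run (deliberately simpler than the real Airflow example; the callable and task names are illustrative):

from airflow.operators.python import PythonOperator


def _print_message(dag_run=None):
    # conf set by the TriggerDagRunOperator arrives on the triggered DagRun.
    message = dag_run.conf.get("message") if dag_run and dag_run.conf else None
    print(f"Remotely received message: {message}")


with DAG(
    dag_id="example_trigger_target_dag",
    start_date=datetime(2021, 1, 1),
    catchup=False,
    schedule_interval=None,  # runs only when triggered
    tags=['example'],
) as target_dag:
    PythonOperator(task_id="print_message", python_callable=_print_message)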
Example #23
        proxy_user='******',
        args=[
            'HOME=/user', 'USER=root', 'WF_HOME=/user/root',
            'ROOT_WORKFLOW_ID=aws_test_2_729508210',
            'CURRENT_WORKFLOW_ID=aws_test_2_729508210', 'SLIDE_SIZE=400',
            'FETCH_SIZE=1000', 'PARTITION_NUM=1', 'FAIL_THRESHOLD=1000',
            'DEBUG=true', 'MASTER='
        ],
        num_executors=1,
        conf={
            'spark.shuffle.compress': 'false',
        },
        class_name='ru.neoflex.meta.etl2.spark.aws_test_2Job',
        polling_interval=5,
        on_failure_callback=on_failure_callback)
    join >> transformation
    err4 = KillOperator(task_id='err4',
                        message='transformation or workflow failed',
                        trigger_rule=TriggerRule.ONE_FAILED)
    transformation >> err4
    workflow = TriggerDagRunOperator(task_id='wf_aws_test',
                                     trigger_dag_id='wf_aws_test',
                                     wait_for_completion=True,
                                     poke_interval=5,
                                     dag=dag,
                                     on_failure_callback=on_failure_callback)
    transformation >> workflow
    workflow >> err4
    finish = FinishOperator(task_id='finish', )
    workflow >> finish
Example #24
        create_timestamp = BashOperator(
            task_id='create_timestamp',
            bash_command='touch ~/timestamp_{{ ts_nodash }}',
        )
        task_sensor >> print_results >> remove_file >> create_timestamp
    return dag


# creates a dag
with DAG(dag_id='trigger_run',
         start_date=datetime(2021, 1, 26),
         schedule_interval='@once') as dag:
    # checks if a file exists
    check_for_file = SmartFileSensor(task_id='check_for_file', filepath=path)
    # triggers another dag
    trigger_dag = TriggerDagRunOperator(task_id='trigger_dag',
                                        trigger_dag_id=external_dag,
                                        execution_date='{{ execution_date }}')
    # creates a sub_dag that processes results
    process_results = SubDagOperator(task_id='process_results_dag',
                                     subdag=create_sub_dag(
                                         dag.dag_id,
                                         'process_results_dag',
                                         start_date=datetime(2021, 1, 26),
                                         schedule_interval='@once'))
    # sends a slack message
    alert_slack = PythonOperator(task_id='alert_slack',
                                 python_callable=slack_message)
    check_for_file >> trigger_dag >> process_results >> alert_slack
    globals()[dag.dag_id] = dag
Example #25
    aws_test2 = LivyOperator(
        task_id='aws_test2',
        dag=dag,
        livy_conn_id='livy_default',
        file=
        's3a://datagram/user/root/deployments/autogenerated_tr_aws_test_2/ru.neoflex.meta.etl2.spark.aws_test_2-1.0'
        '-SNAPSHOT.jar',
        proxy_user='******',
        args=[
            'HOME=/user', 'USER=root', 'WF_HOME=/user/root',
            'ROOT_WORKFLOW_ID=aws_test_2_729508210',
            'CURRENT_WORKFLOW_ID=aws_test_2_729508210', 'SLIDE_SIZE=400',
            'FETCH_SIZE=1000', 'PARTITION_NUM=1', 'FAIL_THRESHOLD=1000',
            'DEBUG=true', 'MASTER='
        ],
        num_executors=1,
        conf={
            'spark.shuffle.compress': 'false',
        },
        class_name='ru.neoflex.meta.etl2.spark.aws_test_2Job',
        polling_interval=5,
    )

    wf_aws_test = TriggerDagRunOperator(task_id='wf_aws_test',
                                        trigger_dag_id='wf_aws_test',
                                        wait_for_completion=True,
                                        poke_interval=5,
                                        dag=dag)
    aws_test2 >> wf_aws_test