Example #1
def create_dag(client):
    dag = DAG(DAG_ID,
              default_args=default_args,
              schedule_interval=None)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table in airflow_vars['dags']['vibe_to_lake']['tables']:
            pusher_task_id = f'schedule_dataflow_{table}'
            schedule_df_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=project,
                template_name='load_vibe_to_lake',
                job_name=f'vibe-to-lake---client---{table}',
                job_parameters={
                    'client': '--client--',
                    'table': f'`{project}.pyr_--client--_{env}.{table}`',
                    'dest': f'{project}:lake.{table}'
                },
                provide_context=True
            )
            monitor_df_job_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                pusher_task_id=pusher_task_id,
                poke_interval=airflow_vars['dags']['vibe_to_lake']['poke_interval'],
                timeout=airflow_vars['dags']['vibe_to_lake']['poke_timeout'],
                dag=dag
            )
            start_task.set_downstream(schedule_df_task)
            schedule_df_task.set_downstream(monitor_df_job_task)
            monitor_df_job_task.set_downstream(finish_task)

        start_task >> finish_task
    return dag
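Note that create_dag() only builds and returns the DAG object; the Airflow scheduler will only pick it up once it is bound to a module-level name. A minimal registration sketch, assuming a hypothetical get_client_list() helper that is not part of the example above:

# Hypothetical registration loop: get_client_list() is an assumption,
# only create_dag() comes from the example above.
for client in get_client_list():
    client_dag = create_dag(client)
    # Airflow discovers DAGs by scanning module-level globals.
    globals()[client_dag.dag_id] = client_dag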
Example #2
def create_dag(client):
    dag = DAG(DAG_ID, default_args=default_args, schedule_interval=None)
    with dag:
        with open(f'{settings.DAGS_FOLDER}/table_lists/table-list.json', 'r') as f:
            tables = json.load(f)

        should_run_task = BranchPythonOperator(
            task_id='should_run',
            python_callable=should_run
        )
        start_sql_instance_task = BashOperator(
            task_id='start_sql_instance',
            bash_command=start_sql_cmd
        )
        pre_delete_database_task = BashOperator(
            task_id=f'pre_delete_{database_name}_database',
            bash_command=delete_db_cmd
        )
        create_db_task = BashOperator(
            task_id=f'create_{database_name}_database',
            bash_command=create_db_cmd
        )
        import_db_task = BashOperator(
            task_id=f'import_{database_name}_database',
            bash_command=import_db_cmd
        )
        delete_db_import_file_task = PythonOperator(
            task_id='delete_db_import_file',
            python_callable=delete_db_import_file
        )
        post_delete_database_task = BashOperator(
            task_id=f'post_delete_{database_name}_database',
            bash_command=delete_db_cmd
        )
        stop_sql_instance_task = BashOperator(
            task_id='stop_sql_instance',
            bash_command=stop_sql_cmd
        )
        finish_task = DummyOperator(
            task_id='finish'
        )

        try:
            for t in tables:
                pusher_task_id = f'schedule_dataflow_job_for_{t["table"]}'
                schedule_df_task = ScheduleDataflowJobOperator(
                    task_id=pusher_task_id,
                    project=project_id,
                    template_name='load_sql_to_bq',
                    job_name=f'load---client---{t["table"]}-sql-to-bq',
                    job_parameters={
                        'env': env,
                        'client': '--client--',
                        'bq_table': f'{project_id}:{database_name}.{t["table"]}',
                        'table': t["table"],
                        'key_field': t["keyField"]
                    },
                    provide_context=True
                )
                monitor_df_job_task = DataflowJobStateSensor(
                    task_id=f'monitor_df_job_{t["table"]}',
                    pusher_task_id=pusher_task_id,
                    poke_interval=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_interval'],
                    timeout=airflow_vars['dags']['vibe_to_bq_initial_load']['poke_timeout']
                )
                import_db_task.set_downstream(schedule_df_task)
                schedule_df_task.set_downstream(monitor_df_job_task)
                monitor_df_job_task.set_downstream(delete_db_import_file_task)
        except Exception as e:
            log.error(e)

        (
            should_run_task
            >> start_sql_instance_task
            >> pre_delete_database_task
            >> create_db_task
            >> import_db_task
            >> delete_db_import_file_task
            >> post_delete_database_task
            >> stop_sql_instance_task
            >> finish_task
        )

    return dag
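The loop above assumes every entry in table-list.json exposes a "table" name and a "keyField". A minimal sketch of the shape the factory expects; the concrete values are illustrative, not taken from the real file:

import json

# Illustrative contents of table_lists/table-list.json (assumed shape):
# [
#     {"table": "users",  "keyField": "id"},
#     {"table": "orders", "keyField": "order_id"}
# ]
with open('table_lists/table-list.json') as f:
    tables = json.load(f)

# Each entry must provide the two keys the DAG factory reads.
assert all({'table', 'keyField'} <= entry.keys() for entry in tables)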
Example #3
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)

            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)

            parse_query_task = PythonOperator(task_id=f'parse_query_{table}',
                                              python_callable=parse_query,
                                              op_args=[table],
                                              provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)

            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}', env=env, table=table)

            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)

        start_task >> finish_task
    return dag
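continue_if_data_task follows the BranchPythonOperator contract: its callable receives the op_args plus the task context and returns the task_id of the branch to follow. A hedged sketch of what should_continue might look like; the XCom lookup and the skip target are assumptions, only the signature is implied by the operator definition above:

def should_continue(table, **context):
    # Assumed: GetCheckpointOperator pushes its result to XCom.
    checkpoint = context['ti'].xcom_pull(task_ids=f'get_checkpoint_{table}')
    if checkpoint:
        # New data for this table: continue down the Dataflow chain.
        return f'parse_query_{table}'
    # Assumed skip target; the real DAG may route the "no data" case differently.
    return 'finish'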