Example #1
    def test_subdag_pools(self):
        """
        Subdags and subdag tasks can't both have a pool with 1 slot
        """
        dag = DAG('parent', default_args=default_args)
        subdag = DAG('parent.child', default_args=default_args)

        session = airflow.settings.Session()
        pool_1 = airflow.models.Pool(pool='test_pool_1', slots=1)
        pool_10 = airflow.models.Pool(pool='test_pool_10', slots=10)
        session.add(pool_1)
        session.add(pool_10)
        session.commit()

        dummy_1 = DummyOperator(task_id='dummy',
                                dag=subdag,
                                pool='test_pool_1')

        self.assertRaises(AirflowException,
                          SubDagOperator,
                          task_id='child',
                          dag=dag,
                          subdag=subdag,
                          pool='test_pool_1')

        # recreate dag because failed subdagoperator was already added
        dag = DAG('parent', default_args=default_args)
        SubDagOperator(task_id='child',
                       dag=dag,
                       subdag=subdag,
                       pool='test_pool_10')

        session.delete(pool_1)
        session.delete(pool_10)
        session.commit()
Example #2
    def test_subdag_name(self):
        """
        Subdag names must be {parent_dag}.{subdag task}
        """
        dag = DAG('parent', default_args=default_args)
        subdag_good = DAG('parent.test', default_args=default_args)
        subdag_bad1 = DAG('parent.bad', default_args=default_args)
        subdag_bad2 = DAG('bad.test', default_args=default_args)
        subdag_bad3 = DAG('bad.bad', default_args=default_args)

        SubDagOperator(task_id='test', dag=dag, subdag=subdag_good)
        self.assertRaises(AirflowException,
                          SubDagOperator,
                          task_id='test',
                          dag=dag,
                          subdag=subdag_bad1)
        self.assertRaises(AirflowException,
                          SubDagOperator,
                          task_id='test',
                          dag=dag,
                          subdag=subdag_bad2)
        self.assertRaises(AirflowException,
                          SubDagOperator,
                          task_id='test',
                          dag=dag,
                          subdag=subdag_bad3)
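
The rule this test asserts is the naming convention that sub-DAG factory functions follow: the child DAG's dag_id must be "<parent dag_id>.<SubDagOperator task_id>". A minimal sketch of such a factory (not taken from any of the examples here; function and task names are illustrative, and Airflow 1.x import paths are assumed):

from datetime import datetime

from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.subdag_operator import SubDagOperator

default_args = {'owner': 'airflow', 'start_date': datetime(2016, 1, 1)}


def make_subdag(parent_dag_id, task_id, args):
    # The child dag_id is derived from the parent dag_id and the wrapping
    # task_id, which is exactly what test_subdag_name enforces.
    child = DAG(dag_id='{}.{}'.format(parent_dag_id, task_id),
                default_args=args)
    DummyOperator(task_id='do_nothing', dag=child)
    return child


parent = DAG('parent', default_args=default_args)
SubDagOperator(task_id='child',
               dag=parent,
               subdag=make_subdag('parent', 'child', default_args))
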
Example #3
def subdag_task(database):
    sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, database,
                                                     '@once'),
                             task_id=database,
                             dag=main_dag,
                             pool='Pool_max_parallel_500',
                             executor=LocalExecutor())
    return sub_dag
Example #4
    task_id='test_depends_on_past_2',
    depends_on_past=True,
    dag=dag6,
)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(task_id='test_subdag_fail',
                               dag=subdag7,
                               python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,
)
subdag7_task3 = DummyOperator(task_id='test_subdag_dummy_2', dag=subdag7)
dag7_subdag1 = SubDagOperator(task_id='subdag', dag=dag7, subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
dag8 = DAG(dag_id='test_scheduled_queued_tasks',
           start_date=DEFAULT_DATE,
           end_date=DEFAULT_DATE,
           default_args=default_args)
dag8_task1 = PythonOperator(python_callable=fail,
                            task_id='test_queued_task',
                            dag=dag8,
                            pool='test_queued_pool')
Example #5
    def __new__(
        cls,
        parent_id,
        gcs_dirs_xcom,
        dst_dir,
        filename,
        schema_fields,
        table_name,
        task_id,
        dag,
    ):
        from airflow.utils.dates import days_ago

        args = {
            "start_date": days_ago(2),
        }

        bucket = get_bucket().replace("gs://", "", 1)
        full_table_name = format_table_name(table_name, is_staging=True)

        subdag = DAG(dag_id=f"{parent_id}.{task_id}", default_args=args)

        column_names = [schema["name"] for schema in schema_fields]

        # by convention, preface task names with dag_id
        op_col_select = PythonTaskflowOperator(
            task_id="select_cols",
            python_callable=_keep_columns,
            # note that this input should have form schedule/{execution_date}/...
            taskflow={
                "gcs_dirs": {
                    "dag_id": parent_id,
                    "task_ids": gcs_dirs_xcom
                }
            },
            op_kwargs={
                "dst_dir": dst_dir,
                "filename": filename,
                "required_cols": [],
                "optional_cols": column_names,
            },
            dag=subdag,
        )

        op_stage_bq = GoogleCloudStorageToBigQueryOperator(
            task_id="stage_bigquery",
            bucket=bucket,
            # note that we can't really pull a list out of xcom without subclassing
            # operators, so we rely on knowing that the task passing in
            # gcs_dirs_xcom data is using schedule/{execution_date}
            source_objects=[
                "schedule/{{execution_date}}/*/%s/%s" % (dst_dir, filename)
            ],
            schema_fields=schema_fields,
            destination_project_dataset_table=full_table_name,
            create_disposition="CREATE_IF_NEEDED",
            write_disposition="WRITE_TRUNCATE",
            # _keep_columns function includes headers in output
            skip_leading_rows=1,
            dag=subdag,
        )

        op_col_select >> op_stage_bq

        return SubDagOperator(subdag=subdag, dag=dag, task_id=task_id)
Example #6
        start_task >> dt_s3
        dt_s3 >> dt_sf
        dt_sf >> end

    return one_dag


#############################################################################
#Defining Main Dag structure
#############################################################################

main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once',
    #schedule_interval=timedelta(minutes=5),
    #max_active_runs=1
    concurrency=35)

database_list = ['database']

#Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, i,
                                                     '@once'),
                             task_id=i,
                             dag=main_dag,
                             pool='Pool_max_parallel_5',
                             executor=LocalExecutor())
Example #7
    bucket='s3://udacity-dend/song_data',
    table='staging_songs',
    queries=SqlQueries,
    json_format='auto',
    timestamped=False,
    provide_context=True)

load_songplays_table = LoadFactOperator(task_id='Load_songplays_fact_table',
                                        dag=dag,
                                        redshift_conn_id='redshift',
                                        queries=SqlQueries)

load_dimension_tables = SubDagOperator(
    task_id='Load_dimension_tables',
    subdag=load_dimension_table_subdag('final_dag', 'Load_dimension_tables',
                                       default_args,
                                       ['users', 'songs', 'artists', 'time']),
    default_args=default_args,
    dag=dag,
)

run_quality_checks = SubDagOperator(
    task_id='Run_data_quality_checks',
    subdag=run_data_quality_subdag(
        'final_dag', 'Run_data_quality_checks', default_args,
        [{
            'table': 'songplays',
            'tests': [{
                'check': 'SQL_COUNT',
                'operator': '>',
                'result': 0
            }]
Example #8

#############################################################################
#Defining Main Dag structure
#############################################################################

 

main_dag = DAG(
    dag_id=parent_dag_name,
    default_args=default_args,
    schedule_interval='@once'
    #schedule_interval=timedelta(minutes=5),
    #max_active_runs=1
)


database_list = get_database_list(database_include_patterns)


#Each database is an independent task that will run in parallel
for i in database_list:
    sub_dag = SubDagOperator(
        subdag=database_sub_dag(parent_dag_name, i, '@once'),
        task_id=i,
        dag=main_dag,
    )



Example #9
rescheduling loop.
"""
from datetime import datetime

from airflow.models import DAG
from airflow.operators import SubDagOperator
from airflow.example_dags.subdags.subdag import subdag

args = {
    'owner': 'airflow',
    'start_date': datetime(2016, 1, 1),
}

dag = DAG(
    dag_id='test_raise_executor_error',
    default_args=args,
    schedule_interval="@daily",
)

section_1 = SubDagOperator(
    task_id='subdag_op',
    subdag=subdag('test_raise_executor_error', 'subdag_op', args),
    default_args=args,
    dag=dag,
)

# change the subdag name -- this creates an error because the subdag
# won't be found, but it'll do it in a way that causes the executor to report
# success
section_1.subdag.dag_id = 'bad_id'
Example #10
def make_task_group(dagname, name, path, pdag, trigger_rule):
    args = {
        'owner':
        'deploy',
        'start_date':
        datetime.strptime(
            (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d'),
            '%Y-%m-%d')
    }
    dag = DAG(dag_id=dagname,
              default_args=args,
              schedule_interval=timedelta(hours=24))
    sdOp = SubDagOperator(task_id=name,
                          subdag=dag,
                          dag=pdag,
                          trigger_rule=trigger_rule)
    start = DummyOperator(task_id=name + '_start', dag=dag)
    end = DummyOperator(task_id=name + '_end', dag=dag)
    task_dict = {}
    task_dict[name + '_start'] = start
    task_dict[name + '_end'] = end

    config_str = open(path, 'r').read()
    # "a -> b" means data flows from a to b; the [dependency] section is not
    # valid INI syntax, but it is easier to read this way.
    config_arr = config_str.split('[dependency]')
    config_fp = StringIO.StringIO(config_arr[0])
    config = ConfigParser.RawConfigParser()
    config.readfp(config_fp)

    sections = config.sections()
    #print sections
    for section in sections:
        #print section
        if section != 'dependency':
            options = config.options(section)
            #print options
            if 'type' not in options or 'cmd' not in options:
                continue
            operator_type = config.get(section, 'type').strip()
            operator_cmd = config.get(section, 'cmd').strip()
            if operator_type == 'PostgresOperator':
                task = PostgresOperator(task_id=section.strip(),
                                        depends_on_past=False,
                                        postgres_conn_id='postgres_sha2dw03',
                                        sql=operator_cmd,
                                        dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            elif operator_type == 'BashOperator':
                task = BashOperator(task_id=section.strip(),
                                    depends_on_past=False,
                                    bash_command=operator_cmd,
                                    dag=dag)
                task_dict[section.strip()] = task
                task.set_downstream(end)
                task.set_upstream(start)
            else:
                print "Error: currently not support %s operator type" % operator_type

    if len(config_arr) == 1:
        return (start, end, dag, sdOp, task_dict)

    for line in config_arr[1].split('\n'):
        arr = line.split('->')
        if len(arr) != 2:
            continue
        left_side = arr[0].strip()
        right_side = arr[1].strip()
        if left_side in task_dict.keys() and right_side in task_dict.keys():
            if end in task_dict[left_side].downstream_list:
                task_dict[left_side].downstream_list.remove(end)
            task_dict[left_side].set_downstream(task_dict[right_side])
            if start in task_dict[right_side].upstream_list:
                task_dict[right_side].upstream_list.remove(start)
            if task_dict[right_side] in start.downstream_list:
                start.downstream_list.remove(task_dict[right_side])
            if task_dict[left_side] in end.upstream_list:
                end.upstream_list.remove(task_dict[left_side])
    return (start, end, dag, sdOp, task_dict)
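
The file format that make_task_group parses is only hinted at in the inline comment. A hypothetical config that this parser would accept (section names and commands are illustrative, not from the source): an INI-style part with a type and cmd option per task, followed by a [dependency] marker and "left -> right" lines:

[load_users]
type = PostgresOperator
cmd = INSERT INTO dw.users SELECT * FROM staging.users

[refresh_report]
type = BashOperator
cmd = /opt/etl/refresh_report.sh

[dependency]
load_users -> refresh_report

Everything before the [dependency] marker is read with RawConfigParser; everything after it is split line by line on "->" to rewire the start/end dummy tasks.
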
Example #11
        dbt_task = BashOperator(
            task_id=model_name,
            bash_command=(
                'cd ~/gospel && dbt run  --profile=warehouse --target=prod '
                '--non-destructive --models {simple_model_name}'
            ).format(simple_model_name=simple_model_name),
            dag=temp_dag)
        return dbt_task

    dbt_tasks = {}
    for node_name in set(G.nodes()):
        dbt_task = make_dbt_task(node_name)
        dbt_tasks[node_name] = dbt_task

    for edge in G.edges():
        dbt_tasks[edge[0]].set_downstream(dbt_tasks[edge[1]])
    return temp_dag


dbt_sub_dag = SubDagOperator(subdag=dbt_dag(dag.start_date,
                                            dag.schedule_interval,
                                            default_args=default_args),
                             task_id='dbt_sub_dag',
                             dag=dag,
                             trigger_rule='all_done')
dbt_sub_dag.set_upstream(copy_gpickle_file)

dbt_test = BashOperator(
    task_id='dbt_test',
    bash_command='cd ~/project && dbt test  --profile=warehouse --target=prod',
    dag=dag)
dbt_test.set_upstream(dbt_sub_dag)
Example #12
dag = DAG(
    dag_id=DAG_NAME,
    default_args=args,
    schedule_interval="@once",
)

start = DummyOperator(
    task_id='start',
    default_args=args,
    dag=dag,
)

section_1 = SubDagOperator(
    task_id='section-1',
    subdag=subdag(DAG_NAME, 'section-1', args),
    default_args=args,
    dag=dag,
)

some_other_task = DummyOperator(
    task_id='some-other-task',
    default_args=args,
    dag=dag,
)

section_2 = SubDagOperator(
    task_id='section-2',
    subdag=subdag(DAG_NAME, 'section-2', args),
    default_args=args,
    dag=dag,
)
Example #13
              schedule_interval='@once')

# define "tripDataSubdag" variable
tripDataSubdag = 'tripDataSubdag'

# NOTICE: a SubDag is actually a task, an instance of "SubDagOperator"
tripsSubdagTask = SubDagOperator(
    task_id=tripDataSubdag,
    #--------------------------------------------------------------------------
    # ATTENTION!
    #   - "subdag" is a mandatory "SubDagOperator" parameter
    #   - "subdag" receives the DAG returned by a Python factory function
    #--------------------------------------------------------------------------
    subdag=redshift_table_ddl_plus_bulk_load(
        parent_dag_id=parentDagId,
        parent_task_id=tripDataSubdag,
        redshift_conn_id="redshift",
        aws_credentials_id="aws_credentials",
        table='trips',
        create_table_statement=sql_statements.CREATE_TRIPS_TABLE_SQL,
        s3_bucket='udacity-dend',
        s3_key='data-pipelines/divvy/unpartitioned/divvy_trips_2018.csv',
        start_date=startDate),
    dag=mainDag)

# define "stationDataSubdag" variable
stationDataSubdag = 'stationDataSubdag'

# Remember the "Subdag/Task Parity"
stationDataSubdagTask = SubDagOperator(
Example #14
    True,
    "approved": True,
    "inProg": False,
    "done": False,
    "approvedBy": "karakuri",
    "workflow": workflow_id
})

print("TASKS: ", tasks)

dag = DAG('sfsc_review_new_airflow_process_tasks',
          default_args=default_args,
          schedule_interval=None)

start = DummyOperator(task_id='start', default_args=default_args, dag=dag)

process = SubDagOperator(
    task_id='process',
    subdag=subdag_tasks('sfsc_review_new_airflow_process_tasks', 'process',
                        tasks, default_args),
    default_args=default_args,
    dag=dag,
)

start >> process
Example #15
            "export_data_hql": export_table2_to_gcp_hql,
            "add_dataproc_partition_hql": add_dataproc_table2_partition_hql,
            "drop_tmp_table_hql": drop_tmp_table2_hql
        }
    }
}

export_table_params = {
    'gcp_warehouse_path': persist_cfg['gcp_warehouse']['casesci_prd'],
    'gcp_keyfile': persist_cfg['gcp_keyfile']['casesci_prd']
}

export_table_dataproc_config = {
    "dataproc_cluster": persist_cfg["gcp_prod_project"]["dataproc_cluster"],
    "region": persist_cfg["gcp_prod_project"]["region"],
    "gcp_conn_id": persist_cfg["gcp_prod_project"]["gcp_conn_id"]
}

for op_name in export_table_dict:
    export_table_dict[op_name]["operator"] = SubDagOperator(
        subdag=export_to_gcp_dag("crmdata_bids_bounds." + op_name,
                                 sync_data2GCP_dag.schedule_interval,
                                 persist_cfg['queue'],
                                 sync_data2GCP_dag.default_args,
                                 export_table_dict[op_name]["hql_dict"],
                                 export_table_params,
                                 export_table_dataproc_config),
        task_id=op_name,
        queue=persist_cfg['queue'],
        dag=sync_data2GCP_dag)
Example #16
gen_search_terms = BranchPythonOperator(task_id='generate_search_terms',
                                        provide_context=True,
                                        python_callable=generate_search_terms,
                                        dag=dag)

email_links = EmailOperator(
    task_id='email_best_links',
    to='*****@*****.**',
    subject='Latest popular links',
    html_content='Check out the latest!!',
    files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
    dag=dag)

sub = SubDagOperator(subdag=subdag,
                     task_id='insert_and_id_pop',
                     trigger_rule='one_success',
                     dag=dag)

clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,