Example No. 1
    def apply_task_to_dag(self):
        check_cloudformation_stack_exists_task = BranchPythonOperator(
            templates_dict={'stack_name': self.stack_name},
            task_id=f'is-cloudformation-{self.task_id}-running',
            python_callable=self.__cloudformation_stack_running_branch,
            provide_context=True,
            dag=self.dag)

        create_cloudformation_stack_task = CloudFormationCreateStackOperator(
            task_id=f'create-cloudformation-{self.task_id}',
            params={**self.__reformatted_params()},
            dag=self.dag)

        create_stack_sensor_task = CloudFormationCreateStackSensor(
            task_id=f'cloudformation-watch-{self.task_id}-create',
            stack_name=self.stack_name,
            dag=self.dag)

        stack_creation_end_task = DummyOperator(
            task_id=f'creation-end-{self.task_id}',
            dag=self.dag,
            trigger_rule='all_done')

        if self.parent:
            self.parent.set_downstream(check_cloudformation_stack_exists_task)

        create_stack_sensor_task.set_downstream(stack_creation_end_task)
        create_cloudformation_stack_task.set_downstream(
            create_stack_sensor_task)
        check_cloudformation_stack_exists_task.set_downstream(
            create_cloudformation_stack_task)
        check_cloudformation_stack_exists_task.set_downstream(
            stack_creation_end_task)

        return stack_creation_end_task
Example No. 2
    def apply_task_to_dag(self):
        check_dags_queued_task = BranchPythonOperator(
            task_id=f'{self.task_id}-is-dag-queue-empty',
            python_callable=self.__queued_dag_runs_exists,
            provide_context=True,
            trigger_rule=TriggerRule.ALL_DONE,
            dag=self.dag)

        delete_stack_task = CloudFormationDeleteStackOperator(
            task_id=f'delete-cloudformation-{self.task_id}',
            params={'StackName': self.stack_name},
            dag=self.dag)

        delete_stack_sensor = CloudFormationDeleteStackSensor(
            task_id=f'cloudformation-watch-{self.task_id}-delete',
            stack_name=self.stack_name,
            dag=self.dag)

        stack_delete_end_task = DummyOperator(
            task_id=f'delete-end-{self.task_id}', dag=self.dag)

        if self.parent:
            self.parent.set_downstream(check_dags_queued_task)

        check_dags_queued_task.set_downstream(stack_delete_end_task)
        check_dags_queued_task.set_downstream(delete_stack_task)
        delete_stack_task.set_downstream(delete_stack_sensor)
        delete_stack_sensor.set_downstream(stack_delete_end_task)

        return stack_delete_end_task
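
Both methods hand a private branch callable to BranchPythonOperator, but neither callable is part of the snippet. Below is a minimal sketch of the first one, assuming a boto3 CloudFormation lookup; only the task-id strings (which mirror the ones built in apply_task_to_dag) come from the example, while the class name and everything else are guesses.

import boto3
from botocore.exceptions import ClientError


class CloudFormationCreateStackTask:  # hypothetical name for the class above
    def __init__(self, task_id, stack_name, dag, parent=None):
        self.task_id = task_id
        self.stack_name = stack_name
        self.dag = dag
        self.parent = parent

    def __cloudformation_stack_running_branch(self, **kwargs):
        # Route to the create task when the stack is absent, otherwise skip
        # straight to the end task. templates_dict reaches the callable via
        # provide_context=True.
        stack_name = kwargs['templates_dict']['stack_name']
        try:
            stacks = boto3.client('cloudformation').describe_stacks(
                StackName=stack_name)['Stacks']
            stack_exists = bool(stacks)
        except ClientError:
            # describe_stacks raises when no stack with that name exists
            stack_exists = False
        if stack_exists:
            return f'creation-end-{self.task_id}'
        return f'create-cloudformation-{self.task_id}'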
Example No. 3
            PROJECT_PATH))

    # Ref: https://airflow.apache.org/macros.html for the jinja variables used below.
    EXP_CRAWL_COMMAND = Template("""
        cd $project_path && scrapy crawl treasury_expenditures -a start={{ ds_nodash }} -a end={{ ds_nodash }}
    """)

    EXP_CRAWL_TASK = BashOperator(
        task_id='crawl_expenditure',
        bash_command=EXP_CRAWL_COMMAND.substitute(project_path=PROJECT_PATH),
        trigger_rule='none_failed')

    REC_CRAWL_COMMAND = Template("""
        cd $project_path && scrapy crawl treasury_receipts -a start={{ ds_nodash }} -a end={{ ds_nodash }}
    """)

    REC_CRAWL_TASK = BashOperator(
        task_id='crawl_receipts',
        bash_command=REC_CRAWL_COMMAND.substitute(project_path=PROJECT_PATH),
        trigger_rule='none_failed')

BRANCH_OP = BranchPythonOperator(task_id='branch_task',
                                 provide_context=True,
                                 python_callable=branch_tasks,
                                 dag=dag)

CREATE_DIR.set_downstream(BRANCH_OP)
BRANCH_OP.set_downstream([CRAWL_DDO_CODES, EXP_CRAWL_TASK, REC_CRAWL_TASK])
CRAWL_DDO_CODES.set_downstream(EXP_CRAWL_TASK)
CRAWL_DDO_CODES.set_downstream(REC_CRAWL_TASK)
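
The branch_tasks callable wired into BRANCH_OP above is defined elsewhere. A minimal sketch of the kind of routing it might do, assuming the DDO code list only needs a periodic refresh; 'crawl_ddo_codes' is an assumed task_id, the other two come from the operators above, and returning a list of task_ids requires a reasonably recent Airflow.

def branch_tasks(execution_date, **kwargs):
    # Refresh the DDO code list once a week; otherwise jump straight to both
    # crawl tasks (their 'none_failed' trigger rule tolerates the skip).
    if execution_date.weekday() == 0:
        return 'crawl_ddo_codes'
    return ['crawl_expenditure', 'crawl_receipts']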
Example No. 4
# launch sagemaker hyperparameter job and wait until it completes
tune_model_task = SageMakerTuningOperator(task_id='model_tuning',
                                          dag=dag,
                                          config=tuner_config,
                                          aws_conn_id='airflow-sagemaker',
                                          wait_for_completion=True,
                                          check_interval=30)

# launch sagemaker batch transform job and wait until it completes
batch_transform_task = SageMakerTransformOperator(
    task_id='predicting',
    dag=dag,
    config=transform_config,
    aws_conn_id='airflow-sagemaker',
    wait_for_completion=True,
    check_interval=30,
    trigger_rule=TriggerRule.ONE_SUCCESS)

cleanup_task = DummyOperator(task_id='cleaning_up', dag=dag)

# set the dependencies between tasks

init.set_downstream(preprocess_task)
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(branching)
branching.set_downstream(tune_model_task)
branching.set_downstream(train_model_task)
tune_model_task.set_downstream(batch_transform_task)
train_model_task.set_downstream(batch_transform_task)
batch_transform_task.set_downstream(cleanup_task)
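
The branching task above chooses between tuning and training, but its callable is not shown. A hedged sketch, assuming the decision comes from the DAG run conf and that train_model_task uses the task_id 'model_training' (only 'model_tuning' is confirmed by the snippet); the ONE_SUCCESS trigger rule on batch_transform_task is what lets it run after whichever branch was taken.

def tune_or_train(**context):
    # Hypothetical branch: tune when explicitly requested, otherwise train.
    dag_run = context.get('dag_run')
    conf = (dag_run.conf or {}) if dag_run else {}
    return 'model_tuning' if conf.get('tune') else 'model_training'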
Example No. 5
    if version_path in [v['name'] for v in model_versions]:
        logging.info("Delete previously version of the model to overwrite.")
        mle.delete_version(PROJECT, model_name, version_params['name'])

    mle.create_version(PROJECT, model_name, version_params)

# Checks if model exists using Hook instead of GCP operators due to conditional.
t10 = PythonOperator(
    task_id='list_versions', dag=dag, python_callable=do_list_versions)

# Creates model if it does not exist using Hook instead of GCP operators
t11 = PythonOperator(
    task_id='create_version', dag=dag, python_callable=do_create_version)

# Create task graph
t1.set_downstream(t2)
t2.set_downstream(t3)
t3.set_downstream(t4_train_cond)
t4_train_cond.set_downstream(t4_ml_engine)
t4_ml_engine.set_downstream([t4a, t4b, t4c])
t4_ml_engine.set_downstream(t5d)
t4a.set_downstream(t5a)
t4b.set_downstream(t5b)
t4c.set_downstream(t5c)
t6.set_upstream([t5a, t5b, t5c, t5d])
t6.set_downstream(t7)
t7.set_downstream(t8)
t9.set_upstream(t8)
t9.set_downstream(t10)
t10.set_downstream(t11)
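
do_list_versions, referenced by t10, is not shown. A minimal sketch, assuming mle is the same ML Engine hook used in do_create_version and that it exposes a list_versions method with the same (project, model) calling convention; verify against the hook version you actually run.

import logging


def do_list_versions():
    # Assumed helper: list the versions currently deployed for the model so a
    # later step can decide whether one needs to be overwritten.
    versions = mle.list_versions(PROJECT, model_name)
    logging.info("Existing versions: %s", [v['name'] for v in versions])
    return versions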
Example No. 6
import calendar
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import BranchPythonOperator

default_args = {
    'owner': 'ankityadav',
    'depends_on_past': False,
    'retries': 1,
    'start_date': datetime(2019, 1, 14)
}

dag = DAG('batch_operator', default_args=default_args, schedule_interval='@once')

months = ["January", "February", "March", "April", "May", "June", "July",
          "August", "September", "October", "November", "December"]


def branch():
    # Return the task_id that matches the current month, e.g. 'task_for_January'.
    for m in months:
        if m == calendar.month_name[datetime.now().month]:
            return 'task_for_' + m


hello = BranchPythonOperator(
    task_id='branching',
    python_callable=branch,
    provide_context=False,
    dag=dag)


# One dummy task per month; the branch above selects exactly one of them.
for month in months:
    hello.set_downstream(DummyOperator(task_id='task_for_' + month, dag=dag))
Example No. 7

def _print_high():
    return 'HIGH'


default_args = {
    'owner': 'pedro',
    'retries': 0,
    'start_date': datetime(2020, 4, 10)
}
with DAG('random_number_extended',
         default_args=default_args,
         schedule_interval='0 4 * * *') as dag:
    dummy_start_task = DummyOperator(task_id='dummy_start')
    generate_random_number = BashOperator(task_id='generate_random_number',
                                          bash_command=bash_cmd)
    # New branch operator
    read_num_and_square = BranchPythonOperator(
        task_id='read_number_and_square_it',
        python_callable=_read_number_and_square,
        op_args=[STORE_DIR],
        provide_context=True,  # pass task instance params to python callable
    )
    print_high = PythonOperator(task_id='print_high',
                                python_callable=_print_high)
    print_low = BashOperator(task_id='print_low', bash_command='echo LOW')
    # Define tasks (normal path and then each branch)
    dummy_start_task >> generate_random_number >> read_num_and_square >> print_high
    read_num_and_square.set_downstream(print_low)
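
Neither bash_cmd nor _read_number_and_square appears in the snippet. A sketch of what the branch callable might look like, assuming generate_random_number writes a single integer into STORE_DIR; the file name and the threshold are assumptions, while the two task_ids come from the operators above.

import os


def _read_number_and_square(store_dir, **context):
    # Read the number produced upstream, square it, and pick a branch.
    with open(os.path.join(store_dir, 'random_number.txt')) as fh:
        number = int(fh.read().strip())
    squared = number ** 2
    print(f'{number}^2 = {squared}')
    return 'print_high' if squared > 2500 else 'print_low'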
Example No. 8

serve_branch = BranchPythonOperator(task_id="serve_or_not",
                                    python_callable=model_exist,
                                    dag=dag)

t3 = KubernetesPodOperator(
    namespace="default",
    name="{}-restapi".format(model_name.lower()),
    image="tensorflow/serving:latest",
    env_vars={
        'MODEL_NAME': model_name,
        'MODEL_BASE_PATH': '/root/runtime/models'
    },
    task_id="serve_model",
    port=8501,
    dag=dag,
    # NOTE: 'async' is a reserved keyword from Python 3.7 onward, so this
    # argument only parses on older interpreters.
    async=True,
    in_cluster=True,
    labels={'name': '{}-restapi'.format(model_name.lower())},
    volume_mounts=[volume_mount],
    volumes=[volume])

t4 = DummyOperator(task_id="update_version_or_not_serve", dag=dag)
t0.set_downstream(cleanup_branch)
cleanup_branch.set_downstream(t1)
cleanup_branch.set_downstream(t2)
t2.set_downstream(serve_branch)
serve_branch.set_downstream(t3)
serve_branch.set_downstream(t4)
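
The model_exist callable behind serve_branch is not shown. A hedged sketch, assuming the decision is a simple filesystem check against the mounted model directory (the path mirrors MODEL_BASE_PATH from t3 and is an assumption); both returned task_ids are confirmed by the snippet.

import os


def model_exist():
    # Serve the model only if a saved version is already present.
    model_dir = '/root/runtime/models/{}'.format(model_name)
    if os.path.isdir(model_dir) and os.listdir(model_dir):
        return 'serve_model'
    return 'update_version_or_not_serve'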
Example No. 9
            'type': 'STRING'
        }, {
            'name': 'predicted_monetary',
            'type': 'FLOAT'
        }, {
            'name': 'predictions',
            'type': 'FLOAT'
        }],
        source_format="NEWLINE_DELIMITED_JSON",
        skip_leading_rows=1,
        destination_project_dataset_table="{}.{}.{}".format(
            PROJECT, dataset, 'predictions'),
        create_disposition="CREATE_IF_NEEDED",
        write_disposition="WRITE_TRUNCATE",
        dag=dag).execute(kwargs)


t3 = PythonOperator(task_id='list_predictions_files',
                    dag=dag,
                    python_callable=do_list_predictions_files)

t4 = PythonOperator(task_id='load_to_bq',
                    dag=dag,
                    python_callable=do_load_to_bq)

# How to link them
t0_predict_cond.set_downstream([t1a, t1b])
t2.set_upstream([t1a, t1b])
t3.set_upstream([t1a, t1b])
t3.set_downstream(t4)
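
do_list_predictions_files is referenced by t3 but not shown. A minimal sketch, assuming the prediction shards land in a GCS bucket and that the plain GoogleCloudStorageHook is acceptable here; the bucket name, prefix, and hook choice are assumptions.

from airflow.contrib.hooks.gcs_hook import GoogleCloudStorageHook

PREDICTIONS_BUCKET = 'my-predictions-bucket'  # assumption, not in the snippet


def do_list_predictions_files():
    # List the prediction files produced upstream so do_load_to_bq knows what
    # to load; the return value is pushed to XCom as 'return_value'.
    hook = GoogleCloudStorageHook()
    return hook.list(PREDICTIONS_BUCKET, prefix='predictions/')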
Example No. 10
    f"{BASE_PACKAGE}.transactional-tables",
    "OutletsByDate",
    dag,
    RETAIL_ID,
    schema_name,
    ENV_TYPE,
)
items_by_date_task = bash_operator_for_spark_submit(
    f"{BASE_PACKAGE}.transactional-tables",
    "ItemsByDate",
    dag,
    RETAIL_ID,
    schema_name,
    ENV_TYPE,
)

push_instruments.set_downstream(push_server_details)
branch_task.set_upstream(push_server_details)
branch_task.set_downstream(master_tables_load)

branch_task.set_downstream(history_load_done)
master_tables_load.set_downstream(create_table_structure)
history_load_done.set_downstream(create_table_structure)
create_table_structure.set_downstream(unix_chmod_task)
unix_chmod_task.set_downstream(market_baskets_task)
market_baskets_task.set_downstream(
    [transaction_line_item_task, outlets_by_date_task, items_by_date_task])
data_load_done.set_upstream(
    [transaction_line_item_task, outlets_by_date_task, items_by_date_task])
create_constraint_task.set_upstream(data_load_done)
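
bash_operator_for_spark_submit is a project helper that the snippet only calls. A hedged sketch of its likely shape, wrapping spark-submit in a BashOperator; the task_id scheme, jar path, and spark-submit flags are illustrative only.

from airflow.operators.bash_operator import BashOperator


def bash_operator_for_spark_submit(package, job_class, dag, retail_id,
                                   schema, env_type):
    # Build one BashOperator per Spark job, deriving the task_id from the class.
    return BashOperator(
        task_id='{}_load'.format(job_class),
        bash_command=('spark-submit --class {pkg}.{cls} /opt/jobs/etl.jar '
                      '{retail} {schema} {env}').format(
                          pkg=package, cls=job_class, retail=retail_id,
                          schema=schema, env=env_type),
        dag=dag)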
Example No. 11
def create_dag():
    dag = DAG(
        DAG_ID,
        default_args=default_args,
        # Be sure to stagger the dags so they don't run all at once,
        # possibly causing max memory usage and pod failure. - Stu M.
        schedule_interval='30 * * * *',
        catchup=False)
    with dag:
        start_task = DummyOperator(task_id='start')
        finish_task = DummyOperator(task_id='finish')

        for table, sources in table_map.items():
            pusher_task_id = f'schedule_dataflow_{table}'
            parsed_table = gcloud.parse_table_name(table)

            get_checkpoint_task = GetCheckpointOperator(
                task_id=f'get_checkpoint_{table}',
                env=env,
                target=table,
                sources=sources)

            continue_if_data_task = BranchPythonOperator(
                task_id=f'continue_if_data_{table}',
                python_callable=should_continue,
                op_args=[table],
                provide_context=True)

            parse_query_task = PythonOperator(task_id=f'parse_query_{table}',
                                              python_callable=parse_query,
                                              op_args=[table],
                                              provide_context=True)

            dataflow_task = ScheduleDataflowJobOperator(
                task_id=pusher_task_id,
                project=gcloud.project(env),
                template_name=f'load_lake_to_staging_{parsed_table}',
                job_name=f'lake-to-staging-{table}',
                job_parameters={'env': env},
                pull_parameters=[{
                    'param_name': 'query',
                    'task_id': f'parse_query_{table}'
                }],
                provide_context=True)

            monitor_dataflow_task = DataflowJobStateSensor(
                task_id=f'monitor_df_job_{table}',
                poke_interval=airflow_vars['dags']['lake_to_staging']['poke_interval'],
                timeout=airflow_vars['dags']['lake_to_staging']['poke_timeout'],
                dag=dag,
                pusher_task_id=pusher_task_id)

            set_checkpoint_task = SetCheckpointOperator(
                task_id=f'set_checkpoint_{table}', env=env, table=table)

            start_task.set_downstream(get_checkpoint_task)
            get_checkpoint_task.set_downstream(continue_if_data_task)
            continue_if_data_task.set_downstream(parse_query_task)
            parse_query_task.set_downstream(dataflow_task)
            dataflow_task.set_downstream(monitor_dataflow_task)
            monitor_dataflow_task.set_downstream(set_checkpoint_task)
            set_checkpoint_task.set_downstream(finish_task)

        start_task >> finish_task
    return dag
Example No. 12
    provide_context=True,
    dag=dag
)
create_source_id.set_upstream(source_data_sensor)


clean_data = HiveOperator(
    task_id='clean_data',
    hql=hql.HQL_CLEAN_DATA.format(source_id="{{ task_instance.xcom_pull(task_ids='create_source_id') }}",
                                  clean_mydata='clean_mydata', mydata='mydata'),
    schema='my_hive_db',
    provide_context=True,
    dag=dag
)
clean_data.set_upstream(create_source_id)
count_data_rows.set_downstream([stop_flow, clean_data])


move_data_mysql = PythonOperator(
    task_id='move_data_mysql',
    python_callable=tasks.move_data_mssql,
    templates_dict={'schema': 'my_hive_db'},
    provide_context=True,
    dag=dag
)
move_data_mysql.set_upstream(clean_data)


send_email = EmailOperator(
    task_id='send_email',
    to='*****@*****.**',
Example No. 13
    mle.create_version(PROJECT, model_name, version_params)


# Checks if model exists using Hook instead of GCP operators due to conditional.
t10 = PythonOperator(task_id='list_versions',
                     dag=dag,
                     python_callable=do_list_versions)

# Creates model if it does not exist using Hook instead of GCP operators
t11 = PythonOperator(task_id='create_version',
                     dag=dag,
                     python_callable=do_create_version)

# Create task graph
t1.set_downstream(t2)
t2.set_downstream(t3)
t3.set_downstream(t4_train_cond)
t4_train_cond.set_downstream([t4_ml_engine, t4_automl])
t4_ml_engine.set_downstream([t4a, t4b, t4c])
t4_ml_engine.set_downstream(t5d)
t4a.set_downstream(t5a)
t4b.set_downstream(t5b)
t4c.set_downstream(t5c)
t6.set_upstream([t5a, t5b, t5c, t5d])
t6.set_downstream(t7)
t7.set_downstream(t8)
t9.set_upstream(t8)
t9.set_downstream(t10)
t10.set_downstream(t11)
Example No. 14
        return True
    else:
        return False


validate_data = BranchPythonOperator(
    task_id='Validate_Data',
    python_callable=lambda: "export_investor_report_s3"
    if is_data_valid() else "notify_validation_failure",
    dag=dag)

export_report = PostgresUnloaderOperator(
    task_id="export_investor_report_s3",
    dag=dag,
    postgres_conn_id=EDW.get_conn_id(),
    source=
    "select * from loanreview.loan_in_review WHERE cast(update_date as DATE) = current_date - 1",
    uri=EDW.get_s3_stage_uri(path="lir.csv"),
    execution_timeout=timedelta(minutes=30))

notify_slack_validation_fail = SlackAPIPostOperator(
    message="Validation failure. Cannot export {0} to s3".format(
        "investor report"),
    task_id="notify_validation_failure",
    dag=dag,
    execution_timeout=timedelta(minutes=30))

start.set_downstream(refresh_table)
refresh_table.set_downstream(validate_data)
validate_data.set_downstream([export_report, notify_slack_validation_fail])
Example No. 15
            python_callable=get_endpoint,
            op_args=[e, SAVE_PATH, BASE_URL, API_KEYS],
        )

        t_branch = BranchPythonOperator(task_id=branch_task_id,
                                        python_callable=row_count_branch,
                                        op_args=[
                                            get_enpdpoints_task_id,
                                            file_to_gcs_task_id,
                                            zero_branch_task_id
                                        ],
                                        trigger_rule="all_done")

        t_gcs = FileToGoogleCloudStorageOperator(
            task_id=file_to_gcs_task_id,
            google_cloud_storage_conn_id='gcs_silo',
            bucket="deanslist",
            src="{{ task_instance.xcom_pull(task_ids='" +
            get_enpdpoints_task_id + "', key='dl_file_path' )}}",
            #dst = "TEST/" + endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}",
            dst=endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" +
            get_enpdpoints_task_id + "', key='dl_file_name') }}",
            dag=dag)

        t_zero_row = DummyOperator(task_id=zero_branch_task_id)

        t2.set_upstream(t1)
        t2.set_downstream(t_branch)
        t_branch.set_downstream(t_gcs)
        t_branch.set_downstream(t_zero_row)
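
row_count_branch receives the three task_ids above but is not itself shown. A minimal sketch, assuming the endpoint task pushes its row count to XCom under a 'row_count' key (that key is a guess) and that the task context is available to the callable (Airflow 2, or provide_context=True on 1.x).

def row_count_branch(endpoint_task_id, gcs_task_id, zero_task_id, **context):
    # Upload to GCS only when the endpoint returned rows; otherwise fall
    # through to the zero-row dummy task.
    rows = context['ti'].xcom_pull(task_ids=endpoint_task_id, key='row_count')
    return gcs_task_id if rows else zero_task_id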
Example No. 16

# pushes the week day index (0 = Monday ... 6 = Sunday) to XCom under the key 'day'
def get_day(**kwargs):
    print(kwargs['ti'])
    kwargs['ti'].xcom_push(key='day', value=datetime.now().weekday())


# returns the task_id of the task to launch (task_for_monday, task_for_tuesday, etc.)
def branch(**kwargs):
    print(kwargs)
    return 'task_for_' + tabDays[kwargs['ti'].xcom_pull(task_ids='weekday',
                                                        key='day')]


# PythonOperator 'weekday' retrieves the week day and pushes it to XCom
get_weekday = PythonOperator(task_id='weekday',
                             python_callable=get_day,
                             provide_context=True,
                             dag=dag)
# BranchPythonOperator pulls the value pushed by 'weekday' and decides which task to launch next
fork = BranchPythonOperator(task_id='branching',
                            python_callable=branch,
                            provide_context=True,
                            dag=dag)
# get the week day first, then branch on it
get_weekday.set_downstream(fork)
# One dummy operator per week day (weekday() returns 0-6), all branched from the fork
for day in range(7):
    fork.set_downstream(
        DummyOperator(task_id='task_for_' + tabDays[day], dag=dag))
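
tabDays itself is not part of the snippet; for the task_ids above to resolve it presumably looks like the list below (the lower-case spelling is a guess).

tabDays = ['monday', 'tuesday', 'wednesday', 'thursday',
           'friday', 'saturday', 'sunday']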
Example No. 17
    task_id='ingesting_row_data_HIVE',
    default_args=default_args,
    dag=dag,
)

#SparkSubmitOperator(
ingesting_graph_HBASE = DummyOperator(
    task_id='ingesting_graph_HBASE',
    default_args=default_args,
    dag=dag,
)


def decide_which_path():
    # NOTE: a BranchPythonOperator must return the task_id of one of its
    # direct downstream tasks; as wired below those are
    # 'ingesting_row_data_HIVE' and the notification task, so these
    # placeholder names would need to be replaced with real task_ids.
    if True:
        return "branch_a"
    else:
        return "branch_b"


branch_checkForChanges = BranchPythonOperator(
    task_id='check_data_availablity',
    python_callable=decide_which_path,
    trigger_rule="all_done",
    dag=dag)

branch_checkForChanges.set_downstream(ingesting_row_data_HIVE)
branch_checkForChanges.set_downstream(sendNotification)
ingesting_row_data_HIVE.set_downstream(ingesting_graph_HBASE)
ingesting_graph_HBASE.set_downstream(sendNotification)
Example No. 18
branch_b = PythonOperator(
    task_id='branch_b',
    python_callable=print_branchb,
    dag=dag)  # specify which DAG this task belongs to

def print_branchc():
    return 'Hello branchc!'
 
branch_c = PythonOperator(
    task_id='branch_c',
    python_callable=print_branchc,
    dag=dag)  # specify which DAG this task belongs to
#-------------------------------------------------------------------------------
def decide_which_path():
    if 1 > 1:
        return "branch_a"
    else:
        return "branch_b"


branch_task = BranchPythonOperator(
    task_id='run_this_first',
    python_callable=decide_which_path,
    trigger_rule="all_done",
    dag=dag)
#-------------------------------------------------------------------------------
# dependencies
branch_task.set_downstream(branch_a)  # the adaptation, intermediate, and application layers all depend on branch_a
branch_task.set_downstream(branch_b)
branch_a.set_downstream(branch_c)
Example No. 19
                             python_callable=WriteToFile,
                             op_kwargs={
                                 'file_name': 'empty_log.log',
                                 'chk': False
                             },
                             provide_context=True,
                             email=['*****@*****.**'],
                             email_on_failure=True,
                             dag=dag)

collect_data = PythonOperator(task_id='collect_data',
                              depends_on_past=False,
                              python_callable=WriteToFile,
                              op_kwargs={
                                  'file_name': 'data_log.log',
                                  'chk': True
                              },
                              email=['*****@*****.**'],
                              provide_context=True,
                              email_on_failure=True,
                              dag=dag)

# Forking based on the condition
fork = BranchPythonOperator(task_id='branching',
                            python_callable=branching,
                            provide_context=True,
                            dag=dag)

check_data.set_downstream(fork)
fork.set_downstream(collect_data)
fork.set_downstream(empty_table)
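
The branching callable, along with the check_data and empty_table tasks, sits in the truncated part of this example. A hedged sketch of what it might do, assuming check_data's return value indicates whether any rows were written and that the truncated tasks use task_ids matching their variable names; only 'collect_data' is confirmed above.

def branching(**context):
    # Collect data only when check_data found something to write.
    has_data = context['ti'].xcom_pull(task_ids='check_data')
    return 'collect_data' if has_data else 'empty_table'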