dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sd_paving_imcat_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: send file update email to interested parties
send_last_file_updated_email = PoseidonEmailFileUpdatedOperator(
    task_id='send_last_file_updated',
    to='[email protected],[email protected],[email protected]',
    subject='IMCAT Streets File Updated',
    file_url='http://{}/{}'.format(conf['dest_s3_bucket'],
                                   'tsw/sd_paving_imcat_datasd.csv'),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution order

#: streets_latest_only must run before get_streets_data
get_streets_data.set_upstream(streets_latest_only)

#: upload_streets_data is dependent on successful run of get_streets_data
upload_streets_data.set_upstream(get_streets_data)

#: email notification is sent after the data was uploaded to S3
send_last_file_updated_email.set_upstream(upload_streets_data)
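
#: Note (equivalent form, not in the original DAG): the three set_upstream calls above
#: can also be written with Airflow's bitshift composition as one chain:
#:   streets_latest_only >> get_streets_data >> upload_streets_data >> send_last_file_updated_email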
Example #2
get_code_enf_files = PythonOperator(
    task_id='get_code_enf_files',
    python_callable=dfg.get_files,
    op_kwargs={'fname_list': fname_list,
               'target_dir': dsd_temp_dir},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_code_enf_md = get_seaboard_update_dag('code-enforcement-violations.md', dag)

#: Execution rules
#: dsd_ce_latest_only must run before get_code_enf_files
get_code_enf_files.set_upstream(dsd_ce_latest_only)


for i in fname_list:
    #: Create fme shell command
    build_csv_task = BashOperator(
        task_id='get_' + i,
        bash_command=get_bash_command(i),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: build_csv_task runs after get_code_enf_files downloads the files
    build_csv_task.set_upstream(get_code_enf_files)
Example #3
#: Upload prod file to S3
cfs_to_S3 = S3FileTransferOperator(
    task_id='cfs_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_calls_for_service_'+curr_year+'_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_calls_for_service_'+curr_year+'_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cfs_md = get_seaboard_update_dag('police-calls-for-service.md', dag)

#: Execution rules:

#: pd_cfs_latest_only must run before get_cfs_data
get_cfs_data.set_upstream(pd_cfs_latest_only)

#: Data processing is triggered after data retrieval.
process_cfs_data.set_upstream(get_cfs_data)

#: Data upload to S3 is triggered after data processing completion.
cfs_to_S3.set_upstream(process_cfs_data)

#: Github update depends on S3 upload success.
update_pd_cfs_md.set_upstream(cfs_to_S3)
Example #4
    return None  # load new data to mongodb


load_new_data_task = PythonOperator(
    task_id='load_new_data',
    python_callable=load_new_data,
    dag=dag)


def extract_type(ds, **kwargs):
    year, month, day = ds.split('-')  # ds arrives as 'YYYY-MM-DD', e.g. '2016-04-22'
    c_ds = "%s/%s/%s" % (day, month, year)  # becomes 'DD/MM/YYYY', e.g. '22/04/2016'
    count = 0
    tp = kwargs['tp']
    keyword = kwargs['keyword']
    for andamento in Andamentos.objects(data=c_ds):
        texto_lw = andamento.texto.lower()
        if keyword in texto_lw:
            andamento.tipo = tp
            andamento.save()
            count += 1
    return count
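
# A minimal illustration (not part of the original DAG) of the date conversion that
# extract_type applies to Airflow's `ds` value: 'YYYY-MM-DD' -> 'DD/MM/YYYY'.
example_ds = '2016-04-22'
year, month, day = example_ds.split('-')
assert "%s/%s/%s" % (day, month, year) == '22/04/2016'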


for tp in PROGRESS_TYPES:
    extract_tipo_task = PythonOperator(
        task_id='extract_%s_task' % (tp,),
        python_callable=extract_type, op_kwargs={'tp': tp, 'keyword': PROGRESS_TYPES[tp]},
        dag=dag, provide_context=True)
    extract_tipo_task.set_upstream(load_new_data_task)
   on_success_callback=notify,
   dag=dag)



#: update permits.md file
update_permits_md = get_seaboard_update_dag('permits.md', dag)

#: update solar-permits.md file
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)


#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

#: clean_data is executed after get_permits_files
clean_data.set_upstream(get_permits_files)

#: join_bids is executed after clean_data
join_bids.set_upstream(clean_data)

#: subset_solar is executed after join_bids
subset_solar.set_upstream(join_bids)

#: upload_dsd_permits is executed after subset_solar
upload_dsd_permits.set_upstream(subset_solar)

#: upload_solar_permits is executed after subset_solar
upload_solar_permits.set_upstream(subset_solar)
Example #6
    sleep_time = context['task_instance'].xcom_pull(key='sleep')['b']
    time.sleep(sleep_time)

    # push by returning
    return 2


def ab_task(**context):
    a, b = context['task_instance'].xcom_pull(task_ids=['a_task', 'b_task'])
    print('{} + {}: {}'.format(a, b, a + b))
    return (a + b)


dag = DAG('xcom_add', description='Simple tutorial DAG',
          schedule_interval=None,
          start_date=datetime(2018, 9, 3),
          catchup=False)


t_src = PythonOperator(task_id='source_task', python_callable=source_task, provide_context=True, dag=dag)
t_a = PythonOperator(task_id='a_task', python_callable=a_task, provide_context=True, dag=dag)
t_b = PythonOperator(task_id='b_task', python_callable=b_task, provide_context=True, dag=dag)
t_ab = PythonOperator(task_id='ab_task', python_callable=ab_task, provide_context=True, dag=dag)


t_src.set_downstream(t_a)
t_src.set_downstream(t_b)

t_ab.set_upstream(t_a)
t_ab.set_upstream(t_b)
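
# Equivalent wiring using Airflow's bitshift operators with task lists (shown as
# comments so the dependencies above are not registered a second time):
#   t_src >> [t_a, t_b]
#   [t_a, t_b] >> t_ab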
    provide_context=True,
    python_callable=clear_export_folder,
    dag=dag
    )

export_athena_scifi_table = AWSAthenaOperator(
    task_id="export_athena_scifi_table",
    # query=export_athena_scifi_table_query,
    query=export_athena_scifi_table_query2,
    workgroup="devday-demo",
    database=athena_db,
    sleep_time=60,
    output_location='s3://' + s3_dlake + "/" + athena_output + 'export_athena_scifi_table'
    )


export_scifi_tofile = PythonOperator(
    task_id='export_scifi_tofile',
    provide_context=True,
    python_callable=export_scifi_tofile,
    dag=dag
    )

check_athena_export_table.set_upstream(disp_variables)
drop_athena_export_table.set_upstream(check_athena_export_table)
check_athena_export_table_done.set_upstream(check_athena_export_table)
check_athena_export_table_pass.set_upstream(drop_athena_export_table)
check_athena_export_table_pass.set_upstream(check_athena_export_table_done)
export_athena_scifi_table.set_upstream(clear_export_folder)
clear_export_folder.set_upstream(check_athena_export_table_pass)
export_scifi_tofile.set_upstream(export_athena_scifi_table)
Example #8
    df = df.astype('float64')
    df.to_csv('dags/c2k_final.csv')


default_args = {
    'owner': 'Israel Z',
    'start_date': dt.datetime(2018, 5, 9),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

with DAG('flow_pandas',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         ) as dag:

    download = PythonOperator(task_id='download',
                              python_callable=download)
    dropn = PythonOperator(task_id='dropn',
                           python_callable=dropn)
    fill = PythonOperator(task_id='fill',
                          python_callable=fill)
    cast = PythonOperator(task_id='cast',
                          python_callable=cast)

# Dependencies

dropn.set_upstream(download)
fill.set_upstream(dropn)
cast.set_upstream(fill)
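
# The same linear ordering could also be expressed with Airflow's chain helper
# (airflow.utils.helpers.chain in 1.10.x, airflow.models.baseoperator.chain in 2.x);
# shown as a comment to avoid registering the dependencies twice:
#   chain(download, dropn, fill, cast)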
Example #9
dag = DAG(dag_id='sire_docs', default_args=args, start_date=start_date, schedule_interval=schedule)

sire_docs_latest_only = LatestOnlyOperator(task_id='sire_docs_latest_only', dag=dag)

#: Get sire tables
get_doc_tables = PythonOperator(
    task_id='get_sire_tables',
    python_callable=get_sire,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
#: sire_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(sire_docs_latest_only)

files = os.listdir(conf['prod_data_dir'])
for f in files:
    if f.split('_')[0] == "sire":
        #: Upload sire prod files to S3
        upload_doc_tables = S3FileTransferOperator(
            task_id='upload_{}'.format(f),
            source_base_path=conf['prod_data_dir'],
            source_key=f,
            dest_s3_conn_id=conf['default_s3_conn_id'],
            dest_s3_bucket=conf['dest_s3_bucket'],
            dest_s3_key='city_docs/{}'.format(f),
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking-meters-transactions.md', dag)

#: Execution Rules

#: parking_meters_latest_only must run before get_parking_files
get_parking_files.set_upstream(parking_meters_latest_only)

#: Download Files, build prod file.

#: build_prod_file depends on get_parking_files:
build_prod_file.set_upstream(get_parking_files)

#: Upload Prod File

#: upload_prod_file depends on build_prod_file
upload_prod_file.set_upstream(build_prod_file)

#: Build Aggs

#: build_by_month_aggregation depends on build_prod_file:
build_by_month_aggregation.set_upstream(build_prod_file)

#: build_by_day_aggregation depends on build_prod_file:
build_by_day_aggregation.set_upstream(build_prod_file)

#: Upload Aggs
Example #11
    on_success_callback=notify,
    dag=dag)

#: Upload prod cip_datasd.csv file to S3
upload_cip_data = S3FileTransferOperator(
    task_id='upload_cip_data',
    source_base_path=conf['prod_data_dir'],
    source_key='cip_{0}_datasd.csv'.format(fiscal_yr),
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='cip/' + 'cip_{0}_datasd.csv'.format(fiscal_yr),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)


#: Update portal modified date
update_cip_md = get_seaboard_update_dag('cip.md', dag)

#: Execution order
#: cip_latest_only must run before get_cip_data
get_cip_data.set_upstream(cip_latest_only)

#: upload_cip_data is dependent on successful run of get_cip_data
upload_cip_data.set_upstream(get_cip_data)

#: upload_cip_data must succeed before updating github
update_cip_md.set_upstream(upload_cip_data)
Example #12
date_operator = BashOperator(
    task_id='date_task',
    bash_command='date',
    dag=dag)

#-------------------------------------------------------------------------------
# second operator

sleep_operator = BashOperator(
    task_id='sleep_task',
    depends_on_past=False,
    bash_command='sleep 5',
    dag=dag)

#-------------------------------------------------------------------------------
# third operator

def print_hello():
    return 'Hello world!'

hello_operator = PythonOperator(
    task_id='hello_task',
    python_callable=print_hello,
    dag=dag)

#-------------------------------------------------------------------------------
# dependencies

sleep_operator.set_upstream(date_operator)
hello_operator.set_upstream(date_operator)
    source_base_path=conf['prod_data_dir'],
    source_key='stormwater_violations_merged.geojson',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw_int/stormwater_violations_merged.geojson',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)





#: Execution rules
# Get VPM violations runs after latest only
get_vpm_violations.set_upstream(violations_latest_only)
# Get salesforce violations runs after latest only
get_sf_violations.set_upstream(violations_latest_only)
# Get pts violations runs after latest only
get_pts_violations.set_upstream(violations_latest_only)
# SW Violations merge runs after get_pts and get_sf
combine_sw_violations.set_upstream(get_sf_violations)
combine_sw_violations.set_upstream(get_pts_violations)
combine_sw_violations.set_upstream(get_vpm_violations)
# Upload of CSV happens after combine
violations_csv_to_s3.set_upstream(combine_sw_violations)
violations_geojson_to_s3.set_upstream(combine_sw_violations)
violations_csv_null_geos_to_s3.set_upstream(combine_sw_violations)
Example #14
    get_task = PythonOperator(
        task_id='get_' + machine_service_name,
        python_callable=get_requests_service_name,
        op_kwargs={
            'service_name': service_name,
            'machine_service_name': machine_service_name
        },
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    service_tasks.append(get_task)

    #: create_prod_files must run before get_task
    get_task.set_upstream(create_prod_files)

    if i == 'pothole':
        #: get_task must run before sonar potholes
        get_task.set_downstream(create_potholes_sonar)

filename = conf['prod_data_dir'] + "/get_it_done_*.csv"
files = [os.path.basename(x) for x in glob.glob(filename)]

for index, file_ in enumerate(files):
    file_name = file_.split('.')[0]
    name_parts = file_name.split('_')
    task_name = '_'.join(name_parts[3:-2])
    md_name = '-'.join(name_parts[3:-2])

    #: Upload prod gid file to S3
Example #15
#: Upload prod file to S3
collisions_to_S3 = S3FileTransferOperator(
    task_id='collisions_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_collisions_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_collisions_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cls_md = get_seaboard_update_dag('police-collisions.md', dag)

#: Execution rules:

#: pd_col_latest_only must run before get_collisions_data
get_collisions_data.set_upstream(pd_col_latest_only)

#: Data processing is triggered after data retrieval.
process_collisions_data.set_upstream(get_collisions_data)

#: Data upload to S3 is triggered after data processing completion.
collisions_to_S3.set_upstream(process_collisions_data)

#: Github update depends on S3 upload success.
update_pd_cls_md.set_upstream(collisions_to_S3)
Example #16
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='sde/tree_canopy_datasd.pbf',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_gis_md = get_seaboard_update_dag('tree-canopy-2014.md', dag)

#: Execution order

#: Latest only operator must run before getting tree canopy data
get_shapefiles.set_upstream(treecan_latest_only)

#: get_shapefiles must run before converting to geojson
shp_to_geojson.set_upstream(get_shapefiles)

#: to_geojson must run before converting to geobuf
geojson_to_geobuf.set_upstream(shp_to_geojson)

#: to_geobuf must run before zipping geobuf
geobuf_zip.set_upstream(geojson_to_geobuf)

#: get_shapefiles must run before zipping shapefile
shape_zip.set_upstream(get_shapefiles)

#: zipping shapefile must run before uploading
upload_shp_file.set_upstream(shape_zip)
Example #17
def push_by_returning(**kwargs):
    # pushes an XCom without a specific target, just by returning it
    return value_2


def puller(**kwargs):
    ti = kwargs['ti']

    # get value_1
    v1 = ti.xcom_pull(key=None, task_ids='push')
    assert v1 == value_1

    # get value_2
    v2 = ti.xcom_pull(task_ids='push_by_returning')
    assert v2 == value_2

    # get both value_1 and value_2
    v1, v2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning'])
    assert (v1, v2) == (value_1, value_2)


push1 = PythonOperator(task_id='push', dag=dag, python_callable=push)

push2 = PythonOperator(task_id='push_by_returning',
                       dag=dag,
                       python_callable=push_by_returning)

pull = PythonOperator(task_id='puller', dag=dag, python_callable=puller)

pull.set_upstream([push1, push2])
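
# Note (assumption; the `push` function and `value_1`/`value_2` definitions are elided
# above): pulling with key=None matches an XCom pushed under any key, so `push`
# presumably calls ti.xcom_push(...) explicitly, while `push_by_returning` relies on
# the implicit 'return_value' XCom created from its return value.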
Example #18
)
email_task = PythonOperator(
    task_id='email_currency',
    python_callable=email_currency,    
    dag=dag
)

yesterday_task = PythonOperator(
    task_id='yesterday_currency',
    python_callable=get_yesterdays_data,
    op_kwargs={
        "engine": PSQL_CONN,
    },    
    dag=dag
)

yesterday_upsert = PythonOperator(
    task_id='yesterday_upsert',
    python_callable=upsert_yesterdays_data,
    op_kwargs={
        "engine": PSQL_CONN,
    },    
    dag=dag
)

pull_task.set_downstream(run_task)
run_task.set_downstream(email_task)
yesterday_upsert.set_downstream(yesterday_task)
yesterday_upsert.set_upstream(run_task)
yesterday_task.set_upstream(run_task)
yesterday_task.set_downstream(email_task)
Example #19
            task_id=get_enpdpoints_task_id,
            python_callable=get_endpoint_with_dates,
            op_args=[SAVE_PATH, BASE_URL, API_KEYS],
            templates_dict=ep_template
            )

    t_branch = BranchPythonOperator(
        task_id=branch_task_id,
        python_callable=row_count_branch,
        op_args=[get_enpdpoints_task_id, file_to_gcs_task_id, zero_branch_task_id],
        trigger_rule="all_done"
        )

    t_gcs = FileToGoogleCloudStorageOperator(
        task_id=file_to_gcs_task_id,
        google_cloud_storage_conn_id='gcs_silo',
        bucket="deanslist",
        src="{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_path') }}",
        dst=endpoint_name + "/{{ task_instance.xcom_pull(task_ids='" + get_enpdpoints_task_id + "', key='dl_file_name') }}",
        dag=dag
    )

    t_zero_row = DummyOperator(
        task_id=zero_branch_task_id
        )

    t2.set_upstream(t1)
    t2.set_downstream(t_branch)
    t_branch.set_downstream(t_gcs)
    t_branch.set_downstream(t_zero_row)
Example #20
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(task_id='bash_test',
                  bash_command='echo "hello, it should work"',
                  dag=dag)

sensor = S3KeySensor(task_id='check_s3_for_file_in_s3',
                     bucket_key='logs*',
                     wildcard_match=True,
                     bucket_name='airflow-logs-ben',
                     aws_conn_id='my_conn_S3',
                     timeout=18 * 60 * 60,
                     poke_interval=120,
                     dag=dag)

pythonop = PythonOperator(start_date=datetime(2016, 11, 1),
                          python_callable=check_it,
                          task_id="my_python_check")
pythonop_write = PythonOperator(start_date=datetime(2016, 11, 1),
                                python_callable=write_it,
                                op_args=["me"],
                                task_id="my_python_write")

t1.set_upstream(sensor)
pythonop.set_upstream(sensor)
pythonop_write.set_upstream(sensor)
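
# Note: with wildcard_match=True the bucket_key 'logs*' is treated as a Unix-style
# wildcard pattern; the sensor pokes S3 every 120 seconds for up to 18 hours, and the
# three downstream tasks above run only once a matching key is found.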
Example #21
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

delta = timedelta(seconds=3)
dag = DAG('test_delete_dag',
          default_args=default_args,
          schedule_interval=delta)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)


def py_callable(*args, **kwargs):
    print "args = "
    print args
    print "kwargs = "
    print kwargs


t3 = PythonOperator(task_id='py_callable',
                    python_callable=py_callable,
                    op_args=['dogs'],
                    op_kwargs={'cats': 20},
                    provide_context=True,
                    dag=dag)

t3.set_upstream(t1)
Example #22
                  df_R,
                  on=['country_region', 'province_state', 'event_date'])
    df = pd.merge(df,
                  df_D,
                  on=['country_region', 'province_state', 'event_date'])
    df['mortality_rate'] = df['d_cases'] / df['c_cases']
    df['recovery_rate'] = df['r_cases'] / df['c_cases']
    #df_final = df[COLUMNS_VIEW]
    df_final = df
    with db_connection.begin() as transaction:
        transaction.execute("DELETE FROM covid.cases_data WHERE 1=1")
        df_final.to_sql("cases_data",
                        con=transaction,
                        schema="covid",
                        if_exists="append",
                        index=False)


integration_procces = PythonOperator(dag=dag,
                                     task_id="integration_procces",
                                     provide_context=True,
                                     python_callable=integration_procces)

file_sensor_task_C >> transform_process_C >> insert_process_C
file_sensor_task_R >> transform_process_R >> insert_process_R
file_sensor_task_D >> transform_process_D >> insert_process_D

#[insert_process_C,insert_process_R,insert_process_D] >> integration_procces
integration_procces.set_upstream(insert_process_C)
integration_procces.set_upstream(insert_process_R)
integration_procces.set_upstream(insert_process_D)
Example #23
    # pushes an XCom without a specific target, just by returning it
    return value_2


def puller(**kwargs):
    ti = kwargs['ti']

    # get value_1
    v1 = ti.xcom_pull(key=None, task_ids='push')
    assert v1 == value_1

    # get value_2
    v2 = ti.xcom_pull(task_ids='push_by_returning')
    assert v2 == value_2

    # get both value_1 and value_2
    v1, v2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning'])
    assert (v1, v2) == (value_1, value_2)


push1 = PythonOperator(
    task_id='push', dag=dag, python_callable=push)

push2 = PythonOperator(
    task_id='push_by_returning', dag=dag, python_callable=push_by_returning)

pull = PythonOperator(
    task_id='puller', dag=dag, python_callable=puller)

pull.set_upstream([push1, push2])
run_this = PythonOperator(
   task_id='Start',
   provide_context=True,
   python_callable=print_context,
   dag=dag,
)

#1
# execute_impala_by_sql_file('forecast_store_code_scope_sprint4',\
#                            './sqls/1.forecast_store_code_scope_sprint4.sql')
step1 = PythonOperator(task_id="step1",
                              python_callable=execute_impala_by_sql_file,
                              op_kwargs={'table_name': "forecast_store_code_scope_sprint4",
                                 'file_path':f'{config["parent_path"]}/data_preperation/data_aggregation/regular_item/1.forecast_store_code_scope_sprint4.sql'},
                              dag=dag)
step1.set_upstream(run_this)

#2
# execute_impala_by_sql_file('forecast_itemid_list_threebrands_sprint4',\
#                            './sqls/2.forecast_itemid_list_threebrands_sprint4.sql')
step2 = PythonOperator(task_id="step2",
                              python_callable=execute_impala_by_sql_file,
                              provide_context=True,
                              op_kwargs={'table_name': "forecast_itemid_list_threebrands_sprint4",
                                 'file_path':f'{config["parent_path"]}/data_preperation/data_aggregation/regular_item/2.forecast_itemid_list_threebrands_sprint4.sql',
                                 'set_timeperiod':True},
                              dag=dag)
step2.set_upstream(step1)

#3
# execute_impala_by_sql_file('forecast_item_id_family_codes_sprint4',\
Example #25

check_updates_with_judges_task = PythonOperator(
    task_id='check_updates_with_judges',
    python_callable=check_updates_with_judges,
    dag=dag)


def extract_name():
    # TODO: Create a function to extract the judge's name from the text
    return None  # http://blog.yhat.com/posts/named-entities-in-law-and-order-using-nlp.html


def check_name():
    # TODO: Verify the extracted name
    return None  # Validate against a list of judge names (transparency portal)


extract_name_task = PythonOperator(
    task_id='extract_name_task',
    python_callable=extract_name,
    dag=dag)

check_name_task = PythonOperator(
    task_id='check_name_task',
    python_callable=check_name,
    dag=dag)

extract_name_task.set_upstream(check_updates_with_judges_task)
check_name_task.set_upstream(extract_name_task)
Example #26
}

dag = DAG(**dag_params)

clean = PythonOperator(task_id='clean', python_callable=executor.clean, dag=dag)

check_connect = PythonOperator(task_id='check_connect', python_callable=executor.check_connect, dag=dag)

backup_docs = PythonOperator(task_id='backup_docs', python_callable=executor.backup_docs, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_trans = PythonOperator(task_id='backup_trans', python_callable=executor.backup_trans, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_performance = PythonOperator(task_id='backup_performance', python_callable=executor.backup_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
backup_fields = PythonOperator(task_id='backup_fields', python_callable=executor.backup_field, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)


dim_field = PythonOperator(task_id='dim_field', python_callable=executor.dim_field, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
fact_document = PythonOperator(task_id='fact_document', python_callable=executor.fact_document, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
fact_performance = PythonOperator(task_id='fact_performance', python_callable=executor.fact_performance, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)
fact_data_extract = PythonOperator(task_id='fact_data_extract', python_callable=executor.fact_data_extract, dag=dag, trigger_rule=TriggerRule.ALL_SUCCESS)

report = PythonOperator(task_id='report', python_callable=executor.report, dag=dag, trigger_rule=TriggerRule.ALL_DONE)



clean >> check_connect >> [backup_docs, backup_trans, backup_performance, backup_fields]

dim_field.set_upstream(backup_fields)
fact_document.set_upstream([backup_docs, backup_trans])
fact_performance.set_upstream([backup_performance, fact_document])
fact_data_extract.set_upstream([fact_document, fact_performance])

[dim_field, fact_document, fact_performance, fact_data_extract] >> report
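
# Note: `report` uses TriggerRule.ALL_DONE, so it runs once every upstream task has
# finished regardless of success or failure, while the backup and fact tasks use
# ALL_SUCCESS and therefore stop the chain on any upstream failure.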

def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i)/10},
        dag=dag)

    task.set_upstream(run_this)
    task_id='dsd_approvals_latest_only', dag=dag)

#: Get most recent weekly permit approvals reports
get_approvals_files = PythonOperator(
    task_id='get_approvals_files',
    python_callable=dfg.get_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    op_kwargs={'fname_list': fnames,
               'target_dir': dsd_temp_dir},
    dag=dag)


#: dsd_approvals_latest_only must run before get_approvals_files
get_approvals_files.set_upstream(dsd_approvals_latest_only)

#: update github modified date (solar permits)
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)

for key in app.approval_dict:

    #: Consolidate weekly permitting data by scraping OpenDSD API
    scrape_dsd = PythonOperator(
        task_id='scrape_dsd_' + key,
        python_callable=app.scrape_dsd,
        op_kwargs={'key': key},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)
#: Uploads the generated production file
upload_traffic_counts = S3FileTransferOperator(
    task_id='upload_traffic_counts',
    source_base_path=conf['prod_data_dir'],
    source_key='traffic_counts_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='traffic_counts/traffic_counts_datasd.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_traffic_md = get_seaboard_update_dag('traffic-volumes.md', dag)

#: Execution Rules

#: traffic_counts_latest_only must run before get_traffic_counts
get_traffic_counts.set_upstream(tc_latest_only)
#: Cleaning task triggered after data retrieval.
clean_traffic_counts.set_upstream(get_traffic_counts)
#: Production build task triggered after cleaning task.
build_traffic_counts.set_upstream(clean_traffic_counts)
#: Data upload to S3 triggered after production build task.
upload_traffic_counts.set_upstream(build_traffic_counts)
#: Update .md file after S3 upload
update_traffic_md.set_upstream(upload_traffic_counts)
Example #30
    source_key='latest_indicator_bac_tests_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='water_testing/latest_indicator_bac_tests_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_water_md = get_seaboard_update_dag(
    'monitoring-of-indicator-bacteria-in-drinking-water.md',
    dag)

#: Execution Rules

#: water_latest_only must run before get_indicator_bac_tests
get_indicator_bac_tests.set_upstream(wtr_latest_only)
#: Upload indicator bac tests after it has successfully run
upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests)

#: get_latest_bac_tests is dependent on get_indicator_bac_tests
get_latest_bac_tests.set_upstream(get_indicator_bac_tests)

#: Upload latest indicator bac tests after the file has been generated
upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests)

#: update .md file after S3 upload
update_water_md.set_upstream(upload_latest_indicator_bac_tests)
# Convert input CSV files
for table in SECMAR_TABLES + ["operations_valides"]:
    t = PythonOperator(
        task_id="transform_" + table,
        python_callable=secmar_transform,
        provide_context=True,
        dag=dag,
        pool="transform",
        op_kwargs={
            "in_path": in_path(table),
            "out_path": out_path(table),
            "transformer": secmar_transformer(table),
        },
    )
    t.set_upstream(start)
    t.set_downstream(end_transform)

create_tables = PythonOperator(
    task_id="create_tables",
    python_callable=create_tables_fn,
    provide_context=True,
    dag=dag,
)
create_tables.set_upstream(end_transform)

# Import CSV files into PostgreSQL
embulk_operations = embulk_import(dag, "operations")
embulk_operations.set_upstream(create_tables)
embulk_operations.set_downstream(end_import)
Example #32
            "flood-monitoring/archive/" + filename, filename)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("object doesn't exist")
        else:
            raise
    return filename


def insertDailyDump(cassandra, credentials, **kwargs):
    yesterday_ds = kwargs['yesterday_ds']
    filename = downloadDatafile(yesterday_ds, credentials)
    putInCassandra(filename, cassandra)
    os.remove(filename)


connection = BaseHook.get_connection("s3_conn")
extra = connection.extra
parsed_extra = json.loads(extra)
cassandra = BaseHook.get_connection("cassandra_connection")
putIn = PythonOperator(task_id='put_in_cassandra',
                       python_callable=insertDailyDump,
                       provide_context=True,
                       op_kwargs={
                           'cassandra': cassandra,
                           'credentials': parsed_extra
                       },
                       dag=dag)

putIn.set_upstream(s3ready)
Example #33
    with open(output_path, "w") as fp:
        fp.write(input_value)
        fp.write("\n\n")
        fp.write(str(datetime.now().strftime("%Y-%m-%dT%H:%M:%S")))
    return "success"


with DAG(dag_id="dbnd_operators", default_args=default_args) as dag_operators:
    # t1, t2 and t3 are examples of tasks created by instantiating operators
    t1 = my_task(2)
    t2, t3 = my_multiple_outputs(t1)
    tp = PythonOperator(
        task_id="some_python_function",
        python_callable=some_python_function,
        op_kwargs={
            "input_path": t3,
            "output_path": "/tmp/output.txt"
        },
    )
    tp.set_upstream(t3.op)

    t1_op = t1.op

if __name__ == "__main__":
    ti = TaskInstance(t1_op, days_ago(0))
    ti.run(ignore_task_deps=True, ignore_ti_state=True, test_mode=True)
    # #
    #
    # dag_operators.clear()
    # dag_operators.run()
# Task JOIN


def join_data():
    filtro = pd.read_csv(data_path + 'enade_filtro.csv')
    idadecent = pd.read_csv(data_path + 'idadecent.csv')
    idadequadrado = pd.read_csv(data_path + 'idadequadrado.csv')
    estcivil = pd.read_csv(data_path + 'estcivil.csv')
    cor = pd.read_csv(data_path + 'cor.csv')

    final = pd.concat([filtro, idadecent, idadequadrado, estcivil, cor],
                      axis=1)  # concatenate column by column

    final.to_csv(data_path + 'enade_tratado.csv', index=False)
    print(final)


# PythonOperator for join_data
task_join = PythonOperator(task_id='join_data',
                           python_callable=join_data,
                           dag=dag)

# Defining the execution chaining
start_processing >> get_data >> unzip_file >> task_aplica_filtro

task_aplica_filtro >> [task_idade_cent, task_est_civil, task_cor]

task_idade_quad.set_upstream(task_idade_cent)

task_join.set_upstream([task_est_civil, task_cor, task_idade_quad])
    pyop_unzip_file = PythonOperator(task_id='unzip_file',
                                     python_callable=unzip_file)

    pyop_filter_data = PythonOperator(task_id='filter_data',
                                      python_callable=filter_data)

    pyop_mean_normalize_age = PythonOperator(
        task_id='mean_normalize_age', python_callable=get_mean_normalized_age)

    pyop_squared_mean_normalize_age = PythonOperator(
        task_id='squared_mean_normalize_age',
        python_callable=get_squared_mean_normalized_age)

    pyop_marital_status = PythonOperator(task_id='marital_status',
                                         python_callable=get_marital_status)

    pyop_skin_color = PythonOperator(task_id='skin_color',
                                     python_callable=get_skin_color)

    pyop_join_data = PythonOperator(task_id='join_data',
                                    python_callable=join_data)

    baop_start_process >> baop_get_data >> pyop_unzip_file >> pyop_filter_data
    pyop_filter_data >> pyop_mean_normalize_age >> pyop_squared_mean_normalize_age
    pyop_filter_data >> [pyop_marital_status, pyop_skin_color]

    pyop_join_data.set_upstream([
        pyop_squared_mean_normalize_age, pyop_marital_status, pyop_skin_color
    ])
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod art file to S3
upload_public_art = S3FileTransferOperator(
    task_id='upload_public_art',
    source_base_path=conf['prod_data_dir'],
    source_key='public_art_locations_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='public_art/public_art_locations_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_public_art_md = get_seaboard_update_dag('public-art.md', dag)


#: Execution rules
#: public_art_latest_only must run before get_public_art
get_public_art.set_upstream(public_art_latest_only)
#: get_public_art must run before file upload
upload_public_art.set_upstream(get_public_art)
#: upload_public_art must succeed before updating github
update_public_art_md.set_upstream(upload_public_art)
    print("All target DAGs are finished")


args = {
    'start_date': days_ago(1),
    'owner': 'airflow',
}

dag = DAG(
    dag_id='trigger_with_multi_dagrun_sensor',
    max_active_runs=1,
    schedule_interval='@hourly',
    default_args=args,
)

gen_target_dag_run = TriggerMultiDagRunOperator(
    task_id='gen_target_dag_run',
    dag=dag,
    trigger_dag_id='common_target',
    python_callable=generate_dag_run,
)

# Wait until there is no running instance of target DAG
wait_target_dag = MultiDagRunSensor(task_id='wait_target_dag', dag=dag)
wait_target_dag.set_upstream(gen_target_dag_run)

after_dags_handler_op = PythonOperator(task_id='after_dags_handler',
                                       python_callable=after_dags_handler,
                                       dag=dag)
after_dags_handler_op.set_upstream(wait_target_dag)
Example #38
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'indicator_bacteria_monitoring'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_water_md = get_seaboard_update_dag(
    'monitoring-of-indicator-bacteria-in-drinking-water.md', dag)

#: Execution Rules

#: water_latest_only must run before get_indicator_bac_tests
get_indicator_bac_tests.set_upstream(wtr_latest_only)
#: Upload indicator bac tests after it has successfully run
upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests)

#: get_latest_bac_tests is dependent on get_indicator_bac_tests
get_latest_bac_tests.set_upstream(get_indicator_bac_tests)

#: Upload latest indicator bac tests after the file has been generated
upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests)

#: update .md file after S3 upload
update_water_md.set_upstream(upload_latest_indicator_bac_tests)

#: upload data must succeed before updating json
update_json_date.set_upstream(upload_latest_indicator_bac_tests)
Example #39
    provide_context=True,
    python_callable=load_rea_config,
    dag=dag,
)

expression_filter = PythonOperator(
    task_id='exp_filter',
    provide_context=True,
    python_callable=base_func['expression_filter'],
    dag=dag,
    params=PARAM_OBJ.params(base_func['expression_filter']),
)

expression_filter_test = PythonOperator(
    task_id='exp_filter_test',
    provide_context=True,
    python_callable=expression_filter_test,
    dag=dag,
    params=PARAM_OBJ.params(base_func['expression_filter']),
)

# end_task = PythonOperator(
#     task_id='dag_end',
#     provide_context=True,
#     python_callable=dag_end,
#     dag=dag
# )

# end_task.set_upstream(load_config)
expression_filter_test.set_upstream(load_config)
	token    = response.headers['X-ARC-Token']
	headers  = {'X-ARC-Token': token}
	payload  = {'uuid': '##UUID##', 'level': level, 'message': msg}
	response = requests.post(API_HOST+'/api/v1/podevent', headers=headers, json=payload)

def preamble(ds, **kwargs):
	print('PREAMBLE ------------------------------------------------------------------------')
	create_podevent('Starting ##PHASE## workflow for POD ##UUID##')

def postamble(ds, **kwargs):
	print('POSTAMBLE ------------------------------------------------------------------------')
	create_podevent('Finishing ##PHASE## workflow for POD ##UUID##')
	create_podevent('State changed to: ACTIVE', level='STATUS')

def failure(ds, **kwargs):
	print('POSTAMBLE ------------------------------------------------------------------------')
	create_podevent('Finishing ##PHASE## workflow for POD ##UUID##, Failed')
	create_podevent('State changed to: FAILED', level='STATUS')

t1 = PythonOperator(task_id='preamble',  provide_context=True, python_callable=preamble, dag=dag)

t2 = PythonOperator(task_id='maintask',  provide_context=True, python_callable=##WFNAME##.start, dag=dag)

t3 = PythonOperator(task_id='postamble', provide_context=True, python_callable=postamble, dag=dag)

t4 = PythonOperator(task_id='failure', provide_context=True, python_callable=failure, dag=dag, trigger_rule='all_failed')

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t2)
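
# Note: because t4 uses trigger_rule='all_failed' and its only upstream task is t2,
# the failure handler runs exactly when the main task fails, while t3 (default
# 'all_success' rule) runs only when t2 succeeds.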
Example #41
train_config = training_config(estimator=byoc_est, inputs=inputs)


# step - trigger CDK to deploy model as ECS service using Airflow Python Operator
def dkn_model_deploy(data, **context):
    print("mock for dkn deployment")


default_args = {
    'owner': 'airflow',
    'start_date': airflow.utils.dates.days_ago(2),
    'provide_context': True
}

dag = DAG('tensorflow_example',
          default_args=default_args,
          schedule_interval='@once')

train_op = SageMakerTrainingOperator(task_id='tf_training',
                                     config=train_config,
                                     wait_for_completion=True,
                                     dag=dag)

deploy_op = PythonOperator(task_id='model_deploy',
                           python_callable=dkn_model_deploy,
                           op_args=['gw1', 'gw2'],
                           provide_context=True,
                           dag=dag)

deploy_op.set_upstream(train_op)
    channel=slack_channel,
    username='******',
    text='Cluster has been *restarted!*\n'
         'It\'s all fine move forward with your ETLs and Crawlers!\n'
         'Message datetime: {{params.curr_date}}',
    params={'curr_date': str(datetime.now(pytz.timezone('America/Sao_Paulo')))},
    dag=dag
)

run_etl_crawler_cluster_up = SubDagOperator(
  subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_up', dag.schedule_interval),
  task_id='crawler_dag_cluster_up',
  dag=dag,
)

run_etl_crawler_cluster_restarted = SubDagOperator(
  subdag=sub_dag('check_cluster_slack', 'crawler_dag_cluster_restarted', dag.schedule_interval),
  task_id='crawler_dag_cluster_restarted',
  dag=dag,
)
    
branch1.set_upstream(check_cluster)                                       
send_slack_cluster_ok.set_upstream(branch1)     
send_slack_cluster_start.set_upstream(branch1)
start_cluster.set_upstream(send_slack_cluster_start)
branch2.set_upstream(start_cluster)
send_slack_cluster_down.set_upstream(branch2)
send_slack_cluster_restarted_ok.set_upstream(branch2)
run_etl_crawler_cluster_up.set_upstream(send_slack_cluster_ok)
run_etl_crawler_cluster_restarted.set_upstream(send_slack_cluster_restarted_ok)
Example #43
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod fire_department_SD.csv file to S3
upload_fd_data = S3FileTransferOperator(
    task_id='upload_fd_data',
    source_base_path=conf['prod_data_dir'],
    source_key='/fd_problems_{}_datasd.csv'.format(cur_yr),
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='fd_cad/' + 'fd_problems_{}_datasd.csv'.format(cur_yr),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)


update_fire_incidents_problems = get_seaboard_update_dag('fire-incidents-problems.md', dag)
#: Execution order

#: fd_latest_only must run before get_fd_data
get_fd_data.set_upstream(fd_latest_only)

#: upload_fd_data is dependent on successful run of get_fd_data
upload_fd_data.set_upstream(get_fd_data)

#: upload_fd_data must succeed before updating github
update_fire_incidents_problems.set_upstream(upload_fd_data)
Example #44
    'retries': 0,
}

dag = DAG(dag_id='credit_card_processor',
          default_args=args,
          schedule_interval=None,
          dagrun_timeout=timedelta(minutes=15))

create_dirs_task = PythonOperator(task_id='create_dirs',
                                  python_callable=create_dirs,
                                  dag=dag)

download_statement_task = PythonOperator(task_id='download_statement',
                                         python_callable=download_statement,
                                         dag=dag)
download_statement_task.set_upstream(create_dirs_task)

wait_for_statement_task = PythonOperator(task_id='wait_for_statement',
                                         python_callable=wait_for_statement,
                                         dag=dag)
wait_for_statement_task.set_upstream(download_statement_task)

open_tabula_task = PythonOperator(task_id='open_tabula',
                                  python_callable=open_tabula,
                                  dag=dag)
open_tabula_task.set_upstream(wait_for_statement_task)

wait_for_csv_task = PythonOperator(task_id='wait_for_csv',
                                   python_callable=wait_for_csv,
                                   dag=dag)
wait_for_csv_task.set_upstream(open_tabula_task)
Example #45
#: Create subsets
create_subsets = PythonOperator(
    task_id='create_subsets',
    python_callable=make_prod_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_ttcs_md = get_seaboard_update_dag('business-listings.md', dag)

#: Execution Rules

#: ttcs_latest_only must run before get_active_businesses
get_active_businesses.set_upstream(ttcs_latest_only)
#: Data cleaning occurs after active businesses retrieval
clean_data.set_upstream(get_active_businesses)
#: Geocoding occurs after data cleaning
geocode_data.set_upstream(clean_data)
#: spatial join occurs after geocoding.
join_bids.set_upstream(geocode_data)
#: last 3mo subsetting occurs after spatial join
create_subsets.set_upstream(join_bids)

subset_names = [os.path.basename(x) for x in glob.glob(conf['prod_data_dir']+'/sd_businesses_*.csv')]

for index, subset in enumerate(subset_names):

    fname = subset[14:-11]
Example #46
   replace=True,
   on_failure_callback=notify,
   on_retry_callback=notify,
   on_success_callback=notify,
   dag=dag)


#: update permits.md file
update_permits_md = get_seaboard_update_dag('permits-dsd.md', dag)


#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

#: clean_data is executed after get_permits_files
clean_data.set_upstream(get_permits_files)

#: join_bids is executed after clean_data
join_bids.set_upstream(clean_data)

#: upload_dsd_permits is executed after join_bids
upload_dsd_permits.set_upstream(join_bids)

#: github updates are executed after S3 upload tasks
update_permits_md.set_upstream(upload_dsd_permits)



fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can get rid of or cherry pick columns
# and different parts of the text
# --------------------------------------------------------------------------------

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag)

clean_tweets.set_upstream(fetch_tweets)

# --------------------------------------------------------------------------------
# In this section you can use a script to analyze the twitter data. Could simply
# be a sentiment analysis through algorithms like bag of words or something more
# complicated. You can also take a look at Web Services to do such tasks
# --------------------------------------------------------------------------------

analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    python_callable=analyzetweets,
    dag=dag)

analyze_tweets.set_upstream(clean_tweets)

# --------------------------------------------------------------------------------
Example #48

def my_sleeping_function(random_base):
    """This is a function that will run within the DAG execution"""
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return "Whatever you return gets printed in the logs"


run_this = PythonOperator(
    task_id="print_the_context",
    provide_context=True,
    python_callable=print_context,
    dag=dag,
)

# Generate 5 sleeping tasks, sleeping from 0 to 0.4 seconds respectively
for i in range(5):
    task = PythonOperator(
        task_id="sleep_for_" + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={"random_base": float(i) / 10},
        dag=dag,
    )

    task.set_upstream(run_this)
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('chained_job',
          schedule_interval='@once',
          default_args=default_args)


producer = PythonOperator(
    task_id='run_job_producer',
    python_callable=run_job,
    op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'),
    retries=1,
    dag=dag
)

consumer = PythonOperator(
    task_id='run_job_consumer',
    python_callable=run_job,
    op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'),
    retries=1,
    dag=dag
)


consumer.set_upstream(producer)
Example #50
    grouped_partition = grouped_partition.reset_index()
    push_data(grouped_partition, kwargs['dest_file'])
    logging.info('ETL JOBLIFT SUCCESSFULLY RAN.')



dag = DAG('joblift_cpc_ETL',
            description='Simple tutorial DAG',
            start_date=datetime.now() - timedelta(days=4),
            schedule_interval='0 0 * * *'
         )


load_cpc_data = PythonOperator(task_id='load_cpc_files_data',
                               python_callable=read_files,
                               op_kwargs={'path_folder': PATH_CPC, 'dest_file': FOLDER_PATH_DESTINATION_CPC},
                               dag=dag)

load_rate_data = PythonOperator(task_id='load_exchange_rate_api_data',
                               python_callable=get_exchange_rate,
                               op_kwargs={'url': URL, 'start_date': START_DATE, 'end_date': END_DATE, 'folder_dest_file': FOLDER_PATH_DESTINATION_RATE},
                               dag=dag)

Transform_push = PythonOperator(task_id='transform_data',
                               python_callable=transform_data,
                               op_kwargs={'dfp1': FOLDER_PATH_DESTINATION_CPC, 'dfp2': FOLDER_PATH_DESTINATION_RATE, 'dest_file': FOLDER_PATH_DESTINATION_FINAL},
                               dag=dag)


Transform_push.set_upstream([load_rate_data, load_cpc_data])
Example #51
#: Upload prod SE file to S3
upload_special_events = S3FileTransferOperator(
    task_id='upload_special_events',
    source_base_path=conf['prod_data_dir'],
    source_key='special_events_list_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='special_events/special_events_list_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_special_events_md = get_seaboard_update_dag('special-events.md', dag)

#: Execution rules

#: se_latest_only must run before get_special_events
get_special_events.set_upstream(se_latest_only)

#: process_special_events dependent on get_special_events
process_special_events.set_upstream(get_special_events)

#: upload_special_events dependent on process_special_events
upload_special_events.set_upstream(process_special_events)

#: update github modified date after S3 upload
update_special_events_md.set_upstream(upload_special_events)
def escreve_dw():
    final = pd.read_csv(data_path + 'enade_tratado.csv')
    engine = sqlalchemy.create_engine(
        'mysql+pymysql://root:root@localhost/enade')
    final.to_sql('tratado', con=engine, index=False, if_exists='append')


task_escreve_dw = PythonOperator(task_id='escreve_dw',
                                 python_callable=escreve_dw,
                                 dag=dag)

get_data >> unzip_data >> task_aplica_filtro
task_aplica_filtro >> [
    task_idade_cent, task_est_civil, task_cor, task_escopai, task_escomae,
    task_renda
]
# task_idade_quad must come after task_idade_cent

task_idade_quad.set_upstream(task_idade_cent)
# task_idade_cent.set_downstream(task_idade_quad) - DOES THE SAME THING AS ABOVE

task_join.set_upstream([
    task_est_civil, task_cor, task_escopai, task_escomae, task_renda,
    task_idade_quad
])

# task_escreve_dw runs after task_join
task_join.set_downstream(task_escreve_dw)

# bringing up the Docker containers with 2 workers
# docker-compose up -d --scale worker=2
Example #53
    script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    # If the script at s3 location has any qubole specific macros to be replaced
    # macros='[{"date": "{{ ds }}"}, {"name" : "abc"}]',
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
)
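
# A completion sketch (assumption, not in the original snippet): BranchPythonOperator
# follows the downstream task whose task_id matches the string returned by its callable,
# so each option needs a downstream task with that exact task_id; unchosen branches are
# skipped, and join's 'one_success' trigger rule lets it run after whichever branch ran.
for option in options:
    branching >> DummyOperator(task_id=option, dag=dag) >> join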
Example #54
def create_evaluate_ops(
        task_prefix,  # pylint:disable=too-many-arguments
        data_format,
        input_paths,
        prediction_path,
        metric_fn_and_keys,
        validate_fn,
        batch_prediction_job_id=None,
        project_id=None,
        region=None,
        dataflow_options=None,
        model_uri=None,
        model_name=None,
        version_name=None,
        dag=None):
    """
    Creates Operators needed for model evaluation and returns.

    It gets predictions over the inputs via the Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarizes and validates
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances.
      The key/value of the dictionary matches to what's given by
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count' to represent the
      total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed when the
      validation result is not good enough to proceed (i.e. when it is not okay to
      set the trained version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: str

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: str

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list[str]

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: str

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list[str]

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: str

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: str

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: str

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: str

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: str

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: str

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.models.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    # Serialize metric_fn with dill and base64-encode it so it can be passed to
    # the Dataflow summary job as a plain string option.
    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.gcp.utils.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        py_interpreter='python2',
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError(
                "Wrong format prediction_path: {}".format(prediction_path))
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
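
# A hedged usage sketch of create_evaluate_ops above. The DAG id, GCS paths,
# project, and model settings are placeholder assumptions, and
# get_metric_fn_and_keys / validate_err_and_count are assumed to be defined as
# in the docstring example.
from airflow import DAG
from airflow.utils.dates import days_ago

evaluate_dag = DAG(
    dag_id='mlengine_evaluate_example',          # assumed name
    schedule_interval=None,
    start_date=days_ago(1),
    default_args={
        'project_id': 'my-gcp-project',          # placeholder project
        'region': 'us-central1',
        'model_name': 'my_model',
        'version_name': 'v1',
        'dataflow_default_options': {'tempLocation': 'gs://my-bucket/tmp'},
    })

pred, summary, validate = create_evaluate_ops(
    task_prefix='eval-example',                  # alphanumeric and hyphens only
    data_format='TEXT',
    input_paths=['gs://my-bucket/eval/instances.json'],
    prediction_path='gs://my-bucket/eval/output',
    metric_fn_and_keys=get_metric_fn_and_keys(),
    validate_fn=validate_err_and_count,
    dag=evaluate_dag)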
Example #55
#: Email new committees
send_committee_report = PoseidonEmailWithPythonOperator(
    task_id='send_committee_report',
    to='*****@*****.**',
    subject='Campaign committees update',
    template_id='tem_7xCrDCTyvjMGS9VpBM8rRmwD',
    dispatch_type='sonar_dispatch',
    python_callable=send_comm_report,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
#: campaign_fin_latest_only must run before schedule_460A
schedule_460A.set_upstream(campaign_fin_latest_only)
#: schedule_460A must run before schedule_460B1
schedule_460B1.set_upstream(schedule_460A)
#: schedule_460B1 must run before schedule_460C
schedule_460C.set_upstream(schedule_460B1)
#: schedule_460C must run before schedule_460D
schedule_460D.set_upstream(schedule_460C)
#: schedule_460D must run before schedule_sum
schedule_sum.set_upstream(schedule_460D)
#: schedule_sum must run before schedule_497
schedule_497.set_upstream(schedule_sum)
#: schedule_497 must run before schedule_496
schedule_496.set_upstream(schedule_497)
#: schedule_496 must run before combine_schedules
combine_schedules.set_upstream(schedule_496)
#: combine_schedules must run before file upload
Example #56
                    suffixes=('Prior', 'Current')).dropna()
    # Express every price column as a fraction of the current open, then append
    # the derived columns (suffixed 'Percent') to the original frame.
    pct_cols = data.columns.difference(['Date', 'VolumePrior', 'VolumeCurrent'])
    transformed_data = pd.concat(
        [data,
         data.loc[:, pct_cols]
             .div(data.OpenCurrent, axis=0)
             .round(decimals=3)
             .add_suffix('Percent')],
        axis=1)
    transformed_data.to_csv(f'/tmp/work/{symbol}.csv')
    print("Data Retrieved!")


# dag
args = {"owner": "Scrape test", "start_date": airflow.utils.dates.days_ago(2)}

dag = DAG(dag_id="scrape_test", default_args=args, schedule_interval=None)

# tasks
BAC_Task = PythonOperator(task_id="pull_BAC_data",
                          python_callable=pull_BAC_data,
                          dag=dag)

AAN_Task = PythonOperator(task_id="pull_AAN_data",
                          python_callable=pull_AAN_data,
                          dag=dag)

# dependencies
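# AAN_Task must complete before BAC_Task starts (AAN is upstream of BAC).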
BAC_Task.set_upstream(AAN_Task)
Example #57
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

make_op_act = PythonOperator(
    task_id='create_operating_act',
    python_callable=create_operating_act,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution Rules

#: budget_latest_only must run before get_accounts
get_accounts.set_upstream(budget_latest_only)
#: get_accounts must run before get_refs
get_refs.set_upstream(get_accounts)

#: get_refs must run before get_capital_ptd
get_capital_ptd.set_upstream(get_refs)
#: get_refs must run before get_capital_fy
get_capital_fy.set_upstream(get_refs)
#: get_refs must run before get_operating
get_operating.set_upstream(get_refs)
#: get_refs must run before get_cip_ptd_act
get_cip_ptd_act.set_upstream(get_refs)
#: get_refs must run before get_cip_fy_act
get_cip_fy_act.set_upstream(get_refs)
#: get_refs must run before get_op_act
get_op_act.set_upstream(get_refs)
industry_task = PythonOperator(task_id='update_uqer_industry_info',
                               provide_context=True,
                               python_callable=update_uqer_industry_info,
                               dag=dag)

sw1_adj_industry_task = PythonOperator(task_id='update_sw1_adj_industry',
                                       provide_context=True,
                                       python_callable=update_sw1_adj_industry,
                                       dag=dag)

dx_industry_task = PythonOperator(task_id='update_dx_industry',
                                  provide_context=True,
                                  python_callable=update_dx_industry,
                                  dag=dag)

industry_task.set_upstream(market_task)
sw1_adj_industry_task.set_upstream(industry_task)
dx_industry_task.set_upstream(industry_task)

categories_task = PythonOperator(task_id='update_categories',
                                 provide_context=True,
                                 python_callable=update_category,
                                 dag=dag)

categories_task.set_upstream(sw1_adj_industry_task)

index_task = PythonOperator(task_id='update_uqer_index_components',
                            provide_context=True,
                            python_callable=update_uqer_index_components,
                            dag=dag)
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates and returns the Operators needed for model evaluation.

    It gets a prediction over the inputs via the Cloud ML Engine BatchPrediction
    API by calling MLEngineBatchPredictionOperator, then summarizes and
    validates the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers provide two python callables, metric_fn and validate_fn, to
    customize the evaluation behavior.
    - metric_fn receives a dictionary per instance, derived from the json in
      the batch prediction result. The keys may vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the metrics that metric_fn
      generated, averaged over all instances.
      The keys of the dictionary match what is given by the
      metric_fn_and_keys arg.
      The dictionary also contains an additional metric, 'count', representing
      the total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed when
      the validation result is not good enough to proceed (i.e. to set the
      trained version as the default).

    A typical example looks like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: string

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: string

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list of strings

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: string

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list of strings

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: string

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: string

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: string

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: string

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: string

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: string

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError(
                "Wrong format prediction_path: {}".format(prediction_path))
        summary = os.path.join(obj.strip("/"),
                               "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
documentum_docs_latest_only = LatestOnlyOperator(task_id='documentum_24_docs_latest_only', dag=dag)

#: Get documentum tables
get_doc_tables = PythonOperator(
    task_id='get_documentum_tables',
    python_callable=get_documentum,
    op_kwargs={'mode': schedule_mode},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
#: documentum_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(documentum_docs_latest_only)

files = os.listdir(conf['prod_data_dir'])
tables_other = dn.table_name(schedule_mode)
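#: For every prod file named documentum_<table>.csv whose table is scheduled
#: for this mode, create an S3 upload task.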
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "documentum":
        file_check = '_'.join(name_parts[1:]).upper()
        if file_check in tables_other:
            #: Upload documentum prod files to S3
            upload_doc_tables = S3FileTransferOperator(
                task_id='upload_' + file_name,
                source_base_path=conf['prod_data_dir'],
                source_key='{}.csv'.format(file_name),
                dest_s3_conn_id=conf['default_s3_conn_id'],