    def test_python_callable_keyword_arguments_are_templatized(self):
        """Test PythonOperator op_kwargs are templatized"""
        recorded_calls = []

        task = PythonOperator(
            task_id='python_operator',
            # a Mock instance cannot be used as the callable, or the test fails with a
            # TypeError: Object of type Mock is not JSON serializable
            python_callable=(build_recording_function(recorded_calls)),
            op_kwargs={
                'an_int': 4,
                'a_date': date(2019, 1, 1),
                'a_templated_string': "dag {{dag.dag_id}} ran on {{ds}}."
            },
            dag=self.dag)

        self.dag.create_dagrun(
            run_id='manual__' + DEFAULT_DATE.isoformat(),
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            state=State.RUNNING
        )
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)

        self.assertEqual(1, len(recorded_calls))
        self._assertCallsEqual(
            recorded_calls[0],
            Call(an_int=4,
                 a_date=date(2019, 1, 1),
                 a_templated_string="dag {} ran on {}.".format(
                     self.dag.dag_id, DEFAULT_DATE.date().isoformat()))
        )
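
# The build_recording_function helper is not shown in this excerpt. A minimal sketch
# (an assumption, not the original code): it would live at module level and return a
# plain function that records each invocation as a Call, which keeps the rendered
# op_kwargs JSON-serializable, unlike a Mock instance.
def build_recording_function(calls_collection):
    def recording_function(*args, **kwargs):
        calls_collection.append(Call(*args, **kwargs))
    return recording_function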
Example #2
    def test_python_operator_run(self):
        """Tests that the python callable is invoked on task run."""
        task = PythonOperator(
            python_callable=self.do_run,
            task_id='python_operator',
            dag=self.dag)
        self.assertFalse(self.is_run())
        task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
        self.assertTrue(self.is_run())
    def test_echo_env_variables(self):
        """
        Test that env variables are exported correctly to the
        python callback in the task.
        """
        self.dag.create_dagrun(
            run_id='manual__' + DEFAULT_DATE.isoformat(),
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            state=State.RUNNING,
            external_trigger=False,
        )

        t = PythonOperator(task_id='hive_in_python_op',
                           dag=self.dag,
                           python_callable=self._env_var_check_callback
                           )
        t.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
Example #4
# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['sire']
start_date = general.start_date['sire']

#: Dag spec
dag = DAG(dag_id='sire_docs', default_args=args, start_date=start_date, schedule_interval=schedule)

sire_docs_latest_only = LatestOnlyOperator(task_id='sire_docs_latest_only', dag=dag)

#: Get sire tables
get_doc_tables = PythonOperator(
    task_id='get_sire_tables',
    python_callable=get_sire,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Execution rules
#: sire_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(sire_docs_latest_only)

files = [f for f in os.listdir(conf['prod_data_dir'])]
for f in files:
    if f.split('_')[0] == "sire":
        #: Upload sire prod files to S3
        upload_doc_tables = S3FileTransferOperator(
            task_id='upload_{}'.format(f),
            source_base_path=conf['prod_data_dir'],
def fun_task_timeout_monitor(ds, dag, **op_kwargs):
    dag_ids = dag.dag_id

    msg = [{
        "dag": dag,
        "db": "opay_dw",
        "table": "{dag_name}".format(dag_name=dag_ids),
        "partition": "country_code=NG/dt={pt}".format(pt=ds),
        "timeout": "3000"
    }]

    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(task_id='task_timeout_monitor',
                                      python_callable=fun_task_timeout_monitor,
                                      provide_context=True,
                                      dag=dag)

##----------------------------------------- Variables ---------------------------------------##
db_name = "opay_dw"
table_name = "app_opay_transaction_consume_scenario_sum_m"
hdfs_path = "oss://opay-datalake/opay/opay_dw/" + table_name


##---- hive operator ---##
def app_opay_transaction_consume_scenario_sum_m_sql_task(ds):
    HQL = '''
    
    set mapred.max.split.size=1000000;
    set hive.exec.dynamic.partition.mode=nonstrict;
    set hive.exec.parallel=true; --default false
Example #6

#: Get CFS data from FTP and save to temp folder
get_cfs_data = BashOperator(
    task_id='get_cfs_data',
    bash_command=get_cfs_data(),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process CFS data and save result to prod folder
process_cfs_data = PythonOperator(
    task_id='process_cfs_data',
    python_callable=process_cfs_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod file to S3
cfs_to_S3 = S3FileTransferOperator(
    task_id='cfs_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_calls_for_service_'+curr_year+'_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_calls_for_service_'+curr_year+'_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
from datetime import datetime
from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag_id = "just_say_hello"

with DAG(dag_id=dag_id,
         start_date=datetime(2018, 11, 14),
         schedule_interval=None) as dag:

    def say_hello():
        print("Hello Airflow!")

    PythonOperator(task_id="say_hello", python_callable=say_hello)
Example #8
    'start_date': today - timedelta(days=2),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG('chained_job',
          schedule_interval='@once',
          default_args=default_args)


producer = PythonOperator(
    task_id='run_job_producer',
    python_callable=run_job,
    op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'),
    retries=1,
    dag=dag
)

consumer = PythonOperator(
    task_id='run_job_consumer',
    python_callable=run_job,
    op_args=('75588', 300, '8edd9e11f4de44b39f666777ac79bfe1'),
    retries=1,
    dag=dag
)


consumer.set_upstream(producer)
Example #9
dag = DAG(dag_id='dsd_code_enforcement',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['dsd_code_enforcement'])

#: Latest Only Operator for dsd code enforcement
dsd_ce_latest_only = LatestOnlyOperator(
    task_id='dsd_code_enf_latest_only', dag=dag)


#: Download code enforcement files and unzip them.
get_code_enf_files = PythonOperator(
    task_id='get_code_enf_files',
    python_callable=dfg.get_files,
    op_kwargs={'fname_list': fname_list,
               'target_dir': dsd_temp_dir},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_code_enf_md = get_seaboard_update_dag('code-enforcement-violations.md', dag)

#: Execution rules
#: dsd_code_enf_latest_only must run before get_code_enf_files
get_code_enf_files.set_upstream(dsd_ce_latest_only)


for i in fname_list:
    #: Create fme shell command
Example #10
    return None  # load new data to mongodb


load_new_data_task = PythonOperator(
    task_id='load_new_data',
    python_callable=load_new_data,
    dag=dag)


def extract_type(ds, **kwargs):
    year, month, day = ds.split('-')  # 2016-04-22
    c_ds = "%s/%s/%s" % (day, month, year)  # 15/12/2014
    count = 0
    tp = kwargs['tp']
    keyword = kwargs['keyword']
    for andamento in Andamentos.objects(data=c_ds):
        texto_lw = andamento.texto.lower()
        if keyword in texto_lw:
            andamento.tipo = tp
            andamento.save()
            count += 1
    return count


for tp in PROGRESS_TYPES:
    extract_tipo_task = PythonOperator(
        task_id='extract_%s_task' % (tp,),
        python_callable=extract_type, op_kwargs={'tp': tp, 'keyword': PROGRESS_TYPES[tp]},
        dag=dag, provide_context=True)
    extract_tipo_task.set_upstream(load_new_data_task)
Example #11
##----------------------------------------- Task timeout monitoring ---------------------------------------##

def fun_task_timeout_monitor(ds, dag, **op_kwargs):
    dag_ids = dag.dag_id

    msg = [
        {"dag": dag, "db": "oride_dw", "table": "{dag_name}".format(dag_name=dag_ids),
         "partition": "country_code=NG/dt={pt}".format(pt=ds), "timeout": "800"}
    ]

    TaskTimeoutMonitor().set_task_monitor(msg)


task_timeout_monitor = PythonOperator(
    task_id='task_timeout_monitor',
    python_callable=fun_task_timeout_monitor,
    provide_context=True,
    dag=dag
)


##----------------------------------------- Script ---------------------------------------##

def dwd_oride_assets_sku_df_sql_task(ds):
    HQL = '''


    set hive.exec.parallel=true;
    set hive.exec.dynamic.partition.mode=nonstrict;

    INSERT overwrite TABLE oride_dw.{table} partition(country_code,dt)
Example #12
                                     total_data['申万一级行业'].values, er):
        detail_info[str(code)] = {
            'weight': w,
            'industry': ind,
            'zz500': bm_w,
            'er': r
        }

    portfolio_dict = {'Date': prev_date, 'portfolio': detail_info}

    portfolio_collection.delete_many({'Date': prev_date})
    portfolio_collection.insert_one(portfolio_dict)

    portfolio.to_csv(
        '~/mnt/sharespace/personal/licheng/portfolio/zz500_mutual_fund/{0}.csv'
        .format(prev_date.strftime('%Y-%m-%d')),
        encoding='gbk')

    return 0


run_this1 = PythonOperator(task_id='update_daily_portfolio_mutual_fund',
                           provide_context=True,
                           python_callable=update_daily_portfolio_mutual_fund,
                           dag=dag)

if __name__ == '__main__':
    update_daily_portfolio_mutual_fund(None,
                                       next_execution_date=dt.datetime(
                                           2017, 6, 14))
Example #13
    start_date=start_date,
    schedule_interval=general.schedule['indicator_bacteria_tests'])


#: Latest Only Operator for traffic_counts
wtr_latest_only = LatestOnlyOperator(task_id='water_latest_only', dag=dag)


# TODO - teach me how to be yearly
# Pull out all indicator bac tests.
get_indicator_bac_tests = PythonOperator(
    task_id='get_indicator_bac_tests',
    python_callable=get_indicator_bacteria_tests,
    op_kwargs={
        'date_start': '01-JUN-2014',
        'date_end': (datetime.now() + timedelta(days=5)).strftime('%d-%b-%Y')
    },
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

# Get last bacteria tests for any given point.
get_latest_bac_tests = PythonOperator(
    task_id='get_latest_bac_tests',
    python_callable=get_latest_bac_tests,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)
          # schedule_interval defines the frequency you want to run a dag.
          # In this case every day at 5pm (cron notation)
          schedule_interval='0 17 * * *'
          )
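
# A quick reference (a sketch, not part of the original file): equivalent ways Airflow
# accepts the same daily schedule. Cron fields are
# minute hour day-of-month month day-of-week, evaluated in UTC.
from datetime import timedelta

daily_5pm_cron = '0 17 * * *'    # every day at 17:00
daily_preset = '@daily'          # preset shorthand for '0 0 * * *' (midnight)
daily_delta = timedelta(days=1)  # timedelta objects are also accepted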


# function called by the PythonOperator when `python_task` will be executed
def print_hello():
    print('Hello world :)')


# binds the tasks to the dag using `with dag` context manager
with dag:
    # task `dummy_task_id` is an instance of DummyOperator.
    # An Operator creates objects that become nodes in the dag.
    dummy_task = DummyOperator(task_id='dummy_task_id',
                               retries=5
                               )

    # task `hello_task_id` is an instance of PythonOperator.
    # PythonOperator executes a Python callable, in this case the
    # `print_hello` function
    python_task = PythonOperator(task_id='hello_task_id',
                                 python_callable=print_hello
                                 )

    # Define tasks dependencies (using the bitshift operator `>>`)
    # tasks execution order: the first to be executed is the dummy_task.
    # If dummy_task succeeded, then `python_task` will be triggered
    dummy_task >> python_task
Example #15

def pull_function(**kwargs):
    ti = kwargs['ti']
    pulled_message = ti.xcom_pull(key='message', task_ids='new_push_task')
    print("Pulled Message: '%s'" % pulled_message)


def new_push_function(**kwargs):
    message = 'This is the NEW pushed message.'
    ti = kwargs['ti']
    ti.xcom_push(key="message", value=message)


t1 = PythonOperator(task_id='push_task',
                    python_callable=push_function,
                    provide_context=True,
                    dag=DAG)

t2 = PythonOperator(task_id='pull_task',
                    python_callable=pull_function,
                    provide_context=True,
                    dag=DAG)

t3 = PythonOperator(task_id='new_push_task',
                    python_callable=new_push_function,
                    provide_context=True,
                    dag=DAG)

t1 >> t3 >> t2
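
# The original push_function is defined outside this excerpt; a minimal sketch,
# assuming it mirrors new_push_function and pushes its own message under the same key
# (the message text here is an assumption):
def push_function(**kwargs):
    ti = kwargs['ti']
    ti.xcom_push(key='message', value='This is the pushed message.')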
Example #16
import datetime as dt

import airflow
from airflow import DAG

from airflow.operators.python_operator import PythonOperator

default_args = {
    'owner': 'me',
    'start_date': dt.datetime(2020, 5, 1),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

def print_world():
    print('world')


with DAG('airflow_tutorial_v01',
         default_args=default_args,
         schedule_interval=None,
         ) as dag:

    print_world = PythonOperator(task_id='print_world',
                                 python_callable=print_world)


print_world
Example #17
    if anything_new:
        return 'yes_generate_notification'
    else:
        return 'no_do_nothing'


def generate_message(**context):
    _, all_comic_info = context['task_instance'].xcom_pull(task_ids='check_comic_info')
    print("產生要寄給 Slack 的訊息內容並存成檔案")


with DAG('comic_app_v2', default_args=default_args, schedule_interval= '@daily') as dag:

    get_read_history = PythonOperator(
        task_id='get_read_history',
        python_callable=process_metadata,
        op_args=['read']
    )

    check_comic_info = PythonOperator(
        task_id='check_comic_info',
        python_callable=check_comic_info,
        provide_context=True
    )

    decide_what_to_do = BranchPythonOperator(
        task_id='new_comic_available',
        python_callable=decide_what_to_do,
        provide_context=True
    )
Example #18
        if errorMessage:
            logging.error('DAP Processor Failure. See Exception')
            raise Exception(errorMessage)

    finally:
        print('Exit Status: {}'.format(exitStatus))
        print('StdOut: {}'.format(stdOutput))
        print('StdErr: {}'.format(errorMessage))

        if sshConn:
            sshConn.close()


# run_DAP_Processor = SSHOperator(
#     task_id = 'Run_DAP_Processor',
#     provide_context = True,
#     ssh_conn_id = 'DAP_App_Server',
#     # command = 'E:\\Airflow_Test\\DAP\DAPConsoleProcessor.exe -config "E:\\Airflow_Test\\DAP\\Configuration\\DAPTCPBrands.xml" -jobname "DAPTCPBrands.xml"',
#     command = 'whoami',
#     timeout = 3600,
#     do_xcom_push = True,
#     get_pty = True,
#     dag = dag
# )

runDAPTask = PythonOperator(task_id='Run_DAP_Processor',
                            provide_context=True,
                            python_callable=runDAP,
                            dag=dag)
Example #19
schedule = general.schedule
start_date = general.start_date['pd_col']

dag = DAG(
    dag_id='pd_col', default_args=args, start_date=start_date, schedule_interval=schedule['pd_col'])


#: Latest Only Operator for pd_col
pd_col_latest_only = LatestOnlyOperator(
    task_id='pd_col_latest_only', dag=dag)

#: Get collisions data from FTP and save to temp folder
get_collisions_data = PythonOperator(
    task_id='get_collisions_data',
    python_callable=get_collisions_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process collisions data and save result to prod folder
process_collisions_data = PythonOperator(
    task_id='process_collisions_data',
    python_callable=process_collisions_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod file to S3
collisions_to_S3 = S3FileTransferOperator(
dag = DAG('lesson2.exercise3',
          start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
          end_date=datetime.datetime(2019, 1, 1, 0, 0, 0, 0),
          schedule_interval='@monthly',
          max_active_runs=1)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    # TODO: ensure that we provide context to our Python Operator
    provide_context=True,
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = PythonOperator(
    task_id='load_stations_from_s3_to_redshift',
    dag=dag,
    python_callable=load_station_data_to_redshift,
    'value': 'airflow'
}]


def print_stuff():
    print("stuff!")


def use_zip_binary():
    rc = os.system("zip")
    assert rc == 0


# You don't have to use any special KubernetesExecutor configuration if you don't want to
start_task = PythonOperator(
    task_id="start_task", python_callable=print_stuff, dag=dag
)

# But you can if you want to
one_task = PythonOperator(
    task_id="one_task", python_callable=print_stuff, dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci:latest"}}
)

# Use the zip binary, which is only found in this special docker image
two_task = PythonOperator(
    task_id="two_task", python_callable=use_zip_binary, dag=dag,
    executor_config={"KubernetesExecutor": {"image": "airflow/ci_zip:latest"}}
)

# Limit resources on this operator/task with node affinity & tolerations
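# The example is truncated here; a hedged sketch of what such a task might look like,
# assuming `affinity` and `tolerations` dicts are defined earlier in the file (the
# fragment above ending in 'value': 'airflow' looks like the tail of `tolerations`):
three_task = PythonOperator(
    task_id="three_task", python_callable=print_stuff, dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "request_memory": "128Mi",
            "limit_memory": "128Mi",
            "tolerations": tolerations,
            "affinity": affinity,
        }
    }
)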
Example #22
    hr = moment.hour
    bucket_key_template = f'{source}/{year}/{month}/{day}/ypsource.json'

    get_new_json = S3KeySensor(task_id="get_new_json",
                               poke_interval=60 * 2,
                               timeout=60 * 60 * 3,
                               bucket_key=bucket_key_template,
                               bucket_name=b_name,
                               wildcard_match=False,
                               aws_conn_id="s3_task",
                               dag=dag)

    # get_from_S3 = PythonOperator(
    #     task_id='get_from_S3',
    #     python_callable=get_file_from_s3,
    #     dag=dag
    #     )

    upload_to_S3_task = PythonOperator(
        task_id='upload_file_to_S3',
        python_callable=upload_file_to_S3_with_hook,
        params={
            'filename': '/home/akorede/Documents/mycsv.csv',
            'key': 'mycsv.csv',
            'bucket_name': 'ypsource-bucket',
        },
        provide_context=True,
        dag=dag)

    # Use arrows to set dependencies between tasks
    upload_to_S3_task.set_upstream(get_new_json)
# output files generated by this task and naming convention
# is direction(from or to)_twitterHandle_date.csv
# --------------------------------------------------------------------------------
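# e.g. to_someHandle_2020-01-01.csv or from_someHandle_2020-01-01.csv
# (hypothetical handle and date, shown only to illustrate the convention)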

fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can get rid of or cherry pick columns
# and different parts of the text
# --------------------------------------------------------------------------------

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag)

clean_tweets.set_upstream(fetch_tweets)

# --------------------------------------------------------------------------------
# In this section you can use a script to analyze the twitter data. Could simply
# be a sentiment analysis through algorithms like bag of words or something more
# complicated. You can also take a look at Web Services to do such tasks
# --------------------------------------------------------------------------------

analyze_tweets = PythonOperator(
    task_id='analyze_tweets',
    python_callable=analyzetweets,
    dag=dag)
Example #24

spark_submit_task = SparkSubmitOperator(
    task_id='spark_submit_job',
    conn_id='spark_default',
    java_class='com.scaledata.softbug.datasources.apache.AccessParser',
    application=EXECUTABLE_PATH,
    # application_args=[' '.join(['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()])],
    # application_args=['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()],
    # application_args=['{0}={1}'.format(k, v) for (k, v) in PARAMS.iteritems()],
    application_args=["{{ti.xcom_pull(task_ids='push_xcom')}}"],
    total_executor_cores='1',
    executor_cores='1',
    executor_memory='2g',
    num_executors='2',
    name='spark-airflow-phoenix',
    verbose=True,
    driver_memory='1g',
    xcom_push='true',
    conf=config,
    dag=dag,
)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)
push_xcom_task = PythonOperator(task_id='push_xcom', python_callable=push_xcom,
                                dag=dag)
pull_xcom_task = PythonOperator(task_id='pull_xcom', python_callable=pull_xcom,
                                templates_dict={'_application_args': PARAMS},
                                dag=dag)

dummy_operator >> push_xcom_task >> pull_xcom_task >> spark_submit_task
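
# push_xcom and pull_xcom are defined outside this excerpt; a hedged sketch, assuming
# push_xcom returns the joined PARAMS string so that the templated application_args
# above ("{{ti.xcom_pull(task_ids='push_xcom')}}") resolves to it:
def push_xcom():
    # the return value is stored as the task's XCom under the default 'return_value' key
    return ' '.join('{0}={1}'.format(k, v) for k, v in PARAMS.items())


def pull_xcom():
    # hypothetical placeholder downstream of push_xcom
    print('pull_xcom ran; spark_submit_job pulls the XCom via its template')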
    print("annotated!")


def test_volume_mount():
    with open('/foo/volume_mount_test.txt', 'w') as foo:
        foo.write('Hello')

    rc = os.system("cat /foo/volume_mount_test.txt")
    assert rc == 0


# You can use annotations on your kubernetes pods!
start_task = PythonOperator(
    task_id="start_task", python_callable=print_stuff, dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "annotations": {"test": "annotation"}
        }
    }
)

# You can mount volume or secret to the worker pod
second_task = PythonOperator(
    task_id="four_task", python_callable=test_volume_mount, dag=dag,
    executor_config={
        "KubernetesExecutor": {
            "volumes": [
                {
                    "name": "test-volume",
                    "hostPath": {"path": "/tmp/"},
                },
            ],
dag.doc_md = __doc__


def response_check(response):
    """
    Dumps the http response and returns True when the http call status is 200/success
    """
    print(response)
    print(response.text)
    return response.status_code == 200


t2 = SimpleHttpOperator(task_id='heroku_coin',
                        http_conn_id='heroku_conn',
                        method='GET',
                        endpoint='',
                        headers={"Content-Type": "application/json"},
                        xcom_push=True,
                        response_check=response_check,
                        dag=dag)


def print_hello():
    return 'Hello world!'


hello_operator = PythonOperator(task_id='hello_task',
                                python_callable=print_hello,
                                dag=dag)

t2.set_upstream(hello_operator)

def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)


def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

for i in range(10):
    '''
    Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds
    respectively
    '''
    task = PythonOperator(
        task_id='sleep_for_'+str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i)/10},
        dag=dag)

    task.set_upstream(run_this)
Example #28
    session = settings.Session()
    for x in OBJECTS_TO_EXPORT:
        result = session.execute(text(x[0]))
        stream_to_S3_fn(result, x[1])

    session.close()

    return "OK"


with DAG(dag_id=dag_id,
         schedule_interval=None,
         catchup=False,
         start_date=days_ago(1)) as dag:

    back_up_activedags_t = PythonOperator(task_id="back_up_activedags",
                                          python_callable=back_up_activedags)
    pause_dags_t = PythonOperator(task_id="pause_dags",
                                  python_callable=pause_dags)
    export_active_dags_t = PythonOperator(task_id="export_active_dags",
                                          python_callable=export_active_dags)
    export_variable_t = PythonOperator(task_id="export_variable",
                                       python_callable=export_variable)

    export_data_t = PythonOperator(task_id="export_data",
                                   python_callable=export_data,
                                   provide_context=True)
    # backup all active dag; pause the dags; export all the tables in the OBJECTS_TO_EXPORT;
    # export the active dags so they can be turned on in the new environment
    # Export variables.
    back_up_activedags_t >> pause_dags_t >> export_data_t
    pause_dags_t >> export_active_dags_t
    dag_id='traffic_counts',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)


#: Latest Only Operator for traffic_counts
tc_latest_only = LatestOnlyOperator(
    task_id='traffic_counts_latest_only', dag=dag)


#: Downloads traffic counts xlsx from share
get_traffic_counts = PythonOperator(
    task_id='get_traffic_counts',
    python_callable=get_traffic_counts,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Cleans the downloaded XLSX file, converts it to CSV data.
clean_traffic_counts = PythonOperator(
    task_id='clean_traffic_counts',
    python_callable=clean_traffic_counts,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag
)

#: Builds the prod file
    server.login(username, password)
    server.sendmail(emailfrom, emailto, msg.as_string())
    server.quit()


with DAG('etl_us_marketing_daily_email',
         default_args=default_args,
         schedule_interval='@daily',
         concurrency=10,
         max_active_runs=1) as dag:

    hold_on = TimeSensor(task_id='hold_on',
                         target_time=time(hour=12, minute=30),
                         dag=dag)

    send_email = PythonOperator(
        task_id='generate_and_send_email',
        python_callable=generate_HTML_and_send_email,
        op_args=[],
        # The following is required to pass macros to the PythonOperator
        # See https://stackoverflow.com/a/45870153
        provide_context=True,
        retries=0,
        dag=dag)

    remove_tmp_file = BashOperator(task_id='remove_fb_tmp_file',
                                   bash_command='rm -f {}'.format(name),
                                   dag=dag)

hold_on >> send_email >> remove_tmp_file
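
# Hedged sketch (hypothetical callable, not the original generate_HTML_and_send_email):
# with provide_context=True, Airflow 1.x passes template-context entries such as ds and
# execution_date to the callable as keyword arguments.
def context_aware_callable(ds=None, execution_date=None, **context):
    print('running for', ds, 'at', execution_date)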
Example #31
    df = df.astype('float64')
    df.to_csv('dags/c2k_final.csv')


default_args = {
    'owner': 'Israel Z',
    'start_date': dt.datetime(2018, 5, 9),
    'retries': 1,
    'retry_delay': dt.timedelta(minutes=5),
}

with DAG('flow_pandas',
         default_args=default_args,
         schedule_interval='*/10 * * * *',
         ) as dag:

    download = PythonOperator(task_id='download',
                              python_callable=download)
    dropn = PythonOperator(task_id='dropn',
                           python_callable=dropn)
    fill = PythonOperator(task_id='fill',
                          python_callable=fill)
    cast = PythonOperator(task_id='cast',
                          python_callable=cast)

# Dependencies

dropn.set_upstream(download)
fill.set_upstream(dropn)
cast.set_upstream(fill)
Example #32
# Use cron to define exact time. Eg. 8:15am would be "15 08 * * *"
schedule_interval = "@daily"
scriptpath = './singer_data/'
# scriptpath = '/media/navneetsajwan/DAC4BFAEC4BF8AF15/Learn-Apache-Airflow-in-easy-way--main/project/singer_data/'
# Define DAG: Set ID and assign default args and schedule interval
dag = DAG(
    'dag_3', 
    default_args=default_args, 
    schedule_interval=schedule_interval
    )

#===============================================================================
# extract data from mysql and store into csv file
t1 = PythonOperator(
    task_id='tap_mysql_target_csv',
    python_callable=tap_mysql_target_csv,
    dag=dag,
)

#================================================================================
# create today dir if not exists
t2 = PythonOperator(
    task_id='today_dir',
    python_callable=today_dir,
    dag=dag,
)

#================================================================================
# move extracted csv files into the today dir folder
t3 = PythonOperator(
    task_id='move_files',
DEFAULT_DATE = datetime(2016, 1, 1)
default_args = dict(
    start_date=DEFAULT_DATE,
    owner='airflow')


def fail():
    raise ValueError('Expected failure.')


def success(ti=None, *args, **kwargs):
    if ti.execution_date != DEFAULT_DATE + timedelta(days=1):
        fail()
    return


# DAG tests that tasks ignore all dependencies

dag1 = DAG(dag_id='test_run_ignores_all_dependencies', default_args=dict(depends_on_past=True, **default_args))
dag1_task1 = PythonOperator(
    task_id='test_run_dependency_task',
    python_callable=fail,
    dag=dag1,)
dag1_task2 = PythonOperator(
    task_id='test_run_dependent_task',
    python_callable=success,
    provide_context=True,
    dag=dag1,)
dag1_task1.set_downstream(dag1_task2)
Example #34
import airflow

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator

args = {"start_date": airflow.utils.dates.days_ago(2), "owner": "royi"}
dag = DAG(dag_id="plugin_other_dag", default_args=args, schedule_interval=None)


def task1():
    print("Hello World!")
    return "Hello World!"


def task2():
    print("Shalom Olam!")
    return "Shalom Olam!"


def task3():
    print("Ola Mundo!")
    return "Ola Mundo!"


op1 = PythonOperator(task_id="task1", python_callable=task1, dag=dag)
op2 = PythonOperator(task_id="task2", python_callable=task2, dag=dag)
op3 = PythonOperator(task_id="task3", python_callable=task3, dag=dag)

op1 >> op2 >> op3
#: Dag spec for dsd approvals
dag = DAG(dag_id='dsd_approvals',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for dsd approvals.
dsd_approvals_latest_only = LatestOnlyOperator(
    task_id='dsd_approvals_latest_only', dag=dag)

#: Get most recent weekly permit approvals reports
get_approvals_files = PythonOperator(
    task_id='get_approvals_files',
    python_callable=dfg.get_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    op_kwargs={'fname_list': fnames,
               'target_dir': dsd_temp_dir},
    dag=dag)


#: dsd_approvals_latest_only must run before get_approvals_files
get_approvals_files.set_upstream(dsd_approvals_latest_only)

#: update github modified date (solar permits)
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)

for key in app.approval_dict:

    #: Consolidate weekly permitting data by scraping OpenDSD API
                                csvwriter.writerow(csv_header)

                                for row in campaign_data:
                                    logging.info(row)

                                    csvwriter.writerow(row)
                    except:
                        pass

            except ValueError:
                pass


        download_data = PythonOperator(
            task_id='task_download_rtbiq_impressions_data'
            ,provide_context=True
            ,python_callable = download_rtbiq_impressions_data
        )

        finish_download = DummyOperator(task_id = 'task_finish_download')
        finish_move_to_client_gcs = DummyOperator(task_id = 'task_finish_move_to_client_gcs')
        finish_upload_to_bq = DummyOperator(task_id = 'task_finish_upload_to_bq')
        complete = DummyOperator(task_id = 'task_complete')

        execution_date = '{{ ds_nodash }}'

        move_to_client_gcs = BashOperator(
            task_id = 'task_move_to_client_gcs'
            ,bash_command='gsutil -m mv ' 
                + 'gs://' + config['gcp_composer_gcs_bucket'] + '/data/rtbiq_data/{0}*'.format(execution_date) + ' ' 
                + 'gs://' + config['gcp_client_gcs_bucket'] + '/data/rtbiq_data/'
Example #37
# All times in Airflow UTC.  Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['public_art']
start_date = general.start_date['public_art']

#: Dag spec
dag = DAG(dag_id='public_art', default_args=args, start_date=start_date, schedule_interval=schedule)

public_art_latest_only = LatestOnlyOperator(task_id='public_art_latest_only', dag=dag)

#: Get public art from NetX, process, output prod file
get_public_art = PythonOperator(
    task_id='get_public_art',
    python_callable=get_public_art,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod art file to S3
upload_public_art = S3FileTransferOperator(
    task_id='upload_public_art',
    source_base_path=conf['prod_data_dir'],
    source_key='public_art_locations_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='public_art/public_art_locations_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
Example #38
    "eamil_on_retry": True,
    "retries": 0,
    "retry_delay": timedelta(minutes=5)
}

dag = DAG(
    "project_pipeline",
    description="Building the entire project",
    # train every first day of the month
    schedule_interval="@monthly",
    default_args=default_args,
    catchup=False)

with dag:
    task_1_create_base_features = PythonOperator(
        task_id="generate_base_features",
        python_callable=Generate_base_features)

    task_2_create_historical_features = PythonOperator(
        task_id="generate_historic_features",
        python_callable=Generate_historical_features)

    task_3_create_advanced_features = PythonOperator(
        task_id="generate_advanced_features",
        python_callable=Generate_advanced_features)

    task_4_select_features = PythonOperator(task_id="feature_selection",
                                            python_callable=Feature_selection)

    task_5_train_lgb_model = PythonOperator(task_id="train_lgb_model",
                                            python_callable=Train_LGB_Model)
Example #39
schedule = general.schedule['fd_incidents']
start_date = general.start_date['fd_incidents']
cur_yr = general.get_year()

#: Dag spec
dag = DAG(dag_id='fd_problem_nature', default_args=args, start_date=start_date, schedule_interval=schedule)

#: Latest Only Operator for fd
fd_latest_only = LatestOnlyOperator(task_id='fd_latest_only', dag=dag)


#: Get fire_department data from DB
get_fd_data = PythonOperator(
    task_id='get_fd_data',
    python_callable=get_fd_data,
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod fire_department_SD.csv file to S3
upload_fd_data = S3FileTransferOperator(
    task_id='upload_fd_data',
    source_base_path=conf['prod_data_dir'],
    source_key='/fd_problems_{}_datasd.csv'.format(cur_yr),
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='fd_cad/' + 'fd_problems_{}_datasd.csv'.format(cur_yr),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
Example #40
    return f"Data sending completed"


with DAG(
        dag_id=f'populate_{Variable.get("postal",2650)}_estates',
        description=f"Populate estates to {TABLE_NAME}",
        default_args=args,
        # Start 10 minutes ago # days_ago(2)
        start_date=datetime.now(),
        schedule_interval="30 23 * * 1-5",
) as dag:

    push_bolig_postal = PythonOperator(
        task_id=f'load_{Variable.get("postal",2650)}_bolig_data',
        python_callable=get_bolig,
        op_args=[
            Variable.get("postal", 2650),
        ],
        dag=dag,
        provide_context=True,
    )

    process_completed = PythonOperator(
        task_id="mining_completed",
        dag=dag,
        python_callable=process_completed,
        provide_context=True,
    )

push_bolig_postal >> process_completed
Example #41
start_date = general.start_date['ttcs']


#: Dag definition
dag = DAG(dag_id='ttcs', default_args=args, start_date=start_date, schedule_interval=schedule['ttcs'])


#: Latest Only Operator for ttcs
ttcs_latest_only = LatestOnlyOperator(
    task_id='ttcs_latest_only', dag=dag)

#: Get active businesses and save as .csv to temp folder
get_active_businesses = PythonOperator(
    task_id='get_active_businesses',
    python_callable=get_active_businesses,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process temp data and save as .csv to prod folder
clean_data = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Geocode new entries and update production file
geocode_data = PythonOperator(
Example #42
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2019, 1, 1, 0, 0, 0, 0),
    schedule_interval="@monthly",
    max_active_runs=1,
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_TRIPS_TABLE_SQL,
)

copy_trips_task = PythonOperator(
    task_id="load_trips_from_s3_to_redshift",
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
)

check_trips = PythonOperator(
    task_id="check_trips_data",
    dag=dag,
    python_callable=check_greater_than_zero,
    provide_context=True,
    params={"table": "trips"},
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
Example #43
    # pushes an XCom without a specific target, just by returning it
    return value_2


def puller(**kwargs):
    ti = kwargs['ti']

    # get value_1
    v1 = ti.xcom_pull(key=None, task_ids='push')
    assert v1 == value_1

    # get value_2
    v2 = ti.xcom_pull(task_ids='push_by_returning')
    assert v2 == value_2

    # get both value_1 and value_2
    v1, v2 = ti.xcom_pull(key=None, task_ids=['push', 'push_by_returning'])
    assert (v1, v2) == (value_1, value_2)


push1 = PythonOperator(
    task_id='push', dag=dag, python_callable=push)

push2 = PythonOperator(
    task_id='push_by_returning', dag=dag, python_callable=push_by_returning)

pull = PythonOperator(
    task_id='puller', dag=dag, python_callable=puller)

pull.set_upstream([push1, push2])
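
# The push callable is defined outside this excerpt; a hedged sketch, consistent with
# the assertions in puller above (the key name here is an assumption):
def push(**kwargs):
    # pushes value_1 as an explicit XCom; puller retrieves it with key=None,
    # i.e. the latest XCom from the 'push' task regardless of key
    kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)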
Example #44
start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

# Create set of TABLES in target Redshift Cluster / Postgres database
create_tables = PostgresOperator(task_id="create_redshift_tables",
                                 dag=dag,
                                 postgres_conn_id="redshift",
                                 sql='/sql_statements/create_tables.sql')

############# Load files to S3
# Load GNIS database file to S3
gnis_to_s3 = PythonOperator(task_id='gnis_to_s3',
                            dag=dag,
                            python_callable=load_to_s3,
                            provide_context=True,
                            op_kwargs={
                                'location': 'tmp_data',
                                'filename': 'NationalFile_20200301.txt',
                                's3_bucket': 'dend-lake',
                                's3_key': 'gnis',
                                'aws_credentials_id': 'aws_credentials',
                            })
############# Load S3 -> Staging Tables
# Load GNIS table to a staging table in Redshift
staging_gnis_2_redshift = PythonOperator(
    task_id='staging_gnis_2_redshift',
    dag=dag,
    python_callable=load_data_to_redshift,
    provide_context=True,
    op_kwargs={
        's3_location': "s3://dend-lake/gnis/NationalFile_20200301.txt",
        'target_table': 'gnis_staging',
dag6_task1 = DummyOperator(
    task_id='test_depends_on_past',
    depends_on_past=True,
    dag=dag6,)
dag6_task2 = DummyOperator(
    task_id='test_depends_on_past_2',
    depends_on_past=True,
    dag=dag6,)
dag6_task2.set_upstream(dag6_task1)


# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that a Dag run that doesn't complete but has a root failure is marked running
            session.delete(entry)
        logging.info("Finished Performing Delete")
    else:
        logging.warn("You're opted to skip deleting the db entries!!!")

    logging.info("Finished Running Cleanup Process")

with DAG(
    DAG_ID,
    default_args=default_args,
    schedule_interval=SCHEDULE_INTERVAL,
    start_date=START_DATE
) as dag:

    close_session = PythonOperator(
        task_id='close_session',
        python_callable=close_session_function,
    )

    print_configuration = PythonOperator(
        task_id='print_configuration',
        python_callable=print_configuration_function,
        provide_context=True,
    )

    for db_object in DATABASE_OBJECTS:

        cleanup = PythonOperator(
            task_id='cleanup_' + str(db_object["airflow_db_model"].__name__),
            python_callable=cleanup_function,
            params=db_object,
            provide_context=True,
Example #47
#: Dag spec for dsd permits
dag = DAG(dag_id='dsd_permits',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for dsd permits.
dsd_permits_latest_only = LatestOnlyOperator(
    task_id='dsd_permits_latest_only', dag=dag)

#: Get permits reports
get_permits_files = PythonOperator(
    task_id='get_permits_files',
    python_callable=get_permits_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Clean permits reports
clean_data = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Join BIDs to permits
join_bids = PythonOperator(
Example #48
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2019, 7, 22, 5, 0),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 0,
    'retry_delay': timedelta(minutes=5),
    'concurrency': 1
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

with DAG('instances_auto_on',
         catchup=False,
         default_args=default_args,
         schedule_interval="0 8 * * *") as dag:
    opr_startup = PythonOperator(task_id='startup',
                                 python_callable=main,
                                 op_kwargs={
                                     'aws_acces_key_id':
                                     '{{ aws_access_key_id }}',
                                     'aws_secret_access_key':
                                     '{{ aws_secret_key }}',
                                     'aws_region': '{{ aws_region }}',
                                     'role_arn': '{{ aws_role_arn }}',
                                     'direct_login': True
                                 })
Example #49

check_updates_with_judges_task = PythonOperator(
    task_id='check_updates_with_judges',
    python_callable=check_updates_with_judges,
    dag=dag)


def extract_name():
    # TODO: Create a function to extract the judge's name from the text
    return None  # http://blog.yhat.com/posts/named-entities-in-law-and-order-using-nlp.html


def check_name():
    # TODO: Verify the extracted name
    return None  # Validate against a database of judge names (transparency portal)


extract_name_task = PythonOperator(
    task_id='extract_name_task',
    python_callable=extract_name,
    dag=dag)

check_name_task = PythonOperator(
    task_id='check_name_task',
    python_callable=check_name,
    dag=dag)

extract_name_task.set_upstream(check_updates_with_judges_task)
check_name_task.set_upstream(extract_name_task)
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates Operators needed for model evaluation and returns.

    It gets prediction over inputs via Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarize and validate
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the metrics that metric_fn generated,
      averaged over all instances.
      The keys of the dictionary match what is given by the
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count', representing the
      total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed when
      the validation result is not good enough to proceed (i.e. to set the
      trained version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphen are allowed (no underscores), since this will be used as dataflow
        job name, which doesn't allow other characters.
    :type task_prefix: str

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: str

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list[str]

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: str

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list[str]

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: str

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: str

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: str

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: str

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: str

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: str

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.models.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: %s",
                             prediction_path)
        summary = os.path.join(obj.strip("/"), "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
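
# A hedged usage sketch tying the docstring example together. The gs:// paths,
# model/version names, and my_dag are placeholders; get_metric_fn_and_keys and
# validate_err_and_count are the helpers shown in the docstring above.
pred_op, summary_op, validate_op = create_evaluate_ops(
    task_prefix='eval',                           # yields eval-prediction / eval-summary / eval-validation
    data_format='TEXT',
    input_paths=['gs://my-bucket/eval/*'],        # placeholder path
    prediction_path='gs://my-bucket/prediction',  # placeholder path
    metric_fn_and_keys=get_metric_fn_and_keys(),
    validate_fn=validate_err_and_count,
    model_name='my_model',                        # placeholder model
    version_name='v1',                            # placeholder version
    dag=my_dag)                                   # placeholder DAG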
Example #51
#: Dag spec
dag = DAG(dag_id='special_events',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)


#: Latest Only Operator for special events
se_latest_only = LatestOnlyOperator(task_id='se_latest_only', dag=dag)


#: Get special events from DB
get_special_events = PythonOperator(
    task_id='get_special_events',
    python_callable=get_special_events,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process and geocode raw special events file
process_special_events = PythonOperator(
    task_id='process_special_events',
    python_callable=process_special_events,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod SE file to S3
upload_special_events = S3FileTransferOperator(
Example #52
dag = DAG('scripts', default_args=default_args, schedule_interval=timedelta(days=1))


def print_thing(thing, **kwargs):
    print(f'{kwargs.get("ds")}: {thing}')
    pprint(kwargs)


def print_five():
    for i in range(5):
        print(i)


t1 = PythonOperator(
    task_id='print_thing',
    python_callable=print_thing,
    provide_context=True,
    op_kwargs=dict(
        thing='something',
    ),
    dag=dag,
)

t2 = PythonOperator(
    task_id='print_five',
    python_callable=print_five,
    dag=dag,
)

t1 >> t2
Пример #53
0
    dag=dag)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://public-qubole/qbol-library/scripts/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    # If the script at s3 location has any qubole specific macros to be replaced
    # macros='[{"date": "{{ ds }}"}, {"name" : "abc"}]',
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
Пример #54
0
    part1 = MIMEText(subtit1, 'html')
    part2 = MIMEText(dados1, 'html')
    part3 = MIMEText(subtit2, 'html')
    part4 = MIMEText(dados2, 'html')
    part5 = MIMEText(subtit3, 'html')
    part6 = MIMEText(dados3, 'html')

    message.attach(sumario)
    message.attach(titulo)
    message.attach(registros)
    message.attach(part1)
    message.attach(part2)
    message.attach(part3)
    message.attach(part4)
    message.attach(part5)
    message.attach(part6)

    # we will connect securely using SSL
    server = smtplib.SMTP_SSL(smtp_ssl_host, smtp_ssl_port)
    # to interact with an external server we will need
    # to log in to it
    server.login(username, password)
    server.sendmail(from_addr, to_addrs, message.as_string())
    server.quit()


t1 = PythonOperator(task_id='popula_relatorios',
                    python_callable=carrega_dados,
                    dag=dag)

t1
Пример #55
0
args = general.args
conf = general.config
schedule = general.schedule['campaign_fin']
start_date = general.start_date['campaign_fin']
cur_yr = general.get_year()

#: Dag spec
dag = DAG(dag_id='campaign_fin_reports', default_args=args, start_date=start_date, schedule_interval=schedule)

campaign_fin_latest_only = LatestOnlyOperator(task_id='campaign_fin_latest_only', dag=dag)

#: Get 460A transactions
schedule_460A = PythonOperator(
    task_id='get_transactions_a',
    python_callable=get_transactions_a,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Get 460B1 transactions
schedule_460B1 = PythonOperator(
    task_id='get_transactions_b',
    python_callable=get_transactions_b,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Get 460C transactions
schedule_460C = PythonOperator(
Пример #56
0
def MakeCommonDag(name='istio_daily_flow_test',
                  schedule_interval='15 9 * * *',
                  monthly=False):
    """Creates the shared part of the daily/monthly dags."""
    common_dag = DAG(
        name,
        default_args=default_args,
        schedule_interval=schedule_interval,
    )

    def AirflowGetVariableOrBaseCase(var, base):
        try:
            return Variable.get(var)
        except KeyError:
            return base

    def GenerateTestArgs(**kwargs):
        """Loads the configuration that will be used for this Iteration."""
        conf = kwargs['dag_run'].conf
        if conf is None:
            conf = dict()
        """ Airflow gives the execution date when the job is supposed to be run,
        however we dont backfill and only need to run one build therefore use
        the current date instead of the date that is passed in """
        #    date = kwargs['execution_date']
        date = datetime.datetime.now()

        timestamp = time.mktime(date.timetuple())

        # Monthly releases started in Nov 2017 with 0.3.0, so minor is # of months
        # from Aug 2017.
        minor_version = (date.year - 2017) * 12 + (date.month - 1) - 7
        major_version = AirflowGetVariableOrBaseCase('major_version', 0)
        # This code gets information about the latest released version so we know
        # what version number to use for this round.
        r_minor = int(AirflowGetVariableOrBaseCase('released_version_minor',
                                                   0))
        r_patch = int(AirflowGetVariableOrBaseCase('released_version_patch',
                                                   0))
        # If we have already released a monthly for this month, then bump
        # the patch number for the remainder of the month.
        if r_minor == minor_version:
            patch = r_patch + 1
        else:
            patch = 0
        # If version is overridden then we should use it; otherwise we use its
        # default or monthly value.
        version = conf.get('VERSION')
        if monthly and not version:
            version = '{}.{}.{}'.format(major_version, minor_version, patch)

        default_conf = environment_config.get_airflow_config(
            version,
            timestamp,
            major=major_version,
            minor=minor_version,
            patch=patch,
            date=date.strftime('%Y%m%d'),
            rc=date.strftime('%H-%M-%S'))
        config_settings = dict(VERSION=default_conf['VERSION'])
        config_settings_name = [
            'PROJECT_ID',
            'MFEST_URL',
            'MFEST_FILE',
            'GCS_STAGING_BUCKET',
            'SVC_ACCT',
            'GITHUB_ORG',
            'GITHUB_REPO',
            'GCS_GITHUB_PATH',
            'TOKEN_FILE',
            'GCR_STAGING_DEST',
            'GCR_RELEASE_DEST',
            'GCS_MONTHLY_RELEASE_PATH',
            'DOCKER_HUB',
            'GCS_BUILD_BUCKET',
            'RELEASE_PROJECT_ID',
        ]

        for name in config_settings_name:
            config_settings[name] = conf.get(name) or default_conf[name]

        if monthly:
            config_settings['MFEST_COMMIT'] = conf.get(
                'MFEST_COMMIT') or Variable.get('latest_sha')
            gcs_path = conf.get('GCS_MONTHLY_STAGE_PATH')
            if not gcs_path:
                gcs_path = default_conf['GCS_MONTHLY_STAGE_PATH']
        else:
            config_settings['MFEST_COMMIT'] = conf.get(
                'MFEST_COMMIT') or default_conf['MFEST_COMMIT']
            gcs_path = conf.get(
                'GCS_DAILY_PATH') or default_conf['GCS_DAILY_PATH']

        config_settings['GCS_STAGING_PATH'] = gcs_path
        config_settings['GCS_BUILD_PATH'] = '{}/{}'.format(
            config_settings['GCS_BUILD_BUCKET'], gcs_path)
        config_settings[
            'GCS_RELEASE_TOOLS_PATH'] = '{}/release-tools/{}'.format(
                config_settings['GCS_BUILD_BUCKET'], gcs_path)
        config_settings['GCS_FULL_STAGING_PATH'] = '{}/{}'.format(
            config_settings['GCS_STAGING_BUCKET'], gcs_path)
        config_settings['ISTIO_REPO'] = 'https://github.com/{}/{}.git'.format(
            config_settings['GITHUB_ORG'], config_settings['GITHUB_REPO'])

        return config_settings

    generate_flow_args = PythonOperator(
        task_id='generate_workflow_args',
        python_callable=GenerateTestArgs,
        provide_context=True,
        dag=common_dag,
    )

    get_git_commit_cmd = """
    {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %}
    git config --global user.name "TestRunnerBot"
    git config --global user.email "*****@*****.**"
    git clone {{ settings.MFEST_URL }} green-builds || exit 2
    pushd green-builds
    git checkout {{ settings.MFEST_COMMIT }} || exit 5
    SHA=`grep {{ settings.GITHUB_ORG }}/{{ settings.GITHUB_REPO }} {{ settings.MFEST_FILE }} | cut -f 6 -d \\"` || exit 3
    if [ -z ${SHA} ]; then
      echo "SHA not found"
      exit 6
    fi
    popd
    git clone {{ settings.ISTIO_REPO }} istio-code
    pushd istio-code/release
    git checkout ${SHA} || exit 4
    gsutil cp *.sh gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/
    gsutil cp *.json gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/
    popd
    pushd green-builds
    git rev-parse HEAD
    """

    get_git_commit = BashOperator(task_id='get_git_commit',
                                  bash_command=get_git_commit_cmd,
                                  xcom_push=True,
                                  dag=common_dag)

    build_template = """
    {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %}
    {% set m_commit = task_instance.xcom_pull(task_ids='get_git_commit') %}
    gsutil cp gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/*.json .
    gsutil cp gs://{{ settings.GCS_RELEASE_TOOLS_PATH }}/data/release/*.sh .
    chmod +x *
    ./start_gcb_build.sh -w -p {{ settings.PROJECT_ID \
    }} -r {{ settings.GCR_STAGING_DEST }} -s {{ settings.GCS_BUILD_PATH }} \
    -v "{{ settings.VERSION }}" \
    -u "{{ settings.MFEST_URL }}" \
    -t "{{ m_commit }}" -m "{{ settings.MFEST_FILE }}" \
    -a {{ settings.SVC_ACCT }}
    """
    # NOTE: if you add commands to build_template after start_gcb_build.sh then take care to preserve its return value

    build = BashOperator(task_id='run_cloud_builder',
                         bash_command=build_template,
                         dag=common_dag)

    test_command = """
    chmod +x /home/airflow/gcs/data/githubctl
    {% set settings = task_instance.xcom_pull(task_ids='generate_workflow_args') %}
    git config --global user.name "TestRunnerBot"
    git config --global user.email "*****@*****.**"
    /home/airflow/gcs/data/githubctl \
    --token_file="{{ settings.TOKEN_FILE }}" \
    --op=dailyRelQual \
    --hub=gcr.io/{{ settings.GCR_STAGING_DEST }} \
    --gcs_path="{{ settings.GCS_BUILD_PATH }}" \
    --tag="{{ settings.VERSION }}"
    """

    run_release_qualification_tests = BashOperator(
        task_id='run_release_qualification_tests',
        bash_command=test_command,
        retries=0,
        dag=common_dag)
    copy_files = GoogleCloudStorageCopyOperator(
        task_id='copy_files_for_release',
        source_bucket=GetSettingTemplate('GCS_BUILD_BUCKET'),
        source_object=GetSettingTemplate('GCS_STAGING_PATH'),
        destination_bucket=GetSettingTemplate('GCS_STAGING_BUCKET'),
        dag=common_dag,
    )
    generate_flow_args >> get_git_commit >> build
    run_release_qualification_tests.set_upstream(build)
    run_release_qualification_tests >> copy_files
    return common_dag, copy_files
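

# A minimal usage sketch, not part of the original source: instantiating the
# shared flow for both cadences. Only the daily defaults come from
# MakeCommonDag's signature above; the monthly name and schedule below are
# assumptions for illustration.
dag_daily, daily_copy_files = MakeCommonDag()
dag_monthly, monthly_copy_files = MakeCommonDag(
    name='istio_monthly_flow_test',   # assumed name, not from the source
    schedule_interval='15 9 1 * *',   # assumed monthly cron, not from the source
    monthly=True)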
Пример #57
0
dag = DAG(
    dag_id='budget',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)


#: Latest Only Operator for budget
budget_latest_only = LatestOnlyOperator(
    task_id='budget_latest_only', dag=dag)

get_accounts = PythonOperator(
    task_id='get_chart_of_accounts',
    python_callable=get_accounts_chart,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

get_capital_ptd = PythonOperator(
    task_id='get_capital_ptd',
    python_callable=get_capital_ptd,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

get_capital_fy = PythonOperator(
    task_id='get_capital_fy',
    python_callable=get_capital,
                                       index=False, encoding=encode_set, mode='a')
                total_count += 1
                print('total count: ' + str(total_count))
        count += 1

    # upload s3
    for deal_ymd in year_month_data:
        contract_df_by_api = pd.read_csv(export_file_path + export_file_name + str(deal_ymd) + '.csv')
        contract_df_by_api.to_csv(csv_buffer)
        s3_resource.Object(bucket, export_file_name + str(deal_ymd) + '.csv').put(Body=csv_buffer.getvalue())

    # send to the kafka broker
    # kafka_producer.produce(contract_df_by_api.to_json(orient='index'))
    # Take only the bucket names from the response and store them as an array in the buckets variable.


t1 = PythonOperator(task_id='task_1',
                    provide_context=True,
                    python_callable=print_variables,
                    op_kwargs={'input_year': Variable.get("arg_year"),
                               'input_month': Variable.get("arg_month")},
                    dag=dag)

t2 = PythonOperator(task_id='task_2',
                    provide_context=True,
                    python_callable=task_detached_contract_data,
                    op_kwargs={'input_year': Variable.get("arg_year"),
                               'input_month': Variable.get("arg_month")},
                    dag=dag)

t1 >> t2
def create_evaluate_ops(task_prefix,
                        data_format,
                        input_paths,
                        prediction_path,
                        metric_fn_and_keys,
                        validate_fn,
                        batch_prediction_job_id=None,
                        project_id=None,
                        region=None,
                        dataflow_options=None,
                        model_uri=None,
                        model_name=None,
                        version_name=None,
                        dag=None):
    """
    Creates and returns the Operators needed for model evaluation.

    It gets prediction over inputs via Cloud ML Engine BatchPrediction API by
    calling MLEngineBatchPredictionOperator, then summarize and validate
    the result via Cloud Dataflow using DataFlowPythonOperator.

    For details and pricing about Batch prediction, please refer to the website
    https://cloud.google.com/ml-engine/docs/how-tos/batch-predict
    and for Cloud Dataflow, https://cloud.google.com/dataflow/docs/

    It returns three chained operators for prediction, summary, and validation,
    named as <prefix>-prediction, <prefix>-summary, and <prefix>-validation,
    respectively.
    (<prefix> should contain only alphanumeric characters or hyphen.)

    The upstream and downstream can be set accordingly like:
      pred, _, val = create_evaluate_ops(...)
      pred.set_upstream(upstream_op)
      ...
      downstream_op.set_upstream(val)

    Callers will provide two python callables, metric_fn and validate_fn, in
    order to customize the evaluation behavior as they wish.
    - metric_fn receives a dictionary per instance derived from json in the
      batch prediction result. The keys might vary depending on the model.
      It should return a tuple of metrics.
    - validate_fn receives a dictionary of the averaged metrics that metric_fn
      generated over all instances.
      The keys/values of the dictionary match what is given by the
      metric_fn_and_keys arg.
      The dictionary contains an additional metric, 'count', to represent the
      total number of instances received for evaluation.
      The function should raise an exception to mark the task as failed in case
      the validation result is not good enough to proceed (i.e. to set the
      trained version as default).

    Typical examples are like this:

    def get_metric_fn_and_keys():
        import math  # imports should be outside of the metric_fn below.
        def error_and_squared_error(inst):
            label = float(inst['input_label'])
            classes = float(inst['classes'])  # 0 or 1
            err = abs(classes-label)
            squared_err = math.pow(classes-label, 2)
            return (err, squared_err)  # returns a tuple.
        return error_and_squared_error, ['err', 'mse']  # key order must match.

    def validate_err_and_count(summary):
        if summary['err'] > 0.2:
            raise ValueError('Too high err>0.2; summary=%s' % summary)
        if summary['mse'] > 0.05:
            raise ValueError('Too high mse>0.05; summary=%s' % summary)
        if summary['count'] < 1000:
            raise ValueError('Too few instances<1000; summary=%s' % summary)
        return summary

    For the details on the other BatchPrediction-related arguments (project_id,
    job_id, region, data_format, input_paths, prediction_path, model_uri),
    please refer to MLEngineBatchPredictionOperator too.

    :param task_prefix: a prefix for the tasks. Only alphanumeric characters and
        hyphens are allowed (no underscores), since this will be used as the
        Dataflow job name, which doesn't allow other characters.
    :type task_prefix: string

    :param data_format: either of 'TEXT', 'TF_RECORD', 'TF_RECORD_GZIP'
    :type data_format: string

    :param input_paths: a list of input paths to be sent to BatchPrediction.
    :type input_paths: list of strings

    :param prediction_path: GCS path to put the prediction results in.
    :type prediction_path: string

    :param metric_fn_and_keys: a tuple of metric_fn and metric_keys:
        - metric_fn is a function that accepts a dictionary (for an instance),
          and returns a tuple of metric(s) that it calculates.
        - metric_keys is a list of strings to denote the key of each metric.
    :type metric_fn_and_keys: tuple of a function and a list of strings

    :param validate_fn: a function to validate whether the averaged metric(s) is
        good enough to push the model.
    :type validate_fn: function

    :param batch_prediction_job_id: the id to use for the Cloud ML Batch
        prediction job. Passed directly to the MLEngineBatchPredictionOperator as
        the job_id argument.
    :type batch_prediction_job_id: string

    :param project_id: the Google Cloud Platform project id in which to execute
        Cloud ML Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['project_id']` will be used.
    :type project_id: string

    :param region: the Google Cloud Platform region in which to execute Cloud ML
        Batch Prediction and Dataflow jobs. If None, then the `dag`'s
        `default_args['region']` will be used.
    :type region: string

    :param dataflow_options: options to run Dataflow jobs. If None, then the
        `dag`'s `default_args['dataflow_default_options']` will be used.
    :type dataflow_options: dictionary

    :param model_uri: GCS path of the model exported by Tensorflow using
        tensorflow.estimator.export_savedmodel(). It cannot be used with
        model_name or version_name below. See MLEngineBatchPredictionOperator for
        more detail.
    :type model_uri: string

    :param model_name: Used to indicate a model to use for prediction. Can be
        used in combination with version_name, but cannot be used together with
        model_uri. See MLEngineBatchPredictionOperator for more detail. If None,
        then the `dag`'s `default_args['model_name']` will be used.
    :type model_name: string

    :param version_name: Used to indicate a model version to use for prediction,
        in combination with model_name. Cannot be used together with model_uri.
        See MLEngineBatchPredictionOperator for more detail. If None, then the
        `dag`'s `default_args['version_name']` will be used.
    :type version_name: string

    :param dag: The `DAG` to use for all Operators.
    :type dag: airflow.DAG

    :returns: a tuple of three operators, (prediction, summary, validation)
    :rtype: tuple(DataFlowPythonOperator, DataFlowPythonOperator,
                  PythonOperator)
    """

    # Verify that task_prefix doesn't have any special characters except hyphen
    # '-', which is the only allowed non-alphanumeric character by Dataflow.
    if not re.match(r"^[a-zA-Z][-A-Za-z0-9]*$", task_prefix):
        raise AirflowException(
            "Malformed task_id for DataFlowPythonOperator (only alphanumeric "
            "and hyphens are allowed but got: " + task_prefix)

    metric_fn, metric_keys = metric_fn_and_keys
    if not callable(metric_fn):
        raise AirflowException("`metric_fn` param must be callable.")
    if not callable(validate_fn):
        raise AirflowException("`validate_fn` param must be callable.")

    if dag is not None and dag.default_args is not None:
        default_args = dag.default_args
        project_id = project_id or default_args.get('project_id')
        region = region or default_args.get('region')
        model_name = model_name or default_args.get('model_name')
        version_name = version_name or default_args.get('version_name')
        dataflow_options = dataflow_options or \
            default_args.get('dataflow_default_options')

    evaluate_prediction = MLEngineBatchPredictionOperator(
        task_id=(task_prefix + "-prediction"),
        project_id=project_id,
        job_id=batch_prediction_job_id,
        region=region,
        data_format=data_format,
        input_paths=input_paths,
        output_path=prediction_path,
        uri=model_uri,
        model_name=model_name,
        version_name=version_name,
        dag=dag)

    metric_fn_encoded = base64.b64encode(dill.dumps(metric_fn, recurse=True))
    evaluate_summary = DataFlowPythonOperator(
        task_id=(task_prefix + "-summary"),
        py_options=["-m"],
        py_file="airflow.contrib.operators.mlengine_prediction_summary",
        dataflow_default_options=dataflow_options,
        options={
            "prediction_path": prediction_path,
            "metric_fn_encoded": metric_fn_encoded,
            "metric_keys": ','.join(metric_keys)
        },
        dag=dag)
    evaluate_summary.set_upstream(evaluate_prediction)

    def apply_validate_fn(*args, **kwargs):
        prediction_path = kwargs["templates_dict"]["prediction_path"]
        scheme, bucket, obj, _, _ = urlsplit(prediction_path)
        if scheme != "gs" or not bucket or not obj:
            raise ValueError("Wrong format prediction_path: %s",
                             prediction_path)
        summary = os.path.join(obj.strip("/"),
                               "prediction.summary.json")
        gcs_hook = GoogleCloudStorageHook()
        summary = json.loads(gcs_hook.download(bucket, summary))
        return validate_fn(summary)

    evaluate_validation = PythonOperator(
        task_id=(task_prefix + "-validation"),
        python_callable=apply_validate_fn,
        provide_context=True,
        templates_dict={"prediction_path": prediction_path},
        dag=dag)
    evaluate_validation.set_upstream(evaluate_summary)

    return evaluate_prediction, evaluate_summary, evaluate_validation
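

# A minimal usage sketch, not part of the original module: it wires the three
# returned operators into a toy DAG using the helper functions shown in the
# docstring above. The dag_id, project, region, GCS paths, and model/version
# names are placeholder assumptions, not values from the source.
import math
from datetime import datetime, timedelta

from airflow import DAG


def get_metric_fn_and_keys():
    def error_and_squared_error(inst):
        label = float(inst['input_label'])
        classes = float(inst['classes'])  # 0 or 1
        return abs(classes - label), math.pow(classes - label, 2)
    return error_and_squared_error, ['err', 'mse']


def validate_err_and_count(summary):
    if summary['err'] > 0.2:
        raise ValueError('Too high err>0.2; summary=%s' % summary)
    if summary['count'] < 1000:
        raise ValueError('Too few instances<1000; summary=%s' % summary)
    return summary


example_dag = DAG(
    dag_id='example_mlengine_evaluate',            # assumed dag_id
    default_args={
        'project_id': 'my-gcp-project',            # assumed project
        'region': 'us-central1',                   # assumed region
        'dataflow_default_options': {'tempLocation': 'gs://my-bucket/tmp'},
    },
    start_date=datetime(2019, 1, 1),
    schedule_interval=timedelta(days=1))

pred, summary_op, val = create_evaluate_ops(
    task_prefix='eval-example',                    # assumed prefix
    data_format='TEXT',
    input_paths=['gs://my-bucket/eval/part-*'],    # assumed input path
    prediction_path='gs://my-bucket/eval/output',  # assumed output path
    metric_fn_and_keys=get_metric_fn_and_keys(),
    validate_fn=validate_err_and_count,
    model_name='my_model',                         # assumed model name
    version_name='v1',                             # assumed version name
    dag=example_dag)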
Пример #60
0
import datetime
import logging

from airflow import DAG
from airflow.operators.python_operator import PythonOperator


def hello_world():
    logging.info("Hello World")

#
# TODO: Add a daily `schedule_interval` argument to the following DAG
#
dag = DAG(
        "exercise2",
        start_date=datetime.datetime.now() - datetime.timedelta(days=2))

task = PythonOperator(
        task_id="hello_world_task",
        python_callable=hello_world,
        dag=dag)
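

# One possible completion of the TODO above, shown as a sketch rather than the
# official solution: schedule the DAG daily via the '@daily' preset. The dag_id
# and variable names below are assumptions chosen to avoid clashing with the
# exercise DAG.
dag_daily = DAG(
        "exercise2_solution",
        schedule_interval="@daily",
        start_date=datetime.datetime.now() - datetime.timedelta(days=2))

daily_task = PythonOperator(
        task_id="hello_world_task",
        python_callable=hello_world,
        dag=dag_daily)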