Example #1
chem2_pdb_prot2_pdb = BashOperator(
    task_id='chem2_pdb_prot2_pdb', 
    bash_command="(cd /working-directory; virtualScreening.py -l chem2.pdb -o result -p prot2.pdb) ", 
    dag=dag)

chem2_pdb_prot2_pdb_success_mail = EmailOperator(
    task_id="chem2_pdb_prot2_pdb_success_mail", 
    to=[u'*****@*****.**'],  
    subject="chem2_pdb_prot2_pdb success",  
    html_content="chem2_pdb_prot2_pdb success",  
    dag=dag)
                
chem2_pdb_prot2_pdb_success_mail.set_upstream(chem2_pdb_prot2_pdb)
#chem2_pdb_prot2_pdb.set_upstream( )


chem2_pdb_prot3_pdb = BashOperator(
    task_id='chem2_pdb_prot3_pdb', 
    bash_command="(cd /working-directory; virtualScreening.py -l chem2.pdb -o result -p prot3.pdb) ", 
    dag=dag)

chem2_pdb_prot3_pdb_success_mail = EmailOperator(
    task_id="chem2_pdb_prot3_pdb_success_mail", 
    to=[u'*****@*****.**'],  
    subject="chem2_pdb_prot3_pdb success",  
    html_content="chem2_pdb_prot3_pdb success",  
    dag=dag)
                
chem2_pdb_prot3_pdb_success_mail.set_upstream(chem2_pdb_prot3_pdb)
chem2_pdb_prot3_pdb.set_upstream(chem1_pdb_prot1_pdb)

Example #2
t1 = PythonOperator(
    task_id='clear_scrape_folder',
    python_callable=clear_folder,
    dag=dag)

# TODO properly import python classes
t2 = BashOperator(
    task_id='scrape_profile_images',
    bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper),
    dag=dag)

t3 = PythonOperator(
    task_id='scrape_progress',
    python_callable=print_scrape_in_progress,
    dag=dag)

t4 = BashOperator(
    task_id='create_landmarks',
    bash_command='cd {} && python landmark.py'.format(averageface_path),
    dag=dag)

t5 = BashOperator(
    task_id='create_average_face',
    bash_command='cd {} && python averageface.py'.format(averageface_path),
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
"""
Executing tasks at a particular time of the day using sensor operator.
"""
from airflow import DAG
from airflow.operators import BashOperator, TimeSensor
from datetime import datetime, timedelta, time


default_args = {"owner": "Samarth", "start_date": datetime(2016, 03, 15, 12)}

# "schedule_interval" is your cron expression you can write any cron expression like unix cron.
dag = DAG("airflow_task_with_time_sensor", default_args=default_args, schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id="dependency_for_sensor", bash_command='echo "Sensor would only be enabled after I am done!"', dag=dag
)

# The sensor operator takes "target_time", which is a specific time of day irrespective of the date/day.
# The sensor succeeds once the target time has passed; in this case, after 10:55 in the morning.
sensor_task = TimeSensor(task_id="sensor_task", target_time=time(10, 55, 1, 1), dag=dag)

post_sensor_task = BashOperator(
    task_id="post_sensor_task", bash_command='echo "I am done, it means sensor has done its job."', dag=dag
)

# Setting up the correct dependencies for defined tasks.
sensor_task.set_upstream(bash_task)
post_sensor_task.set_upstream(sensor_task)
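A few illustrative schedule_interval values for reference (only "1 * * * *" comes from the example above):
# "1 * * * *"    -> minute 1 of every hour (as used above)
# "0 6 * * *"    -> every day at 06:00
# "*/15 * * * *" -> every 15 minutes
# Airflow also accepts presets such as "@daily" or "@once" and timedelta objects.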
Example #4
                    'mansortdir': MANSORT_DIR},
            dag=dag)

        email_me = EmailOperator(
            task_id='email_me',
            to=default_args['email'],
            subject='%s is complete' % dag_id,
            html_content='You may now manually sort on NIAO',
            dag=dag)

        slack_it = SlackAPIPostOperator(
            task_id='slack_it',
            token=SLACK_TOKEN,
            text='%s is complete' % dag_id,
            channel='#ephys',
            dag=dag)

        make_kwd_task.set_upstream(make_klusta_dir_task)
        phy_task.set_upstream(make_kwd_task)
        #merge_events_task.set_upstream(phy_task)
        clear_phy_task.set_upstream(phy_task)
        make_kwik_bak_dir_task.set_upstream(phy_task)
        mv_kwik_bak_task.set_upstream(make_kwik_bak_dir_task)
        #rsync_task.set_upstream(merge_events_task)
        rsync_task.set_upstream(clear_phy_task)
        rsync_task.set_upstream(mv_kwik_bak_task)
        email_me.set_upstream(rsync_task)
        slack_it.set_upstream(rsync_task)
     
        globals()[dag_id] = dag
    bash_command=create_command7,
    dag=dag)

# Drop and re-create the BigQuery table if it exists
create_command8 = 'su {{ params.username }} -c "/home/hduser/dba/bin/python/etl_python_oracle_to_aerospike_and_GCP.ksh -O 5 > "/d4T/hduser/airflow/run_logs/t8_etl_python_oracle_to_aerospike_and_GCP_drop_and_create_BigQuery_table"_"`date +%Y%m%d_%H%M`"."log" 2>&1 "'
t8 = BashOperator(task_id='t8_drop_and_create_BigQuery_table_if_exists',
                  bash_command=create_command8,
                  dag=dag)

# Load data into BigQuery table from csv file
create_command9 = 'su {{ params.username }} -c "/home/hduser/dba/bin/python/etl_python_oracle_to_aerospike_and_GCP.ksh -O 6 > "/d4T/hduser/airflow/run_logs/t9_etl_python_oracle_to_aerospike_and_GCP_load_data_into_BigQuery_table_from_CSV_file"_"`date +%Y%m%d_%H%M`"."log" 2>&1 "'
t9 = BashOperator(task_id='t9_Load_data_into_BigQuery_table_from_csv_file',
                  bash_command=create_command9,
                  dag=dag)

# Read data from BigQuery table
create_command10 = 'su {{ params.username }} -c "/home/hduser/dba/bin/python/etl_python_oracle_to_aerospike_and_GCP.ksh -O 7 > "/d4T/hduser/airflow/run_logs/t10_etl_python_oracle_to_aerospike_and_GCP_Read_data_from_BigQuery_table"_"`date +%Y%m%d_%H%M`"."log" 2>&1 "'
t10 = BashOperator(task_id='t10_Read_data_from_BigQuery_table',
                   bash_command=create_command10,
                   dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t1)
t5.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t1)
t8.set_upstream(t7)
t9.set_upstream(t8)
t10.set_upstream(t9)
Example #6
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('HelloWorld4', default_args=default_args)

# t1, t2, t3 and t4 are examples of tasks created using operators

t1 = BashOperator(task_id='task_1',
                  bash_command='echo "Hello World from Task 1"',
                  dag=dag)

t2 = BashOperator(task_id='task_2',
                  bash_command='echo "Hello World from Task 2"',
                  dag=dag)

t3 = BashOperator(task_id='task_3',
                  bash_command='echo "Hello World from Task 3"',
                  dag=dag)

t4 = BashOperator(task_id='task_4',
                  bash_command='echo "Hello World from Task 4"',
                  dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is your cron expression you can write any cron expression like unix cron.
dag = DAG('airflow_task_with_hdfs_sensor', default_args=default_args, schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id='dependency_for_hdfs_sensor',
    bash_command='echo "HDFS sensor would only be enabled after I am done!"',
    dag=dag)

# Sensor operator takes "filepath" to check if this file is present in hdfs or not.
# "hdfs_conn_id" is configured in ui Admin--> Connection.
hdfs_sensor_task = HdfsSensor(
    task_id='hdfs_sensor_task',
    filepath='/user/samarthg/input2',
    hdfs_conn_id='webhdfs_default',
    dag=dag)

post_hdfs_sensor_task = BashOperator(
    task_id='post_hdfs_sensor_task',
    bash_command='echo "I am done, it means sensor has done its job."',
    dag=dag)

# Setting up the correct dependencies for defined tasks.
hdfs_sensor_task.set_upstream(bash_task)
post_hdfs_sensor_task.set_upstream(hdfs_sensor_task)
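This snippet assumes imports along the lines of the TimeSensor example earlier; a hedged sketch for Airflow 1.x (the import locations are an assumption, they are not shown in the original):
from airflow import DAG
from airflow.operators import BashOperator, HdfsSensor
from datetime import datetime, timedelta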
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
    python_callable=DataPutter(),
    dag=dag)

delete_object = PythonOperator(
    task_id='delete-object-from-s3',
    python_callable=DeleteObject(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_file,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)



get_file.set_upstream(put_file)
hello_world_docker_write_logs.set_upstream(get_file)
check_read_logs.set_upstream(hello_world_docker_write_logs)
cleanup.set_upstream(check_read_logs)
cleanup.set_upstream(get_file)
delete_object.set_upstream(get_file)



Example #9
        #     task_id='rsync',
        #     bash_command=as_user(rsync_command, USER),
        #     params={'klustadir': KLUSTA_DIR,
        #             'mansortdir': MANSORT_DIR,
        #             'mansorthost': MANSORT_HOST},
        #     dag=dag)

        email_me = EmailOperator(
            task_id='email_me',
            to=default_args['email'],
            subject='%s is complete' % dag_id,
            html_content='You may now manually sort on NIAO',
            dag=dag)


        make_kwd_task.set_upstream(make_klusta_dir_task)
        phy_task.set_upstream(make_kwd_task)
        #merge_events_task.set_upstream(phy_task)
        clear_phy_task.set_upstream(phy_task)
        make_kwik_bak_dir_task.set_upstream(phy_task)
        mv_kwik_bak_task.set_upstream(make_kwik_bak_dir_task)
        # make_mansort_dir_task.set_upstream(phy_task)
        # rsync_task.set_upstream(clear_phy_task)
        # rsync_task.set_upstream(mv_kwik_bak_task)
        # rsync_task.set_upstream(make_mansort_dir_task)
        # email_me.set_upstream(rsync_task)
        email_me.set_upstream(mv_kwik_bak_task)
        email_me.set_upstream(clear_phy_task)
     
        globals()[dag_id] = dag
def my_py_command(ds, **kwargs):
    # Print out the "foo" param passed in via
    # `airflow test example_passing_params_via_test_command run_this <date>
    # -tp '{"foo":"bar"}'`
    if kwargs["test_mode"]:
        print(" 'foo' was passed in via test={} command : kwargs[params][foo] \
               = {}".format( kwargs["test_mode"], kwargs["params"]["foo"]) )
    # Print out the value of "miff", passed in below via the Python Operator
    print(" 'miff' was passed in via task params = {}".format( kwargs["params"]["miff"]) )
    return 1

my_templated_command = """
    echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
    echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(
    task_id='run_this',
    provide_context=True,
    python_callable=my_py_command,
    params={"miff":"agg"},
    dag=dag)

also_run_this = BashOperator(
    task_id='also_run_this',
    bash_command=my_templated_command,
    params={"miff":"agg"},
    dag=dag)
also_run_this.set_upstream(run_this)
local_dir = "/tmp/"
# define the location where you want to store the files in HDFS
hdfs_dir = " /tmp/"
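# note: the leading space in hdfs_dir separates the local file from the HDFS target path
# when the strings are concatenated into bash_command below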

for channel in to_channels:

    file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"

    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
        bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " +
                     local_dir + file_name +
                     hdfs_dir + channel + "/",
        dag=dag)

    load_to_hdfs.set_upstream(analyze_tweets)

    load_to_hive = HiveOperator(
        task_id="load_" + channel + "_to_hive",
        hql="LOAD DATA INPATH '" +
            hdfs_dir + channel + "/" + file_name + "' "
            "INTO TABLE " + channel + " "
            "PARTITION(dt='" + dt + "')",
        dag=dag)
    load_to_hive.set_upstream(load_to_hdfs)
    load_to_hive.set_downstream(hive_to_mysql)

for channel in from_channels:
    file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv"
    load_to_hdfs = BashOperator(
        task_id="put_" + channel + "_to_hdfs",
Example #12
            params={'rasterdir': RASTER_DIR},
            on_success_callback = lambda c: set_perms(c['params']['rasterdir'],default_args['owner']), 
            dag=dag)

        make_raster_task = BashOperator(
            task_id='make_rasters',
            bash_command=make_raster_cmd,
            env={'PATH': ANACONDA_PATH},
            params={'postphydir': POSTPHY_DIR,
                    'ecanalysispath': ECANALYSIS_PATH,
                    'rasterdir': RASTER_DIR},
            dag=dag)

    ############ Report Completion
        email_me = EmailOperator(
            task_id='email_me',
            to=default_args['email'],
            subject='%s is merged' % dag_id,
            html_content='You may commence analysis.',
            dag=dag)


        rsync_task.set_upstream(make_postphy_dir_task)
        merge_events_task.set_upstream(rsync_task)
        kwik2pandas_task.set_upstream(merge_events_task)
        email_me.set_upstream(kwik2pandas_task)
        make_raster_dir_task.set_upstream(kwik2pandas_task)
        make_raster_task.set_upstream(make_raster_dir_task)
     
        globals()[dag_id] = dag
Example #13
#Run Camus to pull messages from Kafka into HDFS
camus_a = BashOperator(
        task_id = 'camus_a',
        bash_command='tasks/run_camus.sh',
        depends_on_past=1,
        dag = dag)

#Run Spark to sum all historical trades and write to Cassandra
trades_batch_a = BashOperator(
        task_id = 'trades_batch_a',
        bash_command='tasks/run_trades_batch.sh',
        depends_on_past=1,
        dag = dag)

#set trades batch after news batch to give it more memory
trades_batch_a.set_upstream(camus_a)

#Update Cassandra's stream 2 table to include counts from the batch run, with all the trades
#summed from stock_count_rts1, which were the trades that came in since task1_camus started running
sum_batch_a_rts2 = BashOperator(
        task_id = 'sum_batch_a_rts2',
        bash_command='tasks/sum_batch_rts2.sh',
        depends_on_past=1,
        dag = dag)

sum_batch_a_rts2.set_upstream(trades_batch_a)

#stop streaming of trades while the database is getting updated
stop_trade_stream_a = BashOperator(
        task_id = 'stop_trade_stream_a',
        bash_command='tasks/stop_trade_stream.sh',
        depends_on_past=1,
Example #14
aml_utils = load_source(
    'aml_utils',
    "{pf}/asiamiles_airflow_extensions/utils.py".format(
        pf=configuration.get('core', 'plugins_folder')))

mod_config = aml_utils.load_config(
    "{dag_folder}/pros_etl.cfg".format(
        dag_folder=dirname(realpath(__file__))))

hdfs_home=mod_config['hadoop']['hdfs_home']

copy_rsynced_files_to_hadoop = BashOperator(
    task_id="copy_rsynced_files_to_hadoop",
    bash_command="hadoop fs -put -f /data1/staging/pros/* pros",
    dag=dag)

#spark-shell --master yarn-client

update_seat_idx = BashOperator(
  task_id="update_seat_idx",
  bash_command="cat /data1/airflow/dags/pros-etl/pros_seat_index_hist_load.scala | spark-shell --master yarn-client",
  dag=dag)

update_curve = BashOperator(
  task_id="update_curve",
  bash_command="cat /data1/airflow/dags/pros-etl/pros_bid_price_hist_load.scala | spark-shell --master yarn-client",
  dag=dag)

update_seat_idx.set_upstream(copy_rsynced_files_to_hadoop)
update_curve.set_upstream(copy_rsynced_files_to_hadoop)
Example #15
                                      'mansortdir': MANSORT_DIR
                                  },
                                  dag=dag)

        email_me = EmailOperator(
            task_id='email_me',
            to=default_args['email'],
            subject='%s is complete' % dag_id,
            html_content='You may now manually sort on NIAO',
            dag=dag)

        slack_it = SlackAPIPostOperator(task_id='slack_it',
                                        token=SLACK_TOKEN,
                                        text='%s is complete' % dag_id,
                                        channel='#ephys',
                                        dag=dag)

        make_kwd_task.set_upstream(make_klusta_dir_task)
        phy_task.set_upstream(make_kwd_task)
        #merge_events_task.set_upstream(phy_task)
        clear_phy_task.set_upstream(phy_task)
        make_kwik_bak_dir_task.set_upstream(phy_task)
        mv_kwik_bak_task.set_upstream(make_kwik_bak_dir_task)
        #rsync_task.set_upstream(merge_events_task)
        rsync_task.set_upstream(clear_phy_task)
        rsync_task.set_upstream(mv_kwik_bak_task)
        email_me.set_upstream(rsync_task)
        slack_it.set_upstream(rsync_task)

        globals()[dag_id] = dag
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('dag', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
requirements = BashOperator(
    task_id='requirements',
    bash_command='pip install -r requirements.txt',
    dag=dag)

data = BashOperator(
    task_id='data',
    bash_command='python src/make_dataset.py',
    dag=dag)

clean = BashOperator(
    task_id='clean',
    bash_command='find . -name "*.pyc" -exec rm {} \;',
    dag=dag)

lint = BashOperator(
    task_id='flake8',
    bash_command='flake8 .',
    dag=dag)

data.set_upstream(requirements)
Example #17
    dag=dag)

t1bis = BashOperator(
    task_id='send_to_linkproxy_weekly_catalog',
    bash_command=
    'su datamanufactory -c "cd /srv/datamanufactory/data-workflow/ && /anaconda3/bin/python 1bis_send_catalog_to_linkproxy.py run"',
    dag=dag)

t2 = BashOperator(task_id='wait_webhook_to_hook',
                  bash_command='su datamanufactory -c "sleep 9200"',
                  dag=dag)

t3 = BashOperator(
    task_id='csv_detective_analysis',
    bash_command=
    'su datamanufactory -c "source /anaconda3/etc/profile.d/conda.sh && cd /srv/datamanufactory/data-workflow/ && conda activate csvdeploy && python 3_csv_detective_analysis.py run"',
    dag=dag)

t4 = BashOperator(
    task_id='send_metadata_to_elk',
    bash_command=
    'su datamanufactory -c "cd /srv/datamanufactory/data-workflow/ && /anaconda3/bin/python 4_ingest_elk.py"',
    dag=dag)

t1.set_upstream(t0)
t1bis.set_upstream(t0)
t2.set_upstream(t1)
t2.set_upstream(t1bis)
t3.set_upstream(t2)
t4.set_upstream(t3)
Example #18
    def poke(self, context):
        element = self.elementName
        if element not in elementList:
            return True
        else:
            return False


default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

with models.DAG('CheckListSensor',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:

    createBucket = BashOperator(task_id='yes_it_exists',
                                bash_command='echo DONE',
                                dag=dag)

    IsBucketExists = ListSensor(task_id='isElementExist',
                                elementName=10,
                                dag=dag)

createBucket.set_upstream(IsBucketExists)
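Only the poke() method of the custom sensor appears above; a minimal sketch of how such a ListSensor could be declared in Airflow 1.x (a hypothetical reconstruction, not the original class):
from airflow.operators.sensors import BaseSensorOperator
from airflow.utils.decorators import apply_defaults

elementList = [1, 2, 3]  # placeholder; the original defines this list elsewhere


class ListSensor(BaseSensorOperator):
    @apply_defaults
    def __init__(self, elementName, *args, **kwargs):
        super(ListSensor, self).__init__(*args, **kwargs)
        self.elementName = elementName

    def poke(self, context):
        # succeed once the element is no longer present in elementList
        return self.elementName not in elementList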
Example #19
}

dag = DAG('jarvisworkflow', default_args=default_args)

t1 = BashOperator(task_id='train',
                  bash_command='python3 ' + abspath + '/crawler.py tr',
                  dag=dag)

t2 = BashOperator(task_id='test',
                  bash_command='python3 ' + abspath + '/crawler.py te',
                  dag=dag)

t3 = BashOperator(task_id='cleantr',
                  bash_command='python3 ' + abspath + '/cleaner.py tr',
                  dag=dag)
t3.set_upstream(t1)

t4 = BashOperator(task_id='cleante',
                  bash_command='python3 ' + abspath + '/cleaner.py te',
                  dag=dag)
t4.set_upstream(t2)

t5 = BashOperator(task_id='predict',
                  bash_command='python3 ' + abspath + '/predictor.py',
                  dag=dag)
t5.set_upstream(t3)

t6 = BashOperator(task_id='validate',
                  bash_command='python3 ' + abspath + '/validator.py',
                  dag=dag)
t6.set_upstream([t4, t5])
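The list form of set_upstream wires t6 after both t4 and t5; the same graph can be expressed with Airflow's bitshift operators (an equivalent sketch of the dependencies above):
t1 >> t3 >> t5 >> t6
t2 >> t4 >> t6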
Example #20
t3 = BashOperator(
    task_id='cloud-events-usage-load_rawxml',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/backup_cloud_usage_events_rawxml.sh;',
    bash_command=script_folder +
    'cloud_events_usage/backup_cloud_usage_events_rawxml.sh;',
    dag=dag)
t4 = BashOperator(
    task_id='cloud-events-usage-load_rawxml1',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/backup_cloud_usage_events_rawxml1.sh;',
    bash_command=script_folder +
    'cloud_events_usage/backup_cloud_usage_events_rawxml1.sh;',
    dag=dag)
t5 = BashOperator(
    task_id='cloud-events-usage-load_glance_and_nova',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/backup_glance_and_nova.sh;',
    bash_command=script_folder +
    'cloud_events_usage/backup_glance_and_nova.sh;',
    dag=dag)
t6 = BashOperator(
    task_id='cloud-events-usage-verify-load',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/checkDailyLoad.sh;',
    bash_command=script_folder +
    'ods_archiving/checkDailyLoad.sh cloud_usage_events;',
    dag=dag,
    trigger_rule=TR.ALL_DONE)
t6.set_upstream(t1)
t6.set_upstream(t2)
t6.set_upstream(t3)
t6.set_upstream(t4)
t6.set_upstream(t5)
Example #21
        bash_command='./sqoop_incremental.sh',
        dag=dag
)

# merge the data from Mysql table to HDFS
task2 = BashOperator(
        task_id= 'sqoop_merge_import',
        bash_command='./sqoop_merge.sh',
        dag=dag
)

# Inserting the data from Hive external table to the target table
task3 = HiveOperator(
        task_id= 'hive_insert',
        hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;',
        depends_on_past=True,
        dag=dag
)

# Inserting the data from Hive table with masked ssn external table to the target table
task4 = HiveOperator(
        task_id= 'hive_insert_masked',
        hql='add jar /home/cloudera/Masking.jar;create TEMPORARY function masking as \'Masking\';INSERT OVERWRITE table employee_profile_masked SELECT profile_id,first_name,last_name,modified_date,masking(ssn) FROM employee_profile;',
        depends_on_past=True,
        dag=dag
)
# defining the job dependency
task2.set_upstream(task1)
task3.set_upstream(task2)
task4.set_upstream(task3)
Example #22
    'tutorial_mod', default_args=default_args, schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(
    task_id='templated',
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
Example #23
          schedule_interval=timedelta(2))

#Run Camus to pull messages from Kafka into HDFS
camus_a = BashOperator(task_id='camus_a',
                       bash_command='tasks/run_camus.sh',
                       depends_on_past=1,
                       dag=dag)

#Run Spark to sum all historical trades and write to Cassandra
trades_batch_a = BashOperator(task_id='trades_batch_a',
                              bash_command='tasks/run_trades_batch.sh',
                              depends_on_past=1,
                              dag=dag)

#set trades batch after news batch to give it more memory
trades_batch_a.set_upstream(camus_a)

#Update Cassandra's stream 2 table to include counts from the batch run, with all the trades
#summed from stock_count_rts1, which were the trades that came in since task1_camus started running
sum_batch_a_rts2 = BashOperator(task_id='sum_batch_a_rts2',
                                bash_command='tasks/sum_batch_rts2.sh',
                                depends_on_past=1,
                                dag=dag)

sum_batch_a_rts2.set_upstream(trades_batch_a)

#stop streaming of trades while the database is getting updated
stop_trade_stream_a = BashOperator(task_id='stop_trade_stream_a',
                                   bash_command='tasks/stop_trade_stream.sh',
                                   depends_on_past=1,
                                   dag=dag)
Example #24
    'depends_on_past': False,
    'start_date': datetime.combine(today, time(20, 00, 0)) - timedelta(days=1),
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}
TR = TriggerRule
dag = DAG('usl_ods_v1',
          default_args=default_args,
          schedule_interval=timedelta(days=1))
script_folder = DAGS_FOLDER + '/../scripts/'
t0 = BashOperator(task_id='usldb_ods_incremental1',
                  bash_command=script_folder +
                  'usl_ods/usldb_ods_incremental1.sh;',
                  dag=dag)
t1 = BashOperator(task_id='usldb_ods_incremental2',
                  bash_command=script_folder +
                  'usl_ods/usldb_ods_incremental2.sh;',
                  dag=dag)
t2 = BashOperator(task_id='usldb_ods_full_load_all',
                  bash_command=script_folder +
                  'usl_ods/usldb_ods_full_load_all.sh;',
                  dag=dag)
t3 = BashOperator(task_id='verify_load',
                  bash_command=script_folder +
                  'ods_archiving/checkDailyLoad.sh usldb_ods;',
                  dag=dag,
                  trigger_rule=TR.ALL_DONE)
t3.set_upstream(t0)
t3.set_upstream(t1)
t3.set_upstream(t2)
Example #25
          schedule_interval=schedule_interval,
          default_args=default_args,
          max_active_runs=1)

dbt_clone = BashOperator(
    task_id='dbt_clone',
    bash_command=
    'cd ~/project && git fetch --all && git reset --hard origin/master',
    dag=dag)

dbt_deps = BashOperator(
    task_id='dbt_deps',
    bash_command='cd ~/project && dbt deps  --profile=warehouse --target=prod',
    dag=dag)

dbt_deps.set_upstream(dbt_clone)

dbt_clean = BashOperator(
    task_id='dbt_clean',
    bash_command='cd ~/project && dbt clean  --profile=warehouse --target=prod',
    dag=dag)
dbt_clean.set_upstream(dbt_deps)

dbt_archive = BashOperator(
    task_id='dbt_archive',
    bash_command=
    'cd ~/project && dbt archive  --profile=warehouse --target=prod',
    dag=dag)

dbt_archive.set_upstream(dbt_clean)
          default_args=default_args,
          schedule_interval="0 * * * *",
          start_date=datetime.now() - timedelta(minutes=10))

c = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_parse_process.sh
"""

c1 = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_ftp_process.sh
"""

c2 = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_post_parse_process.sh
"""

pullFiles = BashOperator(task_id='zteumts--pullFiles',
                         bash_command=c1,
                         dag=dag)

parseFiles = BashOperator(task_id='zteumts--parseFiles',
                          bash_command=c,
                          dag=dag)

moveFilesToDatalake = BashOperator(task_id='zteumts--moveFilesToDataLake',
                                   bash_command=c2,
                                   dag=dag)

parseFiles.set_upstream(pullFiles)
moveFilesToDatalake.set_upstream(parseFiles)
Example #27
    # The contents of the obtained file_path_sp directory
    file_names_sp = os.listdir(file_path_sp)

    # Get ul file path
    file_path_ul = task_name_ul + '/' + option + '/bin/'
    # The contents of the obtained file_path_ul directory
    file_names_ul = os.listdir(file_path_ul)
    # Loop to get sp each file name
    for file_name_sp in file_names_sp:
        if os.path.isfile(file_path_sp + file_name_sp):
            task_option_path = '/usr/bin/perl ' + task_name_sp + '/' + option + '/bin/' + file_name_sp + ' '
            task_option_path_ld = '/usr/bin/perl ' + task_name_ld + '/' + option + '/bin/StructuralLoad.pl '
            t = BashOperator(task_id='LD_' + option,
                             bash_command=task_option_path_ld,
                             dag=dag)
            t.set_upstream(branching)
            dummy_follow = BashOperator(task_id='SP_' + option,
                                        bash_command=task_option_path,
                                        dag=dag)
            t.set_downstream(dummy_follow)
            dummy_follow.set_downstream(join)
    # Loop to get ul each file name
    for file_name_ul in file_names_ul:
        if os.path.isfile(file_path_ul + file_name_ul):
            task_option_path = '/usr/bin/perl ' + task_name_ul + '/' + option + '/bin/' + file_name_ul + ' '
            dummy_follow = BashOperator(task_id='UL_' + option,
                                        bash_command=task_option_path,
                                        dag=dag)
            dummy_follow.set_upstream(branching)
            dummy_follow.set_downstream(join)
Example #28
        # rsync_task = BashOperator(
        #     task_id='rsync',
        #     bash_command=as_user(rsync_command, USER),
        #     params={'klustadir': KLUSTA_DIR,
        #             'mansortdir': MANSORT_DIR,
        #             'mansorthost': MANSORT_HOST},
        #     dag=dag)

        email_me = EmailOperator(
            task_id='email_me',
            to=default_args['email'],
            subject='%s is complete' % dag_id,
            html_content='You may now manually sort on NIAO',
            dag=dag)

        make_kwd_task.set_upstream(make_klusta_dir_task)
        phy_task.set_upstream(make_kwd_task)
        #merge_events_task.set_upstream(phy_task)
        clear_phy_task.set_upstream(phy_task)
        make_kwik_bak_dir_task.set_upstream(phy_task)
        mv_kwik_bak_task.set_upstream(make_kwik_bak_dir_task)
        # make_mansort_dir_task.set_upstream(phy_task)
        # rsync_task.set_upstream(clear_phy_task)
        # rsync_task.set_upstream(mv_kwik_bak_task)
        # rsync_task.set_upstream(make_mansort_dir_task)
        # email_me.set_upstream(rsync_task)
        email_me.set_upstream(mv_kwik_bak_task)
        email_me.set_upstream(clear_phy_task)

        globals()[dag_id] = dag
Example #29
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 0,
    'start_date': datetime(2019, 1, 1, 0, 0),
    'retry_delay': timedelta(minutes=5)  # airflow will run next quarter
}

dag = DAG('onemortgage_monthly',
          default_args=airflow_args,
          schedule_interval='@monthly')

now = datetime.now()

download_fannie = BashOperator(
    task_id='download_fannie',
    bash_command="/home/ubuntu/OneMortgage/src/bash/download_fannie.sh",
    dag=dag)

download_freddie = BashOperator(
    task_id='download_freddie',
    bash_command='/home/ubuntu/OneMortgage/src/bash/download_freddie.sh',
    dag=dag)

process_data = BashOperator(
    task_id='process_data',
    bash_command='/home/ubuntu/OneMortgage/src/bash/run_batch.sh',
    dag=dag)

process_data.set_upstream([download_freddie, download_fannie])
Example #30
    task_id='ods_load_batch_1',
    bash_command=script_folder +
    'operational_reporting_location_manager/ods_load_batch_1.sh;',
    dag=dag)
t2 = BashOperator(
    task_id='ods_load_batch_2',
    bash_command=script_folder +
    'operational_reporting_location_manager/ods_load_batch_2.sh;',
    dag=dag)
t3 = BashOperator(
    task_id='ods_load_batch_3',
    bash_command=script_folder +
    'operational_reporting_location_manager/ods_load_batch_3.sh;',
    dag=dag)
t4 = BashOperator(
    task_id='ods_load_batch_4',
    bash_command=script_folder +
    'operational_reporting_location_manager/ods_load_batch_4.sh;',
    dag=dag)
t5 = BashOperator(
    task_id='verify_load',
    bash_command=script_folder +
    'ods_archiving/checkDailyLoad.sh operational_reporting_location_manager;',
    dag=dag,
    trigger_rule=TR.ALL_DONE)
t5.set_upstream(t0)
t5.set_upstream(t1)
t5.set_upstream(t2)
t5.set_upstream(t3)
t5.set_upstream(t4)
Example #31
            params={'rasterdir': RASTER_DIR},
            on_success_callback=lambda c: set_perms(c['params']['rasterdir'],
                                                    default_args['owner']),
            dag=dag)

        make_raster_task = BashOperator(task_id='make_rasters',
                                        bash_command=make_raster_cmd,
                                        env={'PATH': ANACONDA_PATH},
                                        params={
                                            'postphydir': POSTPHY_DIR,
                                            'ecanalysispath': ECANALYSIS_PATH,
                                            'rasterdir': RASTER_DIR
                                        },
                                        dag=dag)

        ############ Report Completion
        email_me = EmailOperator(task_id='email_me',
                                 to=default_args['email'],
                                 subject='%s is merged' % dag_id,
                                 html_content='You may commence analysis.',
                                 dag=dag)

        rsync_task.set_upstream(make_postphy_dir_task)
        merge_events_task.set_upstream(rsync_task)
        kwik2pandas_task.set_upstream(merge_events_task)
        email_me.set_upstream(kwik2pandas_task)
        make_raster_dir_task.set_upstream(kwik2pandas_task)
        make_raster_task.set_upstream(make_raster_dir_task)

        globals()[dag_id] = dag
Example #32
    'start_date': datetime(2016, 4, 29),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('dag', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
requirements = BashOperator(task_id='requirements',
                            bash_command='pip install -r requirements.txt',
                            dag=dag)

data = BashOperator(task_id='data',
                    bash_command='python src/make_dataset.py',
                    dag=dag)

clean = BashOperator(task_id='clean',
                     bash_command='find . -name "*.pyc" -exec rm {} \;',
                     dag=dag)

lint = BashOperator(task_id='flake8', bash_command='flake8 .', dag=dag)

data.set_upstream(requirements)
Example #33
}

dag = DAG('ldp', schedule_interval='0 10 * * *', default_args=default_args)

# -------------- gpsNo, deviceNo mapping begin --------------
gps_contrast = BashOperator(
    task_id='gpsnoContrastJob',
    bash_command='echo "nvr gpsNo deviceNo contrast..." | sleep 5',
    dag=dag)

gps_contrast_kmx = BashOperator(
    task_id='gpsContrastJob',
    bash_command='echo "nvr gpsNo template by kmx..." | sleep 5',
    dag=dag)

gps_contrast_kmx.set_upstream(gps_contrast)
# -------------- gpsNo, deviceNo mapping end --------------

# -------------- ldp data begin --------------
chan_net_list = BashOperator(
    task_id='lgChannetListJob',
    bash_command='echo "ldp chan_net_list..." | sleep 5',
    dag=dag)

chan_net_list.set_upstream(gps_contrast_kmx)

city_list = BashOperator(task_id='lgCityListJob',
                         bash_command='echo "ldp city_list..." | sleep 5',
                         dag=dag)

city_list.set_upstream(gps_contrast_kmx)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval= '@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello world" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key='*',
    wildcard_match=True,
    bucket_name='airflow-input-sprite',
    s3_conn_id='aws_default',
    timeout=18*60*60,
    poke_interval=120,
    dag=dag)

t1.set_upstream(sensor)
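# bash_test runs only after the sensor finds at least one key matching '*' in the 'airflow-input-sprite' bucket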

def my_py_command(ds, **kwargs):
    # Print out the "foo" param passed in via
    # `airflow test example_passing_params_via_test_command run_this <date>
    # -tp '{"foo":"bar"}'`
    if kwargs["test_mode"]:
        print(" 'foo' was passed in via test={} command : kwargs[params][foo] \
               = {}".format(kwargs["test_mode"], kwargs["params"]["foo"]))
    # Print out the value of "miff", passed in below via the Python Operator
    print(" 'miff' was passed in via task params = {}".format(
        kwargs["params"]["miff"]))
    return 1


my_templated_command = """
    echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
    echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(task_id='run_this',
                          provide_context=True,
                          python_callable=my_py_command,
                          params={"miff": "agg"},
                          dag=dag)

also_run_this = BashOperator(task_id='also_run_this',
                             bash_command=my_templated_command,
                             params={"miff": "agg"},
                             dag=dag)
also_run_this.set_upstream(run_this)
Example #36
    dag=dag)

t3 = BashOperator(
    task_id='check_running_stat_job',
    bash_command='echo "Task 3"',
    dag=dag)
	
t4 = BashOperator(
    task_id='spark_etl',
    bash_command='exit 123',
    dag=dag)
	
t5 = BashOperator(
    task_id='update_orc_file_variable',
    bash_command='echo "Task 5"',
    dag=dag)
	
t6 = BashOperator(
    task_id='remove_conf_file',
    bash_command='echo "Task 6"',
    dag=dag,
    trigger_rule='all_done')

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
t6.set_upstream(t2)
t6.set_upstream(t3)
t6.set_upstream(t4)
t6.set_upstream(t5)
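# note: t4 ('spark_etl') exits with a non-zero status, so t5 will not run;
# t6 still runs because its trigger_rule is 'all_done'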
def export_to_gcp_dag(_sub_dag_id, _schedule_interval, _queue, _default_args,
                      _export_hql_dict, _params, _dataproc_config):
    dag = DAG(dag_id=_sub_dag_id,
              schedule_interval=_schedule_interval,
              default_args=_default_args)

    export_data_hql = _export_hql_dict["export_data_hql"]
    add_dataproc_partition_hql = _export_hql_dict["add_dataproc_partition_hql"]
    drop_tmp_table_hql = _export_hql_dict["drop_tmp_table_hql"]

    def gen_date_str_nodash(execution_date, days_delta=0, hours_delta=0):
        from pytz import timezone, utc
        from datetime import datetime, timedelta

        pacific_timezone = "US/Pacific"
        date = utc.localize(execution_date)
        date = date.astimezone(timezone(pacific_timezone))

        if days_delta:
            date += timedelta(days=days_delta)

        if hours_delta:
            date += timedelta(hours=hours_delta)

        return date.strftime("%Y%m%d")

    _params.update({"gen_date_str_nodash": gen_date_str_nodash})

    export_data = HiveOperator(task_id="export_data",
                               hql=export_data_hql,
                               params=_params,
                               queue=_queue,
                               dag=dag)

    add_dataproc_partition = DataProcHiveOperator(
        task_id="add_dataproc_partition",
        dataproc_cluster=_dataproc_config["dataproc_cluster"],
        region=_dataproc_config["region"],
        gcp_conn_id=_dataproc_config["gcp_conn_id"],
        query=add_dataproc_partition_hql,
        params=_params,
        queue=_queue,
        dag=dag)

    drop_tmp_table = HiveOperator(task_id="drop_tmp_table",
                                  hql=drop_tmp_table_hql,
                                  params=_params,
                                  queue=_queue,
                                  dag=dag)

    add_dataproc_partition.set_upstream(export_data)
    drop_tmp_table.set_upstream(add_dataproc_partition)

    if _params.get("stamp_file_path", None) is not None:
        gcp_conf_true = 'google.cloud.auth.service.account.enable=true'
        gcp_conf_keyfile = 'google.cloud.auth.service.account.json.keyfile={{ params.gcp_keyfile }}'

        add_success_stamp_file = BashOperator(
            task_id="add_success_stamp_file",
            bash_command="hadoop fs -D " + gcp_conf_true + " -D " +
            gcp_conf_keyfile + " -touchz " +
            _params.get("stamp_file_path", None),
            params=_params,
            queue=_queue,
            dag=dag)
        add_success_stamp_file.set_upstream(drop_tmp_table)

    return dag
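A hypothetical invocation of export_to_gcp_dag (every value below is illustrative; only the argument names and dictionary keys come from the function above):
from datetime import datetime

example_export_dag = export_to_gcp_dag(
    _sub_dag_id="export_to_gcp_example",
    _schedule_interval="@daily",
    _queue="default",
    _default_args={"owner": "airflow", "start_date": datetime(2019, 1, 1)},
    _export_hql_dict={
        "export_data_hql": "-- export query here",
        "add_dataproc_partition_hql": "-- add-partition query here",
        "drop_tmp_table_hql": "-- drop-temp-table query here",
    },
    _params={"gcp_keyfile": "/path/to/keyfile.json"},  # add "stamp_file_path" to also create the stamp-file task
    _dataproc_config={
        "dataproc_cluster": "example-cluster",
        "region": "us-central1",
        "gcp_conn_id": "google_cloud_default",
    })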
        # tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    # Delete the GCS files in the timestamped transformed folder.
    delete_transformed_files = BashOperator(
        task_id='delete_transformed_files',
        bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" +
        "/{{ dag_run.conf['transformed_path'] }}/")

    # If the spark job or BQ Load fails we rename the timestamped raw path to
    # a timestamped failed path.
    move_failed_files = BashOperator(
        task_id='move_failed_files',
        bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" +
        "/{{ dag_run.conf['raw_path'] }}/ " +
        "gs://{{ var.value.gcs_bucket}}" +
        "/{{ dag_run.conf['failed_path'] }}/",
        trigger_rule=TriggerRule.ONE_FAILED)
    # Set the dag property on the first operator; it will be inherited by
    # downstream operators.

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream([delete_cluster, bq_load])

    bq_load.set_downstream(delete_transformed_files)

    move_failed_files.set_upstream([bq_load, submit_pyspark])
Example #39
]
# copy table to bi
#bitables = ['hardware', 'hardwareios']
bitables = []

for table in tables:
    imp = BashOperator(
        task_id='import_' + table,
        bash_command=
        '/disk1/bdl/etl/ETL/imp_mongo_doc_with_date_input.sh {table} {begin} {end} > /disk1/bdl/etl/ETL/log/{table}.log '
        .format(table=table, begin='{{ ds }}', end='{{ tomorrow_ds }}'),
        dag=dag)
    if table in bitables:
        bimp = BashOperator(
            task_id='send_2_bi_' + table,
            bash_command=
            '/disk1/bdl/etl/ETL/send_bi_impala_with_date_input.sh {table} {begin} {end}  > /disk1/bdl/etl/ETL/log/BI/{table}.log '
            .format(table=table, begin='{{ ds }}', end='{{ tomorrow_ds }}'),
            dag=dag)
        bimp.set_upstream(imp)
        esucc.set_upstream(bimp)
    else:
        esucc.set_upstream(imp)

imp_software = BashOperator(
    task_id='import_software',
    bash_command=
    '/disk1/bdl/etl/ETL/imp_software_doc_with_date_input.sh {{ ds }} {{ tomorrow_ds }} > /disk1/bdl/etl/ETL/log/software.log ',
    dag=dag)
esucc.set_upstream(imp_software)
Example #40
"""
Documentation that goes along with the Airflow tutorial located
[here](http://pythonhosted.org/airflow/tutorial.html)
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta
from settings import default_args

dag = DAG('reddit_comments', default_args=default_args, schedule_interval=timedelta(2))


pre = BashOperator(
    task_id='setup',
    bash_command='tasks/setup_dirs.sh',
    depends_on_past=False,
    dag=dag)

t1 = BashOperator(
    task_id='ngrams_batch',
    bash_command='tasks/run_ngrams_batch.sh',
    depends_on_past=False,
    dag=dag)

t2 = BashOperator(
    task_id='ngrams_optimize',
    depends_on_past=True,
    bash_command='tasks/optimize_ngrams.sh',
    dag=dag)

t1.set_upstream(pre)
t2.set_upstream(t1)
Example #41
}


# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)


get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)


set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
Example #42
			n = n + '_'
		dic[n] = c 
	return dic 

command_dict = dict_from_cmd_list(bash_commands)
tasks = []






for n,c in command_dict.iteritems(): 
	task = BashOperator(task_id=n, bash_command=c, dag=dag, pool='default')
	if len(tasks) > 0: 
		task.set_upstream(tasks[-1])
	tasks.append(task)
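# each task is chained to the previously created one, so the commands run strictly in the
# dictionary's iteration order (iteritems() is Python 2 syntax)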



job = DagExecutionJob(dag)





# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

# def push_by_returning(**kwargs):
Example #43
#import the required libraries
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators import BashOperator

#defining the default arguments dictionary
args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 12, 2),  #you can change this start_date
    'retries': 1,
    "retry_delay": timedelta(seconds=10),
}

dag = DAG('Assignment_1', default_args=args)

#task1 creates a directory 'test_dir' inside the ~/outputs folder
task1 = BashOperator(task_id='create_directory',
                     bash_command='mkdir -p ~/outputs/test_dir',
                     dag=dag)

#task2 is to get the 'shasum' of 'test_dir' directory
task2 = BashOperator(task_id='get_shasum',
                     bash_command='shasum ~/outputs/test_dir',
                     dag=dag)

#below we are setting up the operator relationship so that task1 runs before task2
task2.set_upstream(task1)
Example #44
}

dag = DAG('monitoring_etl',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

mad = BashOperator(task_id='maas_accounts_download',
                   bash_command=RUN_AS,
                   params={'cmd': 'cron-mad.sh'},
                   dag=dag)

mal = BashOperator(task_id='maas_accounts_load',
                   bash_command=RUN_AS,
                   params={'cmd': 'cron-mal.sh'},
                   dag=dag)
mal.set_upstream(mad)

mmd = BashOperator(task_id='maas_metrics_download',
                   bash_command=RUN_AS,
                   params={
                       'cmd': 'download_from_cf.sh',
                       'path': '/home/maas/maas_report'
                   },
                   dag=dag)
mmd.set_upstream(mal)

mml = BashOperator(task_id='maas_metrics_load',
                   bash_command=RUN_AS,
                   params={'cmd': 'cron-mml.sh'},
                   dag=dag)
mml.set_upstream(mmd)
Example #45
branching = BranchPythonOperator(task_id='branching',
                                 python_callable=lambda: 'source_count'
                                 if datetime.now().day <= 7 and datetime.today(
                                 ).weekday() == 6 else 'ignore_not_sunday',
                                 dag=dag)
branching.set_upstream(run_this_first)
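# the lambda above follows the 'source_count' branch only on the first Sunday of the month
# (day <= 7 and weekday() == 6); otherwise it branches to 'ignore_not_sunday'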

esucc = EmailOperator(task_id='email_success_' + dag.dag_id,
                      to=email_addr,
                      subject=dag.dag_id + ' [success] on ' +
                      datetime.now().strftime('%Y-%m-%d'),
                      html_content='Congratulation!',
                      trigger_rule='all_success',
                      dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='/disk1/source_data_count; ./daily_table_count.sh > out.log ',
    dag=dag)

source_count.set_upstream(branching)
esucc.set_upstream(source_count)

ignore_not_sunday = DummyOperator(task_id='ignore_not_sunday', dag=dag)
ignore_not_sunday.set_upstream(branching)

join = DummyOperator(task_id='join', trigger_rule='all_success', dag=dag)
join << ignore_not_sunday
join << esucc
Example #46
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('tutorial', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

t2 = BashOperator(task_id='sleep', bash_command='sleep 5', retries=3, dag=dag)

templated_command = """
    {% for i in range(5) %}
        echo "{{ ds }}"
        echo "{{ macros.ds_add(ds, 7)}}"
        echo "{{ params.my_param }}"
    {% endfor %}
"""

t3 = BashOperator(task_id='templated',
                  bash_command=templated_command,
                  params={'my_param': 'Parameter I passed in'},
                  dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
Example #47
from airflow.operators import BashOperator, MySqlOperator
from airflow.models import DAG
from datetime import datetime

default_args = {
    'owner': 'max',
    'start_date': datetime(2014, 9, 1),
    'mysql_dbid': 'local_mysql',
}

dag = DAG(dag_id='example_3')

run_this = BashOperator(
        task_id='also_run_this', bash_command='ls -l', **default_args)
dag.add_task(run_this)

for i in range(5):
    i = str(i)
    task = BashOperator(
            task_id='runme_'+i, 
            bash_command='sleep {{ 10 + macros.random() * 10 }}', 
            **default_args)
    task.set_upstream(run_this)
    dag.add_task(task)
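For comparison, the same wiring in the dag=dag style used by most of the earlier examples (a sketch reusing the imports above; the operator-specific 'mysql_dbid' key is left out of the DAG-level default_args):
dag = DAG(dag_id='example_3_alt',
          default_args={'owner': 'max', 'start_date': datetime(2014, 9, 1)})

run_this = BashOperator(task_id='also_run_this', bash_command='ls -l', dag=dag)

for i in range(5):
    task = BashOperator(
        task_id='runme_' + str(i),
        bash_command='sleep {{ 10 + macros.random() * 10 }}',
        dag=dag)
    task.set_upstream(run_this)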