chem2_pdb_prot2_pdb = BashOperator(
    task_id='chem2_pdb_prot2_pdb',
    bash_command="(cd /working-directory; virtualScreening.py -l chem2.pdb -o result -p prot2.pdb) ",
    dag=dag)

chem2_pdb_prot2_pdb_success_mail = EmailOperator(
    task_id="chem2_pdb_prot2_pdb_success_mail",
    to=[u'*****@*****.**'],
    subject="chem2_pdb_prot2_pdb success",
    html_content="chem2_pdb_prot2_pdb success",
    dag=dag)
chem2_pdb_prot2_pdb_success_mail.set_upstream(chem2_pdb_prot2_pdb)
#chem2_pdb_prot2_pdb.set_upstream( )

chem2_pdb_prot3_pdb = BashOperator(
    task_id='chem2_pdb_prot3_pdb',
    bash_command="(cd /working-directory; virtualScreening.py -l chem2.pdb -o result -p prot3.pdb) ",
    dag=dag)

chem2_pdb_prot3_pdb_success_mail = EmailOperator(
    task_id="chem2_pdb_prot3_pdb_success_mail",
    to=[u'*****@*****.**'],
    subject="chem2_pdb_prot3_pdb success",
    html_content="chem2_pdb_prot3_pdb success",
    dag=dag)
chem2_pdb_prot3_pdb_success_mail.set_upstream(chem2_pdb_prot3_pdb)
chem2_pdb_prot3_pdb.set_upstream(chem1_pdb_prot1_pdb)
t1 = PythonOperator(
    task_id='clear_scrape_folder',
    python_callable=clear_folder,
    dag=dag)
# TODO properly import python classes

t2 = BashOperator(
    task_id='scrape_profile_images',
    bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper),
    dag=dag)

t3 = PythonOperator(
    task_id='scrape_progress',
    python_callable=print_scrape_in_progress,
    dag=dag)

t4 = BashOperator(
    task_id='create_landmarks',
    bash_command='cd {} && python landmark.py'.format(averageface_path),
    dag=dag)

t5 = BashOperator(
    task_id='create_average_face',
    bash_command='cd {} && python averageface.py'.format(averageface_path),
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
""" Executing tasks at a particular time of the day using sensor operator. """ from airflow import DAG from airflow.operators import BashOperator, TimeSensor from datetime import datetime, timedelta, time default_args = {"owner": "Samarth", "start_date": datetime(2016, 03, 15, 12)} # "schedule_interval" is your cron expression you can write any cron expression like unix cron. dag = DAG("airflow_task_with_time_sensor", default_args=default_args, schedule_interval="1 * * * *") bash_task = BashOperator( task_id="dependency_for_sensor", bash_command='echo "Sensor would only be enabled after I am done!"', dag=dag ) # Sensor operator takes "target_time" which is a specific time in a day irrespective of date/day. # Sensor is executed once the target time has passed. In this case after 10:55 at morning. sensor_task = TimeSensor(task_id="sensor_task", target_time=time(10, 55, 1, 1), dag=dag) post_sensor_task = BashOperator( task_id="post_sensor_task", bash_command='echo "I am done, it means sensor has done its job."', dag=dag ) # Setting up the correct dependencies for defined tasks. sensor_task.set_upstream(bash_task) post_sensor_task.set_upstream(sensor_task)
        'mansortdir': MANSORT_DIR},
    dag=dag)

email_me = EmailOperator(
    task_id='email_me',
    to=default_args['email'],
    subject='%s is complete' % dag_id,
    html_content='You may now manually sort on NIAO',
    dag=dag)

slack_it = SlackAPIPostOperator(
    task_id='slack_it',
    token=SLACK_TOKEN,
    text='%s is complete' % dag_id,
    channel='#ephys',
    dag=dag)

make_kwd_task.set_upstream(make_klusta_dir_task)
phy_task.set_upstream(make_kwd_task)
#merge_events_task.set_upstream(phy_task)
clear_phy_task.set_upstream(phy_task)
make_kwik_bak_dir_task.set_upstream(phy_task)
mv_kwik_bak_task.set_upstream(make_kwik_bak_dir_task)
#rsync_task.set_upstream(merge_events_task)
rsync_task.set_upstream(clear_phy_task)
rsync_task.set_upstream(mv_kwik_bak_task)
email_me.set_upstream(rsync_task)
slack_it.set_upstream(rsync_task)

globals()[dag_id] = dag
    bash_command=create_command7,
    dag=dag)

# drop and create BigQuery table if exists
create_command8 = 'su {{ params.username }} -c "/home/hduser/dba/bin/python/etl_python_oracle_to_aerospike_and_GCP.ksh -O 5 > "/d4T/hduser/airflow/run_logs/t8_etl_python_oracle_to_aerospike_and_GCP_drop_and_create_BigQuery_table"_"`date +%Y%m%d_%H%M`"."log" 2>&1 "'
t8 = BashOperator(
    task_id='t8_drop_and_create_BigQuery_table_if_exists',
    bash_command=create_command8,
    dag=dag)

# Load data into BigQuery table from csv file
create_command9 = 'su {{ params.username }} -c "/home/hduser/dba/bin/python/etl_python_oracle_to_aerospike_and_GCP.ksh -O 6 > "/d4T/hduser/airflow/run_logs/t9_etl_python_oracle_to_aerospike_and_GCP_load_data_into_BigQuery_table_from_CSV_file"_"`date +%Y%m%d_%H%M`"."log" 2>&1 "'
t9 = BashOperator(
    task_id='t9_Load_data_into_BigQuery_table_from_csv_file',
    bash_command=create_command9,
    dag=dag)

# Read data from BigQuery table
create_command10 = 'su {{ params.username }} -c "/home/hduser/dba/bin/python/etl_python_oracle_to_aerospike_and_GCP.ksh -O 7 > "/d4T/hduser/airflow/run_logs/t10_etl_python_oracle_to_aerospike_and_GCP_Read_data_from_BigQuery_table"_"`date +%Y%m%d_%H%M`"."log" 2>&1 "'
t10 = BashOperator(
    task_id='t10_Read_data_from_BigQuery_table',
    bash_command=create_command10,
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t1)
t5.set_upstream(t4)
t6.set_upstream(t5)
t7.set_upstream(t1)
t8.set_upstream(t7)
t9.set_upstream(t8)
t10.set_upstream(t9)
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('HelloWorld4', default_args=default_args)

# t1, t2, t3 and t4 are examples of tasks created using operators
t1 = BashOperator(task_id='task_1', bash_command='echo "Hello World from Task 1"', dag=dag)
t2 = BashOperator(task_id='task_2', bash_command='echo "Hello World from Task 2"', dag=dag)
t3 = BashOperator(task_id='task_3', bash_command='echo "Hello World from Task 3"', dag=dag)
t4 = BashOperator(task_id='task_4', bash_command='echo "Hello World from Task 4"', dag=dag)

# t1 fans out to t2 and t3, which both feed t4
t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t2)
t4.set_upstream(t3)
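# The same fan-out/fan-in graph can also be wired with Airflow's bitshift
# operators; a minimal equivalent, assuming a release recent enough to accept
# lists on either side of >> (Airflow 1.10+):
t1 >> [t2, t3] >> t4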
default_args = {
    'owner': 'Samarth',
    'start_date': datetime(2016, 3, 15, 12),
}

# "schedule_interval" is a cron expression; any standard Unix cron expression works.
dag = DAG('airflow_task_with_hdfs_sensor',
          default_args=default_args,
          schedule_interval="1 * * * *")

bash_task = BashOperator(
    task_id='dependency_for_hdfs_sensor',
    bash_command='echo "HDFS sensor would only be enabled after I am done!"',
    dag=dag)

# The sensor operator takes "filepath" and checks whether that file is present in HDFS.
# "hdfs_conn_id" is configured in the UI under Admin --> Connections.
hdfs_sensor_task = HdfsSensor(
    task_id='hdfs_sensor_task',
    filepath='/user/samarthg/input2',
    hdfs_conn_id='webhdfs_default',
    dag=dag)

post_hdfs_sensor_task = BashOperator(
    task_id='post_hdfs_sensor_task',
    bash_command='echo "I am done, it means sensor has done its job."',
    dag=dag)

# Setting up the correct dependencies for the defined tasks.
hdfs_sensor_task.set_upstream(bash_task)
post_hdfs_sensor_task.set_upstream(hdfs_sensor_task)
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
    python_callable=DataPutter(),
    dag=dag)

delete_object = PythonOperator(
    task_id='delete-object-from-s3',
    python_callable=DeleteObject(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_file,
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag)

get_file.set_upstream(put_file)
hello_world_docker_write_logs.set_upstream(get_file)
check_read_logs.set_upstream(hello_world_docker_write_logs)
cleanup.set_upstream(check_read_logs)
cleanup.set_upstream(get_file)
delete_object.set_upstream(get_file)
def my_py_command(ds, **kwargs):
    # Print out the "foo" param passed in via
    # `airflow test example_passing_params_via_test_command run_this <date>
    # -tp '{"foo":"bar"}'`
    if kwargs["test_mode"]:
        print(" 'foo' was passed in via test={} command : kwargs[params][foo] = {}".format(
            kwargs["test_mode"], kwargs["params"]["foo"]))
    # Print out the value of "miff", passed in below via the Python Operator
    print(" 'miff' was passed in via task params = {}".format(
        kwargs["params"]["miff"]))
    return 1

my_templated_command = """
echo " 'foo was passed in via Airflow CLI Test command with value {{ params.foo }} "
echo " 'miff was passed in via BashOperator with value {{ params.miff }} "
"""

run_this = PythonOperator(
    task_id='run_this',
    provide_context=True,
    python_callable=my_py_command,
    params={"miff": "agg"},
    dag=dag)

also_run_this = BashOperator(
    task_id='also_run_this',
    bash_command=my_templated_command,
    params={"miff": "agg"},
    dag=dag)

also_run_this.set_upstream(run_this)
local_dir = "/tmp/" # define the location where you want to store in HDFS hdfs_dir = " /tmp/" for channel in to_channels: file_name = "to_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs", bash_command="HADOOP_USER_NAME=hdfs hadoop fs -put -f " + local_dir + file_name + hdfs_dir + channel + "/", dag=dag) load_to_hdfs.set_upstream(analyze_tweets) load_to_hive = HiveOperator( task_id="load_" + channel + "_to_hive", hql="LOAD DATA INPATH '" + hdfs_dir + channel + "/" + file_name + "' " "INTO TABLE " + channel + " " "PARTITION(dt='" + dt + "')", dag=dag) load_to_hive.set_upstream(load_to_hdfs) load_to_hive.set_downstream(hive_to_mysql) for channel in from_channels: file_name = "from_" + channel + "_" + yesterday.strftime("%Y-%m-%d") + ".csv" load_to_hdfs = BashOperator( task_id="put_" + channel + "_to_hdfs",
    params={'rasterdir': RASTER_DIR},
    on_success_callback=lambda c: set_perms(c['params']['rasterdir'],
                                            default_args['owner']),
    dag=dag)

make_raster_task = BashOperator(
    task_id='make_rasters',
    bash_command=make_raster_cmd,
    env={'PATH': ANACONDA_PATH},
    params={'postphydir': POSTPHY_DIR,
            'ecanalysispath': ECANALYSIS_PATH,
            'rasterdir': RASTER_DIR},
    dag=dag)

############ Report Completion
email_me = EmailOperator(
    task_id='email_me',
    to=default_args['email'],
    subject='%s is merged' % dag_id,
    html_content='You may commence analysis.',
    dag=dag)

rsync_task.set_upstream(make_postphy_dir_task)
merge_events_task.set_upstream(rsync_task)
kwik2pandas_task.set_upstream(merge_events_task)
email_me.set_upstream(kwik2pandas_task)
make_raster_dir_task.set_upstream(kwik2pandas_task)
make_raster_task.set_upstream(make_raster_dir_task)

globals()[dag_id] = dag
aml_utils = load_source(
    'aml_utils',
    "{pf}/asiamiles_airflow_extensions/utils.py".format(
        pf=configuration.get('core', 'plugins_folder')))

mod_config = aml_utils.load_config(
    "{dag_folder}/pros_etl.cfg".format(
        dag_folder=dirname(realpath(__file__))))

hdfs_home = mod_config['hadoop']['hdfs_home']

copy_rsynced_files_to_hadoop = BashOperator(
    task_id="copy_rsynced_files_to_hadoop",
    bash_command="hadoop fs -put -f /data1/staging/pros/* pros",
    dag=dag)

#spark-shell --master yarn-client
update_seat_idx = BashOperator(
    task_id="update_seat_idx",
    bash_command="cat /data1/airflow/dags/pros-etl/pros_seat_index_hist_load.scala | spark-shell --master yarn-client",
    dag=dag)

update_curve = BashOperator(
    task_id="update_curve",
    bash_command="cat /data1/airflow/dags/pros-etl/pros_bid_price_hist_load.scala | spark-shell --master yarn-client",
    dag=dag)

update_seat_idx.set_upstream(copy_rsynced_files_to_hadoop)
update_curve.set_upstream(copy_rsynced_files_to_hadoop)
    dag=dag)

t1bis = BashOperator(
    task_id='send_to_linkproxy_weekly_catalog',
    bash_command='su datamanufactory -c "cd /srv/datamanufactory/data-workflow/ && /anaconda3/bin/python 1bis_send_catalog_to_linkproxy.py run"',
    dag=dag)

t2 = BashOperator(
    task_id='wait_webhook_to_hook',
    bash_command='su datamanufactory -c "sleep 9200"',
    dag=dag)

t3 = BashOperator(
    task_id='csv_detective_analysis',
    bash_command='su datamanufactory -c "source /anaconda3/etc/profile.d/conda.sh && cd /srv/datamanufactory/data-workflow/ && conda activate csvdeploy && python 3_csv_detective_analysis.py run"',
    dag=dag)

t4 = BashOperator(
    task_id='send_metadata_to_elk',
    bash_command='su datamanufactory -c "cd /srv/datamanufactory/data-workflow/ && /anaconda3/bin/python 4_ingest_elk.py"',
    dag=dag)

t1.set_upstream(t0)
t1bis.set_upstream(t0)
t2.set_upstream(t1)
t2.set_upstream(t1bis)
t3.set_upstream(t2)
t4.set_upstream(t3)
    def poke(self, context):
        # Succeed once the element is no longer present in the list.
        return self.elementName not in elementList

default_dag_args = {
    'start_date': yesterday,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': datetime.timedelta(minutes=5),
}

with models.DAG('CheckListSensor',
                schedule_interval=datetime.timedelta(days=1),
                default_args=default_dag_args) as dag:
    createBucket = BashOperator(task_id='yes_it_exists',
                                bash_command='echo DONE',
                                dag=dag)
    IsBucketExists = ListSensor(task_id='isElementExist',
                                elementName=10,
                                dag=dag)
    createBucket.set_upstream(IsBucketExists)
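# The poke method above belongs to a custom sensor; a minimal sketch of how
# such a ListSensor might be declared in full on Airflow 1.x, assuming
# elementList is a module-level list (the original class body is not shown,
# so the __init__ signature here is an illustrative assumption):
from airflow.operators.sensors import BaseSensorOperator
from airflow.utils.decorators import apply_defaults

class ListSensor(BaseSensorOperator):
    """Sensor that succeeds once elementName is absent from elementList."""

    @apply_defaults
    def __init__(self, elementName, *args, **kwargs):
        super(ListSensor, self).__init__(*args, **kwargs)
        self.elementName = elementName

    def poke(self, context):
        return self.elementName not in elementList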
}

dag = DAG('jarvisworkflow', default_args=default_args)

t1 = BashOperator(task_id='train',
                  bash_command='python3 ' + abspath + '/crawler.py tr',
                  dag=dag)
t2 = BashOperator(task_id='test',
                  bash_command='python3 ' + abspath + '/crawler.py te',
                  dag=dag)

t3 = BashOperator(task_id='cleantr',
                  bash_command='python3 ' + abspath + '/cleaner.py tr',
                  dag=dag)
t3.set_upstream(t1)

t4 = BashOperator(task_id='cleante',
                  bash_command='python3 ' + abspath + '/cleaner.py te',
                  dag=dag)
t4.set_upstream(t2)

t5 = BashOperator(task_id='predict',
                  bash_command='python3 ' + abspath + '/predictor.py',
                  dag=dag)
t5.set_upstream(t3)

t6 = BashOperator(task_id='validate',
                  bash_command='python3 ' + abspath + '/validator.py',
                  dag=dag)
t6.set_upstream([t4, t5])
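# set_upstream accepts a list, so the last line gives t6 two parents at once;
# with bitshift syntax (Airflow 1.10+) the same edges read:
[t4, t5] >> t6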
t3 = BashOperator(
    task_id='cloud-events-usage-load_rawxml',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/backup_cloud_usage_events_rawxml.sh;',
    bash_command=script_folder + 'cloud_events_usage/backup_cloud_usage_events_rawxml.sh;',
    dag=dag)

t4 = BashOperator(
    task_id='cloud-events-usage-load_rawxml1',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/backup_cloud_usage_events_rawxml1.sh;',
    bash_command=script_folder + 'cloud_events_usage/backup_cloud_usage_events_rawxml1.sh;',
    dag=dag)

t5 = BashOperator(
    task_id='cloud-events-usage-load_glance_and_nova',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/backup_glance_and_nova.sh;',
    bash_command=script_folder + 'cloud_events_usage/backup_glance_and_nova.sh;',
    dag=dag)

t6 = BashOperator(
    task_id='cloud-events-usage-verify-load',
    #bash_command='/home/airflow/airflow-jobs/scripts/cloud_events_usage/checkDailyLoad.sh;',
    bash_command=script_folder + 'ods_archiving/checkDailyLoad.sh cloud_usage_events;',
    dag=dag,
    trigger_rule=TR.ALL_DONE)

t6.set_upstream(t1)
t6.set_upstream(t2)
t6.set_upstream(t3)
t6.set_upstream(t4)
t6.set_upstream(t5)
    bash_command='./sqoop_incremental.sh',
    dag=dag
)

# merge the data from the MySQL table into HDFS
task2 = BashOperator(
    task_id='sqoop_merge_import',
    bash_command='./sqoop_merge.sh',
    dag=dag
)

# insert the data from the Hive external table into the target table
task3 = HiveOperator(
    task_id='hive_insert',
    hql='LOAD DATA INPATH "/user/cloudera/employeeprofilemerge" OVERWRITE INTO TABLE employee_profile;',
    depends_on_past=True,
    dag=dag
)

# insert the data, with ssn masked, into the masked target table
task4 = HiveOperator(
    task_id='hive_insert_masked',
    hql='add jar /home/cloudera/Masking.jar;create TEMPORARY function masking as \'Masking\';INSERT OVERWRITE table employee_profile_masked SELECT profile_id,first_name,last_name,modified_date,masking(ssn) FROM employee_profile;',
    depends_on_past=True,
    dag=dag
)

# defining the job dependencies
task2.set_upstream(task1)
task3.set_upstream(task2)
task4.set_upstream(task3)
    'tutorial_mod',
    default_args=default_args,
    schedule_interval=timedelta(1))

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(
    task_id='print_date',
    bash_command='date',
    dag=dag)

t2 = BashOperator(
    task_id='sleep',
    bash_command='sleep 5',
    retries=3,
    dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(
    task_id='templated',
    bash_command=templated_command,
    params={'my_param': 'Parameter I passed in'},
    dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
    schedule_interval=timedelta(2))

# Run Camus to pull messages from Kafka into HDFS
camus_a = BashOperator(
    task_id='camus_a',
    bash_command='tasks/run_camus.sh',
    depends_on_past=1,
    dag=dag)

# Run Spark to sum all historical trades and write to Cassandra
trades_batch_a = BashOperator(
    task_id='trades_batch_a',
    bash_command='tasks/run_trades_batch.sh',
    depends_on_past=1,
    dag=dag)

# run the trades batch after the news batch to give it more memory
trades_batch_a.set_upstream(camus_a)

# Update Cassandra's stream 2 table with counts from the batch run, summed
# with the trades from stock_count_rts1 that arrived after camus_a started
sum_batch_a_rts2 = BashOperator(
    task_id='sum_batch_a_rts2',
    bash_command='tasks/sum_batch_rts2.sh',
    depends_on_past=1,
    dag=dag)
sum_batch_a_rts2.set_upstream(trades_batch_a)

# stop streaming of trades while the database is being updated
stop_trade_stream_a = BashOperator(
    task_id='stop_trade_stream_a',
    bash_command='tasks/stop_trade_stream.sh',
    depends_on_past=1,
    dag=dag)
    'depends_on_past': False,
    'start_date': datetime.combine(today, time(20, 00, 0)) - timedelta(days=1),
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
}

TR = TriggerRule

dag = DAG('usl_ods_v1',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

script_folder = DAGS_FOLDER + '/../scripts/'

t0 = BashOperator(task_id='usldb_ods_incremental1',
                  bash_command=script_folder + 'usl_ods/usldb_ods_incremental1.sh;',
                  dag=dag)
t1 = BashOperator(task_id='usldb_ods_incremental2',
                  bash_command=script_folder + 'usl_ods/usldb_ods_incremental2.sh;',
                  dag=dag)
t2 = BashOperator(task_id='usldb_ods_full_load_all',
                  bash_command=script_folder + 'usl_ods/usldb_ods_full_load_all.sh;',
                  dag=dag)
t3 = BashOperator(task_id='verify_load',
                  bash_command=script_folder + 'ods_archiving/checkDailyLoad.sh usldb_ods;',
                  dag=dag,
                  trigger_rule=TR.ALL_DONE)

t3.set_upstream(t0)
t3.set_upstream(t1)
t3.set_upstream(t2)
    schedule_interval=schedule_interval,
    default_args=default_args,
    max_active_runs=1)

dbt_clone = BashOperator(
    task_id='dbt_clone',
    bash_command='cd ~/project && git fetch --all && git reset --hard origin/master',
    dag=dag)

dbt_deps = BashOperator(
    task_id='dbt_deps',
    bash_command='cd ~/project && dbt deps --profile=warehouse --target=prod',
    dag=dag)
dbt_deps.set_upstream(dbt_clone)

dbt_clean = BashOperator(
    task_id='dbt_clean',
    bash_command='cd ~/project && dbt clean --profile=warehouse --target=prod',
    dag=dag)
dbt_clean.set_upstream(dbt_deps)

dbt_archive = BashOperator(
    task_id='dbt_archive',
    bash_command='cd ~/project && dbt archive --profile=warehouse --target=prod',
    dag=dag)
dbt_archive.set_upstream(dbt_clean)
    default_args=default_args,
    schedule_interval="0 * * * *",
    start_date=datetime.now() - timedelta(minutes=10))

c = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_parse_process.sh
"""
c1 = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_ftp_process.sh
"""
c2 = """
sh $ATNI_REPO/shell_scripts/zteumts/run_zteumts_post_parse_process.sh
"""

pullFiles = BashOperator(task_id='zteumts--pullFiles', bash_command=c1, dag=dag)
parseFiles = BashOperator(task_id='zteumts--parseFiles', bash_command=c, dag=dag)
moveFilesToDatalake = BashOperator(task_id='zteumts--moveFilesToDataLake', bash_command=c2, dag=dag)

parseFiles.set_upstream(pullFiles)
moveFilesToDatalake.set_upstream(parseFiles)
# contents of the file_path_sp directory
file_names_sp = os.listdir(file_path_sp)

# build the ul file path
file_path_ul = task_name_ul + '/' + option + '/bin/'
# contents of the file_path_ul directory
file_names_ul = os.listdir(file_path_ul)

# loop over each sp file name
for file_name_sp in file_names_sp:
    if os.path.isfile(file_path_sp + file_name_sp):
        task_option_path = '/usr/bin/perl ' + task_name_sp + '/' + option + '/bin/' + file_name_sp + ' '
        task_option_path_ld = '/usr/bin/perl ' + task_name_ld + '/' + option + '/bin/StructuralLoad.pl '
        t = BashOperator(task_id='LD_' + option,
                         bash_command=task_option_path_ld,
                         dag=dag)
        t.set_upstream(branching)
        dummy_follow = BashOperator(task_id='SP_' + option,
                                    bash_command=task_option_path,
                                    dag=dag)
        t.set_downstream(dummy_follow)
        dummy_follow.set_downstream(join)

# loop over each ul file name
for file_name_ul in file_names_ul:
    if os.path.isfile(file_path_ul + file_name_ul):
        task_option_path = '/usr/bin/perl ' + task_name_ul + '/' + option + '/bin/' + file_name_ul + ' '
        dummy_follow = BashOperator(task_id='UL_' + option,
                                    bash_command=task_option_path,
                                    dag=dag)
        dummy_follow.set_upstream(branching)
        dummy_follow.set_downstream(join)
# rsync_task = BashOperator(
#     task_id='rsync',
#     bash_command=as_user(rsync_command, USER),
#     params={'klustadir': KLUSTA_DIR,
#             'mansortdir': MANSORT_DIR,
#             'mansorthost': MANSORT_HOST},
#     dag=dag)

email_me = EmailOperator(
    task_id='email_me',
    to=default_args['email'],
    subject='%s is complete' % dag_id,
    html_content='You may now manually sort on NIAO',
    dag=dag)

make_kwd_task.set_upstream(make_klusta_dir_task)
phy_task.set_upstream(make_kwd_task)
#merge_events_task.set_upstream(phy_task)
clear_phy_task.set_upstream(phy_task)
make_kwik_bak_dir_task.set_upstream(phy_task)
mv_kwik_bak_task.set_upstream(make_kwik_bak_dir_task)
# make_mansort_dir_task.set_upstream(phy_task)
# rsync_task.set_upstream(clear_phy_task)
# rsync_task.set_upstream(mv_kwik_bak_task)
# rsync_task.set_upstream(make_mansort_dir_task)
# email_me.set_upstream(rsync_task)
email_me.set_upstream(mv_kwik_bak_task)
email_me.set_upstream(clear_phy_task)

globals()[dag_id] = dag
    'owner': 'airflow',
    'depends_on_past': False,
    'retries': 0,
    'start_date': datetime(2019, 1, 1, 0, 0),
    'retry_delay': timedelta(minutes=5)  # airflow will run next quarter
}

dag = DAG('onemortgage_monthly',
          default_args=airflow_args,
          schedule_interval='@monthly')

now = datetime.now()

download_fannie = BashOperator(
    task_id='download_fannie',
    bash_command="/home/ubuntu/OneMortgage/src/bash/download_fannie.sh",
    dag=dag)

download_freddie = BashOperator(
    task_id='download_freddie',
    bash_command='/home/ubuntu/OneMortgage/src/bash/download_freddie.sh',
    dag=dag)

process_data = BashOperator(
    task_id='process_data',
    bash_command='/home/ubuntu/OneMortgage/src/bash/run_batch.sh',
    dag=dag)

process_data.set_upstream([download_freddie, download_fannie])
    task_id='ods_load_batch_1',
    bash_command=script_folder + 'operational_reporting_location_manager/ods_load_batch_1.sh;',
    dag=dag)

t2 = BashOperator(
    task_id='ods_load_batch_2',
    bash_command=script_folder + 'operational_reporting_location_manager/ods_load_batch_2.sh;',
    dag=dag)

t3 = BashOperator(
    task_id='ods_load_batch_3',
    bash_command=script_folder + 'operational_reporting_location_manager/ods_load_batch_3.sh;',
    dag=dag)

t4 = BashOperator(
    task_id='ods_load_batch_4',
    bash_command=script_folder + 'operational_reporting_location_manager/ods_load_batch_4.sh;',
    dag=dag)

t5 = BashOperator(
    task_id='verify_load',
    bash_command=script_folder + 'ods_archiving/checkDailyLoad.sh operational_reporting_location_manager;',
    dag=dag,
    trigger_rule=TR.ALL_DONE)

t5.set_upstream(t0)
t5.set_upstream(t1)
t5.set_upstream(t2)
t5.set_upstream(t3)
t5.set_upstream(t4)
    'start_date': datetime(2016, 4, 29),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 2,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('dag', default_args=default_args)

# requirements, data, clean and lint are examples of tasks created by instantiating operators
requirements = BashOperator(task_id='requirements',
                            bash_command='pip install -r requirements.txt',
                            dag=dag)
data = BashOperator(task_id='data',
                    bash_command='python src/make_dataset.py',
                    dag=dag)
clean = BashOperator(task_id='clean',
                     bash_command='find . -name "*.pyc" -exec rm {} \;',
                     dag=dag)
lint = BashOperator(task_id='flake8',
                    bash_command='flake8 .',
                    dag=dag)

data.set_upstream(requirements)
}

dag = DAG('ldp', schedule_interval='0 10 * * *', default_args=default_args)

# -------------- gpsNo / deviceNo mapping: begin --------------
gps_contrast = BashOperator(
    task_id='gpsnoContrastJob',
    bash_command='echo "nvr gpsNo deviceNo contrast..." | sleep 5',
    dag=dag)

gps_contrast_kmx = BashOperator(
    task_id='gpsContrastJob',
    bash_command='echo "nvr gpsNo template by kmx..." | sleep 5',
    dag=dag)
gps_contrast_kmx.set_upstream(gps_contrast)
# -------------- gpsNo / deviceNo mapping: end --------------

# -------------- ldp data: begin --------------
chan_net_list = BashOperator(
    task_id='lgChannetListJob',
    bash_command='echo "ldp chan_net_list..." | sleep 5',
    dag=dag)
chan_net_list.set_upstream(gps_contrast_kmx)

city_list = BashOperator(
    task_id='lgCityListJob',
    bash_command='echo "ldp city_list..." | sleep 5',
    dag=dag)
city_list.set_upstream(gps_contrast_kmx)
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2016, 11, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 5,
    'retry_delay': timedelta(minutes=5)
}

dag = DAG('s3_dag_test', default_args=default_args, schedule_interval='@once')

t1 = BashOperator(
    task_id='bash_test',
    bash_command='echo "hello world" > s3_conn_test.txt',
    dag=dag)

sensor = S3KeySensor(
    task_id='check_s3_for_file_in_s3',
    bucket_key='*',
    wildcard_match=True,
    bucket_name='airflow-input-sprite',
    s3_conn_id='aws_default',
    timeout=18 * 60 * 60,
    poke_interval=120,
    dag=dag)

t1.set_upstream(sensor)
    dag=dag)

t3 = BashOperator(
    task_id='check_running_stat_job',
    bash_command='echo "Task 3"',
    dag=dag)

t4 = BashOperator(
    task_id='spark_etl',
    bash_command='exit 123',  # always fails with exit code 123
    dag=dag)

t5 = BashOperator(
    task_id='update_orc_file_variable',
    bash_command='echo "Task 5"',
    dag=dag)

t6 = BashOperator(
    task_id='remove_conf_file',
    bash_command='echo "Task 6"',
    dag=dag,
    trigger_rule='all_done')

t2.set_upstream(t1)
t3.set_upstream(t2)
t4.set_upstream(t3)
t5.set_upstream(t4)
t6.set_upstream(t2)
t6.set_upstream(t3)
t6.set_upstream(t4)
t6.set_upstream(t5)
def export_to_gcp_dag(_sub_dag_id, _schedule_interval, _queue, _default_args,
                      _export_hql_dict, _params, _dataproc_config):
    dag = DAG(dag_id=_sub_dag_id,
              schedule_interval=_schedule_interval,
              default_args=_default_args)

    export_data_hql = _export_hql_dict["export_data_hql"]
    add_dataproc_partition_hql = _export_hql_dict["add_dataproc_partition_hql"]
    drop_tmp_table_hql = _export_hql_dict["drop_tmp_table_hql"]

    def gen_date_str_nodash(execution_date, days_delta=0, hours_delta=0):
        from pytz import timezone, utc
        from datetime import datetime, timedelta
        pacific_timezone = "US/Pacific"
        date = utc.localize(execution_date)
        date = date.astimezone(timezone(pacific_timezone))
        if days_delta:
            date += timedelta(days=days_delta)
        if hours_delta:
            date += timedelta(hours=hours_delta)
        return date.strftime("%Y%m%d")

    _params.update({"gen_date_str_nodash": gen_date_str_nodash})

    export_data = HiveOperator(task_id="export_data",
                               hql=export_data_hql,
                               params=_params,
                               queue=_queue,
                               dag=dag)

    add_dataproc_partition = DataProcHiveOperator(
        task_id="add_dataproc_partition",
        dataproc_cluster=_dataproc_config["dataproc_cluster"],
        region=_dataproc_config["region"],
        gcp_conn_id=_dataproc_config["gcp_conn_id"],
        query=add_dataproc_partition_hql,
        params=_params,
        queue=_queue,
        dag=dag)

    drop_tmp_table = HiveOperator(task_id="drop_tmp_table",
                                  hql=drop_tmp_table_hql,
                                  params=_params,
                                  queue=_queue,
                                  dag=dag)

    add_dataproc_partition.set_upstream(export_data)
    drop_tmp_table.set_upstream(add_dataproc_partition)

    if _params.get("stamp_file_path", None) is not None:
        gcp_conf_true = 'google.cloud.auth.service.account.enable=true'
        gcp_conf_keyfile = 'google.cloud.auth.service.account.json.keyfile={{ params.gcp_keyfile }}'
        add_success_stamp_file = BashOperator(
            task_id="add_success_stamp_file",
            bash_command="hadoop fs -D " + gcp_conf_true +
                         " -D " + gcp_conf_keyfile +
                         " -touchz " + _params.get("stamp_file_path", None),
            params=_params,
            queue=_queue,
            dag=dag)
        add_success_stamp_file.set_upstream(drop_tmp_table)

    return dag
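# A hypothetical call site for the factory above; every argument value below
# is an illustrative assumption, since the real configuration lives outside
# this excerpt. The returned DAG still has to be bound to a module-level name
# for the scheduler to discover it.
sub_dag = export_to_gcp_dag(
    _sub_dag_id="export_events_to_gcp",
    _schedule_interval="0 8 * * *",
    _queue="default",
    _default_args=default_args,
    _export_hql_dict={
        "export_data_hql": "INSERT OVERWRITE TABLE tmp_export ...",
        "add_dataproc_partition_hql": "ALTER TABLE events ADD PARTITION (...)",
        "drop_tmp_table_hql": "DROP TABLE IF EXISTS tmp_export",
    },
    _params={"gcp_keyfile": "/etc/gcp/keyfile.json",
             "stamp_file_path": "gs://bucket/_SUCCESS"},
    _dataproc_config={"dataproc_cluster": "etl-cluster",
                      "region": "us-west1",
                      "gcp_conn_id": "google_cloud_default"},
)
# Expose the DAG at module level so the scheduler picks it up.
globals()[sub_dag.dag_id] = sub_dag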
    # tasks.
    trigger_rule=TriggerRule.ALL_DONE)

# Delete gcs files in the timestamped transformed folder.
delete_transformed_files = BashOperator(
    task_id='delete_transformed_files',
    bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" +
                 "/{{ dag_run.conf['transformed_path'] }}/")

# If the spark job or BQ Load fails we rename the timestamped raw path to
# a timestamped failed path.
move_failed_files = BashOperator(
    task_id='move_failed_files',
    bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" +
                 "/{{ dag_run.conf['raw_path'] }}/ " +
                 "gs://{{ var.value.gcs_bucket }}" +
                 "/{{ dag_run.conf['failed_path'] }}/",
    trigger_rule=TriggerRule.ONE_FAILED)

# Set the dag property of the first Operators, this will be inherited by
# downstream Operators.
create_cluster.dag = dag

create_cluster.set_downstream(submit_pyspark)
submit_pyspark.set_downstream([delete_cluster, bq_load])
bq_load.set_downstream(delete_transformed_files)
move_failed_files.set_upstream([bq_load, submit_pyspark])
]

# copy table to bi
#bitables = ['hardware', 'hardwareios']
bitables = []

for table in tables:
    imp = BashOperator(
        task_id='import_' + table,
        bash_command='/disk1/bdl/etl/ETL/imp_mongo_doc_with_date_input.sh {table} {begin} {end} > /disk1/bdl/etl/ETL/log/{table}.log '
                     .format(table=table, begin='{{ ds }}', end='{{ tomorrow_ds }}'),
        dag=dag)
    if table in bitables:
        bimp = BashOperator(
            task_id='send_2_bi_' + table,
            bash_command='/disk1/bdl/etl/ETL/send_bi_impala_with_date_input.sh {table} {begin} {end} > /disk1/bdl/etl/ETL/log/BI/{table}.log '
                         .format(table=table, begin='{{ ds }}', end='{{ tomorrow_ds }}'),
            dag=dag)
        bimp.set_upstream(imp)
        esucc.set_upstream(bimp)
    else:
        esucc.set_upstream(imp)

imp_software = BashOperator(
    task_id='import_software',
    bash_command='/disk1/bdl/etl/ETL/imp_software_doc_with_date_input.sh {{ ds }} {{ tomorrow_ds }} > /disk1/bdl/etl/ETL/log/software.log ',
    dag=dag)
esucc.set_upstream(imp_software)
Documentation that goes along with the Airflow tutorial located
[here](http://pythonhosted.org/airflow/tutorial.html)
"""
from airflow import DAG
from airflow.operators import BashOperator
from datetime import datetime, timedelta
from settings import default_args

dag = DAG('reddit_comments',
          default_args=default_args,
          schedule_interval=timedelta(2))

pre = BashOperator(
    task_id='setup',
    bash_command='tasks/setup_dirs.sh',
    depends_on_past=False,
    dag=dag)

t1 = BashOperator(
    task_id='ngrams_batch',
    bash_command='tasks/run_ngrams_batch.sh',
    depends_on_past=False,
    dag=dag)

t2 = BashOperator(
    task_id='ngrams_optimize',
    depends_on_past=True,
    bash_command='tasks/optimize_ngrams.sh',
    dag=dag)

t1.set_upstream(pre)
t2.set_upstream(t1)
}

# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name,
          default_args=task_args,
          concurrency=1,
          max_active_runs=1,
          schedule_interval=schedule_interval)

get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)

set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
        n = n + '_'
        dic[n] = c
    return dic

command_dict = dict_from_cmd_list(bash_commands)

# chain the generated tasks sequentially, each depending on the previous one
tasks = []
for n, c in command_dict.items():
    task = BashOperator(task_id=n, bash_command=c, dag=dag, pool='default')
    if len(tasks) > 0:
        task.set_upstream(tasks[-1])
    tasks.append(task)

job = DagExecutionJob(dag)

# def push(**kwargs):
#     # pushes an XCom without a specific target
#     kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

# def push_by_returning(**kwargs):
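# The commented-out stubs above break off mid-definition; they gesture at the
# standard XCom push/pull pattern, which in full looks roughly like this.
# value_1 and the task ids are assumptions carried over from the stub, not
# recovered from the original file.
def push(**kwargs):
    # pushes an XCom under an explicit key
    kwargs['ti'].xcom_push(key='value from pusher 1', value=value_1)

def push_by_returning(**kwargs):
    # a value returned from a PythonOperator callable is pushed
    # automatically under the default 'return_value' key
    return value_1

def puller(**kwargs):
    ti = kwargs['ti']
    # pull the explicitly pushed value, then the returned one
    pushed = ti.xcom_pull(key='value from pusher 1', task_ids='push')
    returned = ti.xcom_pull(task_ids='push_by_returning')
    print(pushed, returned)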
#import the required libraries
from airflow import DAG
from datetime import datetime, timedelta
from airflow.operators import BashOperator

#defining the default arguments dictionary
args = {
    'owner': 'airflow',
    'start_date': datetime(2020, 12, 2),  #you can change this start_date
    'retries': 1,
    "retry_delay": timedelta(seconds=10),
}

dag = DAG('Assignment_1', default_args=args)

#task1 creates a directory 'test_dir' inside the outputs folder
task1 = BashOperator(task_id='create_directory',
                     bash_command='mkdir -p ~/outputs/test_dir',
                     dag=dag)

#task2 gets the 'shasum' of the 'test_dir' directory
task2 = BashOperator(task_id='get_shasum',
                     bash_command='shasum ~/outputs/test_dir',
                     dag=dag)

#set up the operator relationship so that task1 runs before task2
task2.set_upstream(task1)
}

dag = DAG('monitoring_etl',
          default_args=default_args,
          schedule_interval=timedelta(days=1))

mad = BashOperator(task_id='maas_accounts_download',
                   bash_command=RUN_AS,
                   params={'cmd': 'cron-mad.sh'},
                   dag=dag)

mal = BashOperator(task_id='maas_accounts_load',
                   bash_command=RUN_AS,
                   params={'cmd': 'cron-mal.sh'},
                   dag=dag)
mal.set_upstream(mad)

mmd = BashOperator(task_id='maas_metrics_download',
                   bash_command=RUN_AS,
                   params={'cmd': 'download_from_cf.sh',
                           'path': '/home/maas/maas_report'},
                   dag=dag)
mmd.set_upstream(mal)

mml = BashOperator(task_id='maas_metrics_load',
                   bash_command=RUN_AS,
                   params={'cmd': 'cron-mml.sh'},
                   dag=dag)
mml.set_upstream(mmd)
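# RUN_AS itself is defined outside this excerpt; since every task reuses it
# with a params['cmd'] entry (and one adds 'path'), it is presumably a
# Jinja-templated command string along these lines -- the user and wrapper
# below are pure assumptions, not the real definition:
RUN_AS = 'sudo -u maas {{ params.cmd }} {{ params.get("path", "") }}'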
# run source_count only on the first Sunday of the month
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: ('source_count'
                             if datetime.now().day <= 7
                             and datetime.today().weekday() == 6
                             else 'ignore_not_sunday'),
    dag=dag)
branching.set_upstream(run_this_first)

esucc = EmailOperator(
    task_id='email_success_' + dag.dag_id,
    to=email_addr,
    subject=dag.dag_id + ' [success] on ' + datetime.now().strftime('%Y-%m-%d'),
    html_content='Congratulation!',
    trigger_rule='all_success',
    dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='/disk1/source_data_count; ./daily_table_count.sh > out.log ',
    dag=dag)
source_count.set_upstream(branching)
esucc.set_upstream(source_count)

ignore_not_sunday = DummyOperator(task_id='ignore_not_sunday', dag=dag)
ignore_not_sunday.set_upstream(branching)

# note: downstream of a branch, 'all_success' skips the join whenever either
# parent is skipped; 'one_success' is the usual trigger rule for a join after
# a BranchPythonOperator
join = DummyOperator(task_id='join', trigger_rule='all_success', dag=dag)
join << ignore_not_sunday
join << esucc
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('tutorial', default_args=default_args)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)

t2 = BashOperator(task_id='sleep', bash_command='sleep 5', retries=3, dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7)}}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(task_id='templated',
                  bash_command=templated_command,
                  params={'my_param': 'Parameter I passed in'},
                  dag=dag)

t2.set_upstream(t1)
t3.set_upstream(t1)
from airflow.operators import BashOperator, MySqlOperator
from airflow.models import DAG
from datetime import datetime

default_args = {
    'owner': 'max',
    'start_date': datetime(2014, 9, 1),
    'mysql_dbid': 'local_mysql',
}

dag = DAG(dag_id='example_3')

run_this = BashOperator(
    task_id='also_run_this',
    bash_command='ls -l',
    **default_args)
dag.add_task(run_this)

for i in range(5):
    i = str(i)
    task = BashOperator(
        task_id='runme_' + i,
        bash_command='sleep {{ 10 + macros.random() * 10 }}',
        **default_args)
    task.set_upstream(run_this)
    dag.add_task(task)
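# The same graph in the style used elsewhere in this collection, where each
# operator attaches itself to the DAG via the dag= kwarg and shared arguments
# move into the DAG's default_args, instead of explicit dag.add_task() calls.
# A sketch, not a drop-in replacement: 'mysql_dbid' has no counterpart here.
dag = DAG(dag_id='example_3',
          default_args={'owner': 'max', 'start_date': datetime(2014, 9, 1)})

run_this = BashOperator(task_id='also_run_this', bash_command='ls -l', dag=dag)

for i in range(5):
    task = BashOperator(
        task_id='runme_%d' % i,
        bash_command='sleep {{ 10 + macros.random() * 10 }}',
        dag=dag)
    task.set_upstream(run_this)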