def create_test_pipeline(suffix, trigger_rule, dag):
    skip_operator = DummySkipOperator(task_id='skip_operator_{}'.format(suffix), dag=dag)
    always_true = DummyOperator(task_id='always_true_{}'.format(suffix), dag=dag)
    join = DummyOperator(task_id=trigger_rule, dag=dag, trigger_rule=trigger_rule)
    join.set_upstream(skip_operator)
    join.set_upstream(always_true)
    final = DummyOperator(task_id='final_{}'.format(suffix), dag=dag)
    final.set_upstream(join)
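# DummySkipOperator is not a stock Airflow operator. A minimal sketch of what
# it presumably does -- nothing except mark itself SKIPPED, so the join task's
# trigger_rule has a skipped upstream to react to:
from airflow.exceptions import AirflowSkipException
from airflow.operators.dummy_operator import DummyOperator


class DummySkipOperator(DummyOperator):
    """A dummy operator that always skips itself (sketch, assumed behavior)."""

    def execute(self, context):
        # Raising AirflowSkipException sets this task instance's state to
        # SKIPPED instead of SUCCESS or FAILED.
        raise AirflowSkipException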
dag2 = DAG(dag_id='test_depends_on_past', default_args=default_args)
dag2_task1 = DummyOperator(
    task_id='test_dop_task',
    dag=dag2,
    depends_on_past=True,
)

# DAG tests that a Dag run that doesn't complete is marked failed
dag3 = DAG(dag_id='test_dagrun_states_fail', default_args=default_args)
dag3_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag3,
    python_callable=fail)
dag3_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag3,
)
dag3_task2.set_upstream(dag3_task1)

# DAG tests that a Dag run that completes but has a failure is marked success
dag4 = DAG(dag_id='test_dagrun_states_success', default_args=default_args)
dag4_task1 = PythonOperator(
    task_id='test_dagrun_fail',
    dag=dag4,
    python_callable=fail,
)
dag4_task2 = DummyOperator(
    task_id='test_dagrun_succeed',
    dag=dag4,
    trigger_rule=TriggerRule.ALL_FAILED)
dag4_task2.set_upstream(dag4_task1)

# DAG tests that a Dag run that completes but has a root failure is marked fail
dag5 = DAG(dag_id='test_dagrun_states_root_fail', default_args=default_args)
from airflow.operators import BranchPythonOperator, DummyOperator
from airflow.models import DAG
from datetime import datetime, timedelta
import random

seven_days_ago = datetime.combine(
    datetime.today() - timedelta(7), datetime.min.time())

args = {
    'owner': 'airflow',
    'start_date': seven_days_ago,
}

dag = DAG(dag_id='example_branch_operator', default_args=args)

cmd = 'ls -l'

run_this_first = DummyOperator(task_id='run_this_first', dag=dag)

options = ['branch_a', 'branch_b', 'branch_c', 'branch_d']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(run_this_first)

for option in options:
    t = DummyOperator(task_id=option, dag=dag)
    t.set_upstream(branching)
    dummy_follow = DummyOperator(task_id='follow_' + option, dag=dag)
    t.set_downstream(dummy_follow)
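# Branches not returned by the callable above are skipped, and the skip
# cascades through their downstream tasks. A hedged sketch (not part of the
# original example) of fanning the branches back into a single join task that
# still runs even though its sibling branches were skipped:
join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',  # fire as soon as the chosen branch succeeds
    dag=dag)
for option in options:
    # the 'follow_<option>' tasks were created in the loop above
    dag.get_task('follow_' + option).set_downstream(join)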
def generate_dag(area, download_dir, default_args):
    """Generate Landsat8 ingestion DAGs.

    Parameters
    ----------
    area: Landsat8Area
        Configuration parameters for the Landsat8 area to be downloaded
    download_dir: str
        Base directory into which Landsat8 products are downloaded
    default_args: dict
        Default arguments for all tasks in the DAG.

    """

    dag = DAG(
        LANDSAT8.id + "_{}".format(area.name),
        description="DAG for downloading, processing and ingesting {} AOI in Landsat8 data "
                    "from scene_list".format(area.name),
        default_args=default_args,
        dagrun_timeout=LANDSAT8.dagrun_timeout,
        schedule_interval=LANDSAT8.dag_schedule_interval,
        catchup=LANDSAT8.catchup,
        params={
            "area": area,
        }
    )
    search_task = Landsat8SearchOperator(
        task_id='search_{}'.format(area.name),
        area=area,
        cloud_coverage=LANDSAT8.cloud_coverage,
        startdate=LANDSAT8.startdate,
        enddate=LANDSAT8.enddate,
        filter_max=LANDSAT8.filter_max,
        order_by=LANDSAT8.order_by,
        order_type=LANDSAT8.order_type,
        db_credentials=CFG.landsat8_postgresql_credentials,
        dag=dag
    )
    generate_html_description = Landsat8ProductDescriptionOperator(
        task_id='generate_html_description',
        description_template=os.path.join(
            TEMPLATES_PATH, "product_abstract.html"),
        download_dir=download_dir,
        dag=dag
    )
    download_thumbnail = Landsat8DownloadOperator(
        task_id="download_thumbnail",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="thumb_small.jpg",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    generate_thumbnail = Landsat8ThumbnailOperator(
        task_id='generate_thumbnail',
        get_inputs_from=download_thumbnail.task_id,
        thumb_size_x="64",
        thumb_size_y="64",
        dag=dag
    )
    download_metadata = Landsat8DownloadOperator(
        task_id="download_metadata",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="MTL.txt",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    join_task = DummyOperator(
        task_id='landsat8_join',
        dag=dag
    )

    download_tasks = []
    translate_tasks = []
    addo_tasks = []
    upload_tasks = []
    gdalinfo_tasks = []

    for band in area.bands:
        download_band = Landsat8DownloadOperator(
            task_id="download_band{}".format(band),
            download_dir=download_dir,
            get_inputs_from=search_task.task_id,
            url_fragment="B{}.TIF".format(band),
            download_max=LANDSAT8.download_max,
            geoserver_rest_url=CFG.geoserver_rest_url,
            geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
            geoserver_username=CFG.geoserver_username,
            geoserver_password=CFG.geoserver_password,
            dag=dag
        )
        download_tasks.append(download_band)

        translate = GDALTranslateOperator(
            task_id="translate_band{}".format(band),
            get_inputs_from=download_band.task_id,
            dag=dag
        )
        translate_tasks.append(translate)

        addo = GDALAddoOperator(
            task_id="add_overviews_band{}".format(band),
            get_inputs_from=translate.task_id,
            resampling_method="average",
            max_overview_level=128,
            compress_overview="PACKBITS",
            dag=dag
        )
        addo_tasks.append(addo)

        gdalinfo = GDALInfoOperator(
            task_id='landsat8_gdalinfo_band_{}'.format(band),
            get_inputs_from=addo.task_id,
            dag=dag
        )
        gdalinfo_tasks.append(gdalinfo)

        upload = RSYNCOperator(
            task_id="upload_band{}".format(band),
            host=CFG.rsync_hostname,
            remote_usr=CFG.rsync_username,
            ssh_key_file=CFG.rsync_ssh_key,
            remote_dir=LANDSAT8.repository_dir,
            get_inputs_from=addo.task_id,
            dag=dag)
        upload_tasks.append(upload)

        download_band.set_upstream(search_task)
        translate.set_upstream(download_band)
        addo.set_upstream(translate)
        gdalinfo.set_upstream(addo)
        upload.set_upstream(addo)
        join_task.set_upstream(upload)
        join_task.set_upstream(gdalinfo)

    # Use a list rather than a generator expression: a generator would be
    # exhausted after a single pass and cannot be serialized in op_kwargs.
    download_task_ids = [task.task_id for task in download_tasks]

    create_original_package_task = PythonOperator(
        task_id="create_original_package",
        python_callable=create_original_package,
        op_kwargs={
            'get_inputs_from': {
                "search_task_id": search_task.task_id,
                "download_task_ids": download_task_ids,
            },
            'out_dir': LANDSAT8.process_dir
        },
        dag=dag)

    upload_original_package_task = RSYNCOperator(
        task_id="upload_original_package",
        host=CFG.rsync_hostname,
        remote_usr=CFG.rsync_username,
        ssh_key_file=CFG.rsync_ssh_key,
        remote_dir=LANDSAT8.original_package_upload_dir,
        get_inputs_from=create_original_package_task.task_id,
        dag=dag)

    # we only need gdalinfo output on one of the granules
    gdalinfo_task = gdalinfo_tasks[0]
    gdalinfo_task_id = gdalinfo_task.task_id

    upload_task_ids = [task.task_id for task in upload_tasks]

    generate_metadata = Landsat8MTLReaderOperator(
        task_id='generate_metadata',
        original_package_download_base_url=LANDSAT8.original_package_download_base_url,
        gs_workspace=LANDSAT8.geoserver_workspace,
        gs_wms_layer=LANDSAT8.geoserver_layer,
        gs_wms_width=LANDSAT8.geoserver_oseo_wms_width,
        gs_wms_height=LANDSAT8.geoserver_oseo_wms_height,
        gs_wms_format=LANDSAT8.geoserver_oseo_wms_format,
        gs_wms_version=LANDSAT8.geoserver_oseo_wms_version,
        gs_wfs_featuretype=LANDSAT8.geoserver_featuretype,
        gs_wfs_format=LANDSAT8.geoserver_oseo_wfs_format,
        gs_wfs_version=LANDSAT8.geoserver_oseo_wfs_version,
        gs_wcs_scale_i=LANDSAT8.geoserver_oseo_wcs_scale_i,
        gs_wcs_scale_j=LANDSAT8.geoserver_oseo_wcs_scale_j,
        gs_wcs_format=LANDSAT8.geoserver_oseo_wcs_format,
        gs_wcs_version=LANDSAT8.geoserver_oseo_wcs_version,
        gs_wcs_coverage_id=LANDSAT8.geoserver_layer,
        get_inputs_from={
            "search_task_id": search_task.task_id,
            "metadata_task_id": download_metadata.task_id,
            "upload_task_ids": upload_task_ids,
            "gdalinfo_task_id": gdalinfo_task_id,
            "upload_original_package_task_id": upload_original_package_task.task_id,
        },
        metadata_xml_path=os.path.join(TEMPLATES_PATH, "metadata.xml"),
        dag=dag
    )

    product_zip_task = Landsat8ProductZipFileOperator(
        task_id='landsat8_product_zip',
        get_inputs_from=[
            generate_html_description.task_id,
            generate_metadata.task_id,
            generate_thumbnail.task_id
        ],
        output_dir=LANDSAT8.process_dir,
        dag=dag
    )

    # The publish step performs the equivalent of:
    # curl -vvv -u evoadmin:\! \
    #   -XPOST -H "Content-type: application/zip" \
    #   --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip \
    #   "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products"
    publish_task = PythonOperator(
        task_id="publish_product_task",
        python_callable=publish_product,
        op_kwargs={
            'geoserver_username': CFG.geoserver_username,
            'geoserver_password': CFG.geoserver_password,
            'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format(
                CFG.geoserver_rest_url, LANDSAT8.geoserver_oseo_collection),
            'get_inputs_from': product_zip_task.task_id,
        },
        dag=dag)

    download_thumbnail.set_upstream(search_task)
    download_metadata.set_upstream(search_task)
    for tid in download_tasks:
        create_original_package_task.set_upstream(tid)
    upload_original_package_task.set_upstream(create_original_package_task)
    generate_metadata.set_upstream(join_task)
    generate_metadata.set_upstream(download_metadata)
    generate_metadata.set_upstream(upload_original_package_task)
    generate_thumbnail.set_upstream(download_thumbnail)
    generate_html_description.set_upstream(search_task)
    product_zip_task.set_upstream(generate_html_description)
    product_zip_task.set_upstream(generate_metadata)
    product_zip_task.set_upstream(generate_thumbnail)
    publish_task.set_upstream(upload_original_package_task)
    publish_task.set_upstream(product_zip_task)

    return dag
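# Because generate_dag returns the DAG instead of defining it at module level,
# each generated DAG must be placed into the module's globals() for the
# scheduler to discover it. A minimal sketch of the registration loop; AREAS,
# DOWNLOAD_DIR and DEFAULT_ARGS are hypothetical names, not from the original:
for area in AREAS:
    generated = generate_dag(area, DOWNLOAD_DIR, DEFAULT_ARGS)
    globals()[generated.dag_id] = generated  # one module-level DAG per area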
branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: ('source_count'
                             if datetime.now().day <= 7
                             and datetime.today().weekday() == 6
                             else 'ignore_not_sunday'),
    dag=dag)
branching.set_upstream(run_this_first)

esucc = EmailOperator(
    task_id='email_success_' + dag.dag_id,
    to=email_addr,
    subject=dag.dag_id + ' [success] on ' + datetime.now().strftime('%Y-%m-%d'),
    html_content='Congratulations!',
    trigger_rule='all_success',
    dag=dag)

source_count = BashOperator(
    task_id='source_count',
    bash_command='/disk1/source_data_count; ./daily_table_count.sh > out.log ',
    dag=dag)
source_count.set_upstream(branching)
esucc.set_upstream(source_count)

ignore_not_sunday = DummyOperator(task_id='ignore_not_sunday', dag=dag)
ignore_not_sunday.set_upstream(branching)

join = DummyOperator(task_id='join', trigger_rule='all_success', dag=dag)
join << ignore_not_sunday
join << esucc
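# `a << b` is Airflow shorthand for `a.set_upstream(b)`, just as `a >> b` is
# `a.set_downstream(b)`, so the last two lines above mirror the earlier
# set_upstream calls. An observation on the snippet (assumption about intent,
# not a fix): branching always skips one of join's two ancestors, so a join
# with trigger_rule='all_success' is itself skipped on every run; the usual
# rule after a branch is 'one_success', e.g.:
join_after_branch = DummyOperator(
    task_id='join_after_branch', trigger_rule='one_success', dag=dag)
join_after_branch << ignore_not_sunday
join_after_branch << esucc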
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)
run_this_last.set_upstream(t2)
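# A note on the schedule above (and its near-duplicate later in this section):
# in the five-field cron format `minute hour day-of-month month day-of-week`,
# "0 0 14 * MON-FRI" fires at 00:00 on the 14th of every month *and* at 00:00
# every Mon-Fri, because cron ORs day-of-month with day-of-week when both are
# restricted. If "2 p.m. on weekdays" was the intent (an assumption), that
# would instead be:
# schedule_interval="0 14 * * MON-FRI"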
        dag=dag,
        table='{}.{}'.format(dependency["db"], dependency["table"]),
        partition=dependency["partition"],
    )
    dependency_list.append(wait_for)

# Create the full path for the HQL file
hql_file_path = os.path.join(os.path.dirname(__file__), source['hql'])
print(hql_file_path)

run_hive_query = HiveOperator(
    task_id='run_hive_query',
    dag=dag,
    hql="""
    {{ local_hive_settings }}
    """ + "\n " + open(hql_file_path, 'r').read())

# dummy task
all_tasks = DummyOperator(task_id='all_tasks',
                          dag=dag,
                          on_success_callback=send_task_success)

# mark dependencies
for dependency in dependency_list:
    dependency.set_downstream(run_hive_query)
all_tasks.set_upstream(run_hive_query)

# So that multiple dags can be created
# https://airflow.incubator.apache.org/faq.html#how-can-i-create-dags-dynamically
globals()[dag_id] = dag
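# The truncated constructor at the top of the snippet above is presumably a
# HivePartitionSensor -- an assumption based on the table/partition kwargs.
# A minimal sketch of the elided wait_for block under that assumption, using
# the pre-1.10 import path; `source['dependencies']` is a hypothetical shape:
from airflow.operators.sensors import HivePartitionSensor

for dependency in source['dependencies']:
    wait_for = HivePartitionSensor(
        task_id='wait_for_{}_{}'.format(dependency["db"], dependency["table"]),
        dag=dag,
        table='{}.{}'.format(dependency["db"], dependency["table"]),
        partition=dependency["partition"],
    )
    dependency_list.append(wait_for)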
    t.set_upstream(dummy_op)
    t.set_downstream(create_temp_scores_table_op)

archive_trained_models_op = BashOperator(
    task_id='archive_trained_models',
    bash_command='scripts/bash/archive_trained_models.sh',
    dag=dag
)

notify_processing_completion_op = SlackAPIPostOperator(
    task_id='notify_processing_completion',
    token=Variable.get('slack_token'),
    channel='#engineering-commits',
    username='******',
    icon_url=Variable.get('tia_slack_icon_url'),
    text='*user_work_experience_job_posting_similarity_scores* has been refreshed on {{ts}}',
    dag=dag
)

create_temp_scores_table_op.set_downstream(copy_scores_to_temp_table_op)
copy_scores_to_temp_table_op.set_downstream(remove_scores_op)
copy_scores_to_temp_table_op.set_downstream(update_scores_op)
delete_temp_scores_table_op.set_upstream(remove_scores_op)
delete_temp_scores_table_op.set_upstream(update_scores_op)
delete_temp_scores_table_op.set_downstream(notify_processing_completion_op)
dummy_op.set_upstream(compute_title_feature_op)
dummy_op.set_upstream(compute_skill_feature_op)
dummy_op.set_upstream(compute_description_feature_op)
dummy_op.set_downstream(archive_trained_models_op)
    'owner': 'airflow',
    'start_date': datetime.now() - timedelta(seconds=10),
    'retries': 0
}

dag = DAG('Sales_Nov',
          default_args=default_args,
          start_date=datetime.now() - timedelta(seconds=10))

op1 = DummyOperator(task_id='File1_landing', dag=dag)
t1 = EmailOperator(task_id='Processing_File_1',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 1 started",
                   dag=dag)
op2 = DummyOperator(task_id='File2_landing', dag=dag)
t2 = EmailOperator(task_id='Processing_File_2',
                   to='*****@*****.**',
                   subject="Airflow_report",
                   html_content="File 2 started",
                   dag=dag)
op3 = DummyOperator(task_id='Aggregating', dag=dag)
op4 = DummyOperator(task_id='Final_Table_Push', dag=dag)

t1.set_upstream(op1)
t2.set_upstream(op2)
op3.set_upstream(t1)
op3.set_upstream(t2)
op4.set_upstream(op3)
    schedule_interval="30 17 * * *"  # any crontab-style schedule expression works here
)

task0 = DummyOperator(task_id='task0', dag=dag)

cmd = 'ls -l'
task1 = BashOperator(task_id='task1', bash_command=cmd, dag=dag)
task0.set_downstream(task1)

task2 = DummyOperator(trigger_rule='all_done',
                      task_id='task2',
                      dag=dag,
                      depends_on_past=True)
task2.set_upstream(task1)

task3 = DummyOperator(trigger_rule='all_done',
                      depends_on_past=True,
                      task_id='task3',
                      dag=dag)
task3.set_upstream(task2)

# 'lsfds-ljss' is not a real command, so task4 always fails -- presumably a
# deliberate failure so that task5's trigger_rule='all_done' can be observed.
task4 = BashOperator(task_id='task4', bash_command='lsfds-ljss', dag=dag)

task5 = DummyOperator(trigger_rule='all_done', task_id='task5', dag=dag)
task5.set_upstream(task4)
task5.set_upstream(task3)
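# trigger_rule='all_done' fires once every upstream task has finished,
# regardless of whether it succeeded, failed, or was skipped; the default
# 'all_success' would leave task5 unscheduled because task4 fails. A minimal
# sketch of the difference (task names here are illustrative, not original):
fails = BashOperator(task_id='always_fails', bash_command='exit 1', dag=dag)
runs_anyway = DummyOperator(task_id='runs_anyway', trigger_rule='all_done', dag=dag)
never_runs = DummyOperator(task_id='never_runs', dag=dag)  # default all_success
runs_anyway.set_upstream(fails)
never_runs.set_upstream(fails)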
}

# BranchPython operator that depends on past
# and where tasks may run or be skipped on
# alternating runs
dag = DAG(dag_id='example_branch_dop_operator_v3',
          schedule_interval='*/1 * * * *',
          default_args=args)


def should_run(ds, **kwargs):
    print("------------- exec dttm = {} and minute = {}".format(
        kwargs['execution_date'], kwargs['execution_date'].minute))
    if kwargs['execution_date'].minute % 2 == 0:
        return "oper_1"
    else:
        return "oper_2"


cond = BranchPythonOperator(task_id='condition',
                            provide_context=True,
                            python_callable=should_run,
                            dag=dag)

oper_1 = DummyOperator(task_id='oper_1', dag=dag)
oper_1.set_upstream(cond)
oper_2 = DummyOperator(task_id='oper_2', dag=dag)
oper_2.set_upstream(cond)
import sys

from qfl.etl.data_ingest import daily_equity_price_ingest

default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)
run_this_last.set_upstream(t2)
    task_id='nothing_to_remove',
    dag=dag
)
nothing_to_update_op = DummyOperator(
    task_id='nothing_to_update',
    dag=dag
)

check_job_posting_to_be_updated_op.set_downstream(check_to_remove_op)
check_job_posting_to_be_updated_op.set_downstream(check_to_update_op)
check_work_experience_to_be_updated_op.set_downstream(check_to_remove_op)
check_work_experience_to_be_updated_op.set_downstream(check_to_update_op)

update_scores_branch_op.set_upstream(check_to_update_op)
remove_scores_op.set_upstream(check_to_remove_op)
nothing_to_remove_op.set_upstream(check_to_remove_op)
nothing_to_update_op.set_upstream(check_to_update_op)
notify_processing_completion_op.set_upstream(nothing_to_remove_op)
notify_processing_completion_op.set_upstream(nothing_to_update_op)

update_scores_branch_op.set_downstream(compute_title_feature_op)
update_scores_branch_op.set_downstream(compute_skill_feature_op)
update_scores_branch_op.set_downstream(compute_description_feature_op)

compute_similarity_op.set_upstream(compute_title_feature_op)
compute_similarity_op.set_upstream(compute_skill_feature_op)
compute_similarity_op.set_upstream(compute_description_feature_op)
compute_similarity_op.set_downstream(update_scores_op)
t6 = PythonOperator(task_id='daily_optionworks_ingest',
                    python_callable=DailyOptionWorksIngest.launch,
                    dag=dag,
                    provide_context=True)

t7 = PythonOperator(task_id='daily_generic_index_price_ingest',
                    python_callable=DailyGenericIndexPriceIngest.launch,
                    dag=dag,
                    provide_context=True)

# Note: night_task_waiter is never passed dag=dag, and its upstream links are
# commented out below, so it is not actually part of this DAG as written.
night_task_waiter = TimeSensor(task_id='night_task_2000_waiter',
                               target_time=dt.time(hour=20, minute=0))

run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

# t1.set_upstream(night_task_waiter)
t2.set_upstream(t1)
# t3.set_upstream(night_task_waiter)
# t5.set_upstream(night_task_waiter)
t2.set_upstream(t7)

run_this_last.set_upstream(t1)
run_this_last.set_upstream(t2)
run_this_last.set_upstream(t3)
run_this_last.set_upstream(t4)
run_this_last.set_upstream(t5)
run_this_last.set_upstream(t6)
run_this_last.set_upstream(t7)
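# TimeSensor simply holds its task slot until the given wall-clock target_time
# is reached within the run. To actually gate the ingests on 20:00 as the
# commented-out lines suggest, the sensor must be attached to the DAG and
# wired upstream; a minimal sketch, assuming that was the original intent:
night_task_waiter = TimeSensor(task_id='night_task_2000_waiter',
                               target_time=dt.time(hour=20, minute=0),
                               dag=dag)
t1.set_upstream(night_task_waiter)
t3.set_upstream(night_task_waiter)
t5.set_upstream(night_task_waiter)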