    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow',
                    python_callable=test_airflow,
                    dag=dag)
t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)
run_this_last.set_upstream(t2)
echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" echo "{{ params.my_param }}" {% endfor %} """ join = DummyOperator( task_id='join', trigger_rule='all_done', dag=dag ) sum_up = PythonOperator( task_id='sum_up', provide_context=True, python_callable=sum_up_task, dag=dag, execution_timeout=timedelta(seconds=60), on_failure_callback=notify_failure, ) p1.set_upstream(p0) p2.set_upstream(p0) p3.set_upstream(p0) c1.set_upstream(p1) c2.set_upstream(p2) c3.set_upstream(p3) c3.set_downstream(join) c2.set_downstream(join) sum_up.set_upstream(join)
'yearID', 'franchID', 'teamID', 'W', 'L', 'percentage', 'franchName' ], encoding='utf-8') conn.insert_rows(table=table_name, rows=results.values.tolist()) return table_name dag = DAG('zylo_example', schedule_interval=timedelta(hours=1), start_date=datetime(2016, 10, 24), default_args=default_args) t1 = PythonOperator(task_id='get_zip_file', provide_context=True, python_callable=get_zip, dag=dag) t2 = PythonOperator(task_id='get_top_teams', provide_context=True, python_callable=top_teams, dag=dag) t3 = PythonOperator(task_id='load_to_MySql', provide_context=True, python_callable=bulk_load_teams, op_kwargs={'table_name': 'top_teams'}, dag=dag) t2.set_upstream(t1) t3.set_upstream(t2)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("bcftools", default_args=default_args, schedule_interval=None, concurrency=20000, max_active_runs=20000) start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) bcftools_task = PythonOperator( task_id="bcftools", python_callable=bcftools, provide_context=True, dag=dag) bcftools_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(bcftools_task)
t4 = PythonOperator(task_id='python_{}_3'.format(pub_id), python_callable=my_display_function, op_kwargs={'phase': 'EXTRACT_DATA_START'}, dag=dag) t5 = BashOperator( task_id='extractdata_{}'.format(pub_id), pool='simba_extract_data', bash_command= 'sh /x/home/dm_hdp_batch/test/projects/steam_donkey/scripts/export_processing.sh ', dag=dag) t6 = PythonOperator(task_id='python_{}_4'.format(pub_id), python_callable=my_display_function, op_kwargs={'phase': 'EXTRACT_DATA_END'}, dag=dag) t7 = TriggerDagRunOperator(task_id='trigger_{}_1'.format(pub_id), trigger_dag_id="SUB_{}_{}".format( sub_id, sub_id_ver), python_callable=conditionally_trigger, params={ 'condition_param': True, 'message': 'Hello World' }, dag=dag) t1.set_upstream(t0) t2.set_upstream(t1) t3.set_upstream(t2) t4.set_upstream(t3) t5.set_upstream(t4) t6.set_upstream(t5) t7.set_upstream(t6)
def generate_dag(area, download_dir, default_args):
    """Generate Landsat8 ingestion DAGs.

    Parameters
    ----------
    area: Landsat8Area
        Configuration parameters for the Landsat8 area to be downloaded
    download_dir: str
        Base directory where products for the area are downloaded
    default_args: dict
        Default arguments for all tasks in the DAG.

    """

    dag = DAG(
        LANDSAT8.id + "_{}".format(area.name),
        description="DAG for downloading, processing and ingesting {} AOI in Landsat8 data "
                    "from scene_list".format(area.name),
        default_args=default_args,
        dagrun_timeout=LANDSAT8.dagrun_timeout,
        schedule_interval=LANDSAT8.dag_schedule_interval,
        catchup=LANDSAT8.catchup,
        params={
            "area": area,
        }
    )
    search_task = Landsat8SearchOperator(
        task_id='search_{}'.format(area.name),
        area=area,
        cloud_coverage=LANDSAT8.cloud_coverage,
        startdate=LANDSAT8.startdate,
        enddate=LANDSAT8.enddate,
        filter_max=LANDSAT8.filter_max,
        order_by=LANDSAT8.order_by,
        order_type=LANDSAT8.order_type,
        db_credentials=CFG.landsat8_postgresql_credentials,
        dag=dag
    )
    generate_html_description = Landsat8ProductDescriptionOperator(
        task_id='generate_html_description',
        description_template=os.path.join(TEMPLATES_PATH, "product_abstract.html"),
        download_dir=download_dir,
        dag=dag
    )
    download_thumbnail = Landsat8DownloadOperator(
        task_id="download_thumbnail",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="thumb_small.jpg",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    generate_thumbnail = Landsat8ThumbnailOperator(
        task_id='generate_thumbnail',
        get_inputs_from=download_thumbnail.task_id,
        thumb_size_x="64",
        thumb_size_y="64",
        dag=dag
    )
    download_metadata = Landsat8DownloadOperator(
        task_id="download_metadata",
        download_dir=download_dir,
        get_inputs_from=search_task.task_id,
        url_fragment="MTL.txt",
        download_max=LANDSAT8.download_max,
        geoserver_rest_url=CFG.geoserver_rest_url,
        geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
        geoserver_username=CFG.geoserver_username,
        geoserver_password=CFG.geoserver_password,
        dag=dag
    )
    join_task = DummyOperator(
        task_id='landsat8_join',
        dag=dag
    )

    download_tasks = []
    translate_tasks = []
    addo_tasks = []
    upload_tasks = []
    gdalinfo_tasks = []

    # one download/translate/addo/gdalinfo/upload chain per band of the area
    for band in area.bands:
        download_band = Landsat8DownloadOperator(
            task_id="download_band{}".format(band),
            download_dir=download_dir,
            get_inputs_from=search_task.task_id,
            url_fragment="B{}.TIF".format(band),
            download_max=LANDSAT8.download_max,
            geoserver_rest_url=CFG.geoserver_rest_url,
            geoserver_oseo_collection=LANDSAT8.geoserver_oseo_collection,
            geoserver_username=CFG.geoserver_username,
            geoserver_password=CFG.geoserver_password,
            dag=dag
        )
        download_tasks.append(download_band)

        translate = GDALTranslateOperator(
            task_id="translate_band{}".format(band),
            get_inputs_from=download_band.task_id,
            dag=dag
        )
        translate_tasks.append(translate)

        addo = GDALAddoOperator(
            task_id="add_overviews_band{}".format(band),
            get_inputs_from=translate.task_id,
            resampling_method="average",
            max_overview_level=128,
            compress_overview="PACKBITS",
            dag=dag
        )
        addo_tasks.append(addo)

        gdalinfo = GDALInfoOperator(
            task_id='landsat8_gdalinfo_band_{}'.format(band),
            get_inputs_from=addo.task_id,
            dag=dag
        )
        gdalinfo_tasks.append(gdalinfo)

        upload = RSYNCOperator(
            task_id="upload_band{}".format(band),
            host=CFG.rsync_hostname,
            remote_usr=CFG.rsync_username,
            ssh_key_file=CFG.rsync_ssh_key,
            remote_dir=LANDSAT8.repository_dir,
            get_inputs_from=addo.task_id,
            dag=dag)
        upload_tasks.append(upload)

        download_band.set_upstream(search_task)
        translate.set_upstream(download_band)
        addo.set_upstream(translate)
        gdalinfo.set_upstream(addo)
        upload.set_upstream(addo)
        join_task.set_upstream(upload)
        join_task.set_upstream(gdalinfo)

    download_task_ids = (task.task_id for task in download_tasks)
    create_original_package_task = PythonOperator(
        task_id="create_original_package",
        python_callable=create_original_package,
        op_kwargs={
            'get_inputs_from': {
                "search_task_id": search_task.task_id,
                "download_task_ids": download_task_ids,
            },
            'out_dir': LANDSAT8.process_dir
        },
        dag=dag)

    upload_original_package_task = RSYNCOperator(
        task_id="upload_original_package",
        host=CFG.rsync_hostname,
        remote_usr=CFG.rsync_username,
        ssh_key_file=CFG.rsync_ssh_key,
        remote_dir=LANDSAT8.original_package_upload_dir,
        get_inputs_from=create_original_package_task.task_id,
        dag=dag)

    # we only need gdalinfo output on one of the granules
    gdalinfo_task = gdalinfo_tasks[0]
    gdalinfo_task_id = gdalinfo_task.task_id

    upload_task_ids = (task.task_id for task in upload_tasks)
    generate_metadata = Landsat8MTLReaderOperator(
        task_id='generate_metadata',
        original_package_download_base_url=LANDSAT8.original_package_download_base_url,
        gs_workspace=LANDSAT8.geoserver_workspace,
        gs_wms_layer=LANDSAT8.geoserver_layer,
        gs_wms_width=LANDSAT8.geoserver_oseo_wms_width,
        gs_wms_height=LANDSAT8.geoserver_oseo_wms_height,
        gs_wms_format=LANDSAT8.geoserver_oseo_wms_format,
        gs_wms_version=LANDSAT8.geoserver_oseo_wms_version,
        gs_wfs_featuretype=LANDSAT8.geoserver_featuretype,
        gs_wfs_format=LANDSAT8.geoserver_oseo_wfs_format,
        gs_wfs_version=LANDSAT8.geoserver_oseo_wfs_version,
        gs_wcs_scale_i=LANDSAT8.geoserver_oseo_wcs_scale_i,
        gs_wcs_scale_j=LANDSAT8.geoserver_oseo_wcs_scale_j,
        gs_wcs_format=LANDSAT8.geoserver_oseo_wcs_format,
        gs_wcs_version=LANDSAT8.geoserver_oseo_wcs_version,
        gs_wcs_coverage_id=LANDSAT8.geoserver_layer,
        get_inputs_from={
            "search_task_id": search_task.task_id,
            "metadata_task_id": download_metadata.task_id,
            "upload_task_ids": upload_task_ids,
            "gdalinfo_task_id": gdalinfo_task_id,
            "upload_original_package_task_id": upload_original_package_task.task_id,
        },
        metadata_xml_path=os.path.join(TEMPLATES_PATH, "metadata.xml"),
        dag=dag
    )
    product_zip_task = Landsat8ProductZipFileOperator(
        task_id='landsat8_product_zip',
        get_inputs_from=[
            generate_html_description.task_id,
            generate_metadata.task_id,
            generate_thumbnail.task_id
        ],
        output_dir=LANDSAT8.process_dir,
        dag=dag
    )

    # curl -vvv -u evoadmin:\!
-XPOST -H "Content-type: application/zip" --data-binary @/var/data/Sentinel-2/S2_MSI_L1C/download/S2A_MSIL1C_20170909T093031_N0205_R136_T36VUQ_20170909T093032/product.zip "http://ows-oda.eoc.dlr.de/geoserver/rest/oseo/collections/SENTINEL2/products" publish_task = PythonOperator(task_id="publish_product_task", python_callable=publish_product, op_kwargs={ 'geoserver_username': CFG.geoserver_username, 'geoserver_password': CFG.geoserver_password, 'geoserver_rest_endpoint': '{}/oseo/collections/{}/products'.format( CFG.geoserver_rest_url, LANDSAT8.geoserver_oseo_collection), 'get_inputs_from': product_zip_task.task_id, }, dag=dag) download_thumbnail.set_upstream(search_task) download_metadata.set_upstream(search_task) for tid in download_tasks: create_original_package_task.set_upstream(tid) upload_original_package_task.set_upstream(create_original_package_task) generate_metadata.set_upstream(join_task) generate_metadata.set_upstream(download_metadata) generate_metadata.set_upstream(upload_original_package_task) generate_thumbnail.set_upstream(download_thumbnail) generate_html_description.set_upstream(search_task) product_zip_task.set_upstream(generate_html_description) product_zip_task.set_upstream(generate_metadata) product_zip_task.set_upstream(generate_thumbnail) publish_task.set_upstream(upload_original_package_task) publish_task.set_upstream(product_zip_task) return dag
python_callable=CheckReadLogs(), dag=dag) put_file = PythonOperator( task_id='put-file-to-s3', python_callable=DataPutter(), dag=dag) delete_object = PythonOperator( task_id='delete-object-from-s3', python_callable=DeleteObject(), dag=dag) cleanup = BashOperator( task_id='cleanup', bash_command=rm_file, trigger_rule=TriggerRule.ALL_DONE, dag=dag) get_file.set_upstream(put_file) hello_world_docker_write_logs.set_upstream(get_file) check_read_logs.set_upstream(hello_world_docker_write_logs) cleanup.set_upstream(check_read_logs) cleanup.set_upstream(get_file) delete_object.set_upstream(get_file)
subject='Latest popular links', html_content='Check out the latest!!', files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)], dag=dag) sub = SubDagOperator(subdag=subdag, task_id='insert_and_id_pop', trigger_rule='one_success', dag=dag) clear_latest = BashOperator( bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR), task_id='clear_latest', dag=dag) gen_search_terms.set_upstream(fill_search_terms) for term in SEARCH_TERMS: term_without_punctuation = re.sub(r'\W+', '', term) simple_search = PythonOperator( task_id='search_{}_twitter'.format(term_without_punctuation), provide_context=True, python_callable=search_twitter, dag=dag, params={'query': term}) simple_search.set_upstream(gen_search_terms) simple_search.set_downstream(sub) sub.set_downstream(email_links) email_links.set_downstream(clear_latest)
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://dev.canopydata.com/airflow/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)

join = DummyOperator(
    task_id='join',
    trigger_rule='one_success',
    dag=dag
default_args=default_args, schedule_interval=None, concurrency=10000, max_active_runs=2000) start_analysis_run_task = PythonOperator(task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) validate_sample_task = PythonOperator(task_id="validate_sample", python_callable=validate_sample, provide_context=True, dag=dag) validate_sample_task.set_upstream(start_analysis_run_task) delly_task = PythonOperator(task_id="delly_genotype", python_callable=run_delly, provide_context=True, dag=dag) delly_task.set_upstream(validate_sample_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(delly_task)
dag=dag) deliver_res_op = """ cp -r {{ params.project_dir }}/{{ params.dag_id }}_results/* {{ params.project_dir }}/results """ deliver_res = BashOperator(task_id='Deliver_result', bash_command=deliver_res_op, params={ 'project_dir': project_directory, 'dag_id': dag_id }, dag=dag) preprosessing.set_upstream(clean_up) ms_concatenation.set_upstream(preprosessing) clustering_or.set_upstream(ms_concatenation) taxo_assignation.set_upstream(clustering_or) biom_generation.set_upstream(clustering_or) biom_generation.set_upstream(taxo_assignation) tree_generation.set_upstream(clustering_or) filter_weak_otus.set_upstream(biom_generation) biom_conversion.set_upstream(filter_weak_otus) raw_matrix_generation.set_upstream(biom_conversion) matrix_normalization.set_upstream(raw_matrix_generation) matrix_consolidation.set_upstream(matrix_normalization) output_res.set_upstream(ms_concatenation) output_res.set_upstream(tree_generation) output_res.set_upstream(biom_conversion) output_res.set_upstream(raw_matrix_generation) output_res.set_upstream(matrix_consolidation)
start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) validate_sample_task = PythonOperator( task_id="validate_sample", python_callable=validate_sample, provide_context=True, dag=dag) validate_sample_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) for contig_name in tracker.util.workflow_common.CONTIG_NAMES: freebayes_task = PythonOperator( task_id="freebayes_" + contig_name, python_callable=run_freebayes, op_kwargs={"contig_name": contig_name}, provide_context=True, dag=dag)
# We want subprocess output to bypass logging module otherwise multiline # output is squashed together. util.run(args, use_print=True, dryrun=dryrun) build_op = PythonOperator(task_id='build_images', provide_context=True, python_callable=build_images, dag=dag) setup_cluster_op = PythonOperator(task_id='setup_cluster', provide_context=True, python_callable=setup_cluster, dag=dag) setup_cluster_op.set_upstream(build_op) run_tests_op = PythonOperator(task_id='run_tests', provide_context=True, python_callable=run_tests, dag=dag) run_tests_op.set_upstream(setup_cluster_op) teardown_cluster_op = PythonOperator(task_id='teardown_cluster', provide_context=True, python_callable=teardown_cluster, dag=dag) teardown_cluster_op.set_upstream(run_tests_op)
def create_dag(dag_id, schedule, start_date, delta_sensor, airpots_codes,
               default_args):
    dag = DAG(dag_id,
              schedule_interval=schedule,
              start_date=start_date,
              default_args=default_args)
    dag.doc_md = """
    # DAG fetching data from smiles.com.ar
    ### processing and dumping on postgresql
    """
    """start = TimeDeltaSensor(
        task_id='wait_to_start',
        delta=timedelta(minutes=delta_sensor),
        dag=dag)"""
    start = DummyOperator(task_id="start", dag=dag)
    branches = []

    def return_dates_branches(**kwargs):
        return branches

    gen_url_branch = BranchPythonOperator(
        task_id='generate_url_dates',
        provide_context=True,
        python_callable=return_dates_branches,
        dag=dag)

    def transform_data(**kwargs):
        ti = kwargs['ti']
        raw_data = ti.xcom_pull(task_ids=return_dates_branches())
        data = []
        logging.info(raw_data)
        if raw_data is not None:
            flat_list = [item for sublist in raw_data for item in sublist]
            for row in flat_list:
                row = list(row)
                # add À-ÿ for spanish accents
                date = '/'.join(
                    list(
                        re.compile(r"([A-ZÀ-ÿ]+)(\d+)([A-ZÀ-ÿ]+)").split(
                            row[1]))[2:4])
                date = dateparser.parse(date,
                                        languages=['pt', 'es'],
                                        date_formats=['%d/%b'
                                                      ]).strftime('%Y-%m-%d')
                row[1] = date
                td = row[4].split(':')
                row[4] = str(timedelta(hours=int(td[0]), minutes=int(td[1])))
                row[5] = int(row[5].replace('.', ''))
                row[6] = int(row[6].replace('.', ''))
                row[8] = row[8].split(' ')[-1]
                row.insert(0, datetime.now().strftime('%Y-%m-%d'))
                data.append(tuple(row))
            return data
        else:
            print('No data received')

    t2 = PythonOperator(
        task_id='transform_data',
        python_callable=transform_data,
        depends_on_past=True,
        trigger_rule=TriggerRule.ALL_SUCCESS,
        provide_context=True,
        dag=dag,
    )
    t2.doc_md = """
    #### Task Documentation
    Transform fetched data
    @return a list of tuples
    """

    # def gen_url_dates(**kwargs):
    date_start = read_scraped_date(airpots_codes)
    date_end = date_start + timedelta(days=AMOUNT_DAYS)
    date_generated = [
        date_start + timedelta(days=x)
        for x in range(0, (date_end - date_start).days)
    ]
    for i, date in enumerate(date_generated):
        date_ml = str(date.timestamp())[:8] + '00000'
        url_dated = """https://www.smiles.com.ar/emission?originAirportCode={}&destinationAirportCode={}&departureDate={}&adults=1&children=0&infants=0&isFlexibleDateChecked=false&tripType=3&currencyCode=BRL&segments=2&departureDate2={}&originAirportCode2={}&destinationAirportCode2={}""".format(
            airpots_codes[0][0], airpots_codes[1], date_ml, date_ml,
            airpots_codes[0][1], airpots_codes[1])
        get_data_op = PythonOperator(
            task_id='get_data_{}and{}to{}_{}'.format(airpots_codes[0][0],
                                                     airpots_codes[0][1],
                                                     airpots_codes[1], i),
            python_callable=get_data_URL,
            op_kwargs={'URL': url_dated},
            trigger_rule=TriggerRule.ONE_SUCCESS,
            provide_context=True,
            dag=dag,
        )
        branches.append(get_data_op.task_id)
        get_data_op.set_upstream(gen_url_branch)
        get_data_op.set_downstream(t2)
        get_data_op.doc_md = """
        #### Task Documentation
        Fetch data from passed url
        return list of semi-parsed data
        """

    insert_data = PythonOperator(
        task_id='insert_data',
        python_callable=insert_into_table,
        provide_context=True,
        dag=dag,
    )
    insert_data.doc_md = """
    #### Task Documentation
    Insert parsed and transformed data into table
    """

    t2.set_downstream(insert_data)
    gen_url_branch.set_upstream(start)
    return dag
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("test", default_args=default_args, schedule_interval=None, concurrency=20000, max_active_runs=20000) start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) test_task = PythonOperator( task_id="test", python_callable=run_test, provide_context=True, dag=dag) test_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(test_task)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("sanger_variant_calling", default_args=default_args, schedule_interval=None, concurrency=500, max_active_runs=500) start_analysis_run_task = PythonOperator(task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) run_sanger_callers_task = PythonOperator(task_id="run_sanger_callers", python_callable=run_sanger_callers, provide_context=True, dag=dag) run_sanger_callers_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(run_sanger_callers_task)
max_overview_level=MAX_OVERVIEW_LEVEL, task_id='gdal_addo_' + str(i), get_inputs_from=warp.task_id, dag=dag) addo_tasks.append(addo) upload = RSYNCOperator(task_id="upload_granule_{}_task".format(str(i)), host=CFG.rsync_hostname, remote_usr=CFG.rsync_username, ssh_key_file=CFG.rsync_ssh_key, remote_dir=S1GRD1SDV.repository_dir, get_inputs_from=addo.task_id, dag=dag) upload_tasks.append(upload) band_paths.set_upstream(zip_task) warp.set_upstream(band_paths) addo.set_upstream(warp) upload.set_upstream(addo) # Metadata Extraction task addo_task_ids = (task.task_id for task in addo_tasks) upload_task_ids = (task.task_id for task in upload_tasks) metadata_task = S1MetadataOperator( task_id="extract_metadata_task", product_safe_path=None, granules_paths=None, granules_upload_dir=S1GRD1SDV.repository_dir, processing_dir=S1GRD1SDV.process_dir, original_package_download_base_url=S1GRD1SDV. original_package_download_base_url,
args = {
    'owner': 'airflow',
    'start_date': datetime.now(),
}

dag = DAG(dag_id='my_first_dag', default_args=args, schedule_interval=None)


def print_context(i):
    print(i)
    return 'print_context succeeded for {}'.format(i)


parent = None
for i in range(10):
    '''
    Generating 10 tasks that print their index, chained one after another
    '''
    task = PythonOperator(
        task_id='print_the_context.{}'.format(i),
        python_callable=print_context,
        op_kwargs={'i': i},
        dag=dag)
    if parent:
        task.set_upstream(parent)
    parent = task
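# A compact alternative (sketch, assuming Airflow 1.10 where chain() lives in
# airflow.utils.helpers; in Airflow 2 it moved to airflow.models.baseoperator):
# the same linear chain of generated tasks can be wired with chain() instead of
# carrying the previous task in a variable.
from airflow.utils.helpers import chain

tasks = [
    PythonOperator(
        task_id='print_the_context.{}'.format(i),
        python_callable=print_context,
        op_kwargs={'i': i},
        dag=dag)
    for i in range(10)
]
chain(*tasks)  # print_the_context.0 -> print_the_context.1 -> ... -> print_the_context.9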
t1 = PythonOperator( task_id='clear_scrape_folder', python_callable=clear_folder, dag=dag) # TODO properly import python classes t2 = BashOperator( task_id='scrape_profile_images', bash_command='cd {} && scrapy crawl csgrad'.format(cspeople_scraper), dag=dag) t3 = PythonOperator( task_id='scrape_progress', python_callable=print_scrape_in_progress, dag=dag) t4 = BashOperator( task_id='create_landmarks', bash_command='cd {} && python landmark.py'.format(averageface_path), dag=dag) t5 = BashOperator( task_id='create_average_face', bash_command='cd {} && python averageface.py'.format(averageface_path), dag=dag) t2.set_upstream(t1) t3.set_upstream(t1) t4.set_upstream(t2) t4.set_upstream(t3) t5.set_upstream(t4)
default_args=default_args, schedule_interval=None, concurrency=10000, max_active_runs=2000) start_analysis_run_task = PythonOperator(task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) validate_sample_task = PythonOperator(task_id="validate_sample", python_callable=validate_sample, provide_context=True, dag=dag) validate_sample_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) for contig_name in tracker.util.workflow_common.CONTIG_NAMES: freebayes_task = PythonOperator(task_id="freebayes_" + contig_name, python_callable=run_freebayes, op_kwargs={"contig_name": contig_name}, provide_context=True, dag=dag) freebayes_task.set_upstream(validate_sample_task)
def done(**_kwargs): logging.info("Executing done step.") clone_op = PythonOperator(task_id='clone_repo', provide_context=True, python_callable=clone_repo, dag=dag) build_op = PythonOperator(task_id='build_images', provide_context=True, python_callable=build_images, dag=dag) build_op.set_upstream(clone_op) py_lint_op = PythonOperator(task_id='pylint', provide_context=True, python_callable=py_checks_gen("lint"), dag=dag) py_lint_op.set_upstream(clone_op) py_test_op = PythonOperator(task_id='pytest', provide_context=True, python_callable=py_checks_gen("test"), dag=dag) py_test_op.set_upstream(clone_op) setup_cluster_op = PythonOperator(task_id='setup_cluster', provide_context=True,
fetch_tweets = PythonOperator( task_id='fetch_tweets', python_callable=fetchtweets, dag=dag) # -------------------------------------------------------------------------------- # Clean the eight files. In this step you can get rid of or cherry pick columns # and different parts of the text # -------------------------------------------------------------------------------- clean_tweets = PythonOperator( task_id='clean_tweets', python_callable=cleantweets, dag=dag) clean_tweets.set_upstream(fetch_tweets) # -------------------------------------------------------------------------------- # In this section you can use a script to analyze the twitter data. Could simply # be a sentiment analysis through algorithms like bag of words or something more # complicated. You can also take a look at Web Services to do such tasks # -------------------------------------------------------------------------------- analyze_tweets = PythonOperator( task_id='analyze_tweets', python_callable=analyzetweets, dag=dag) analyze_tweets.set_upstream(clean_tweets) # --------------------------------------------------------------------------------
'retry_delay': timedelta(minutes=5), } dag = DAG("filter-vcf", default_args=default_args, schedule_interval=None, concurrency=20000, max_active_runs=20000) start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) filter_task = PythonOperator( task_id="filter_variants", python_callable=filter_variants, provide_context=True, dag=dag) filter_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(filter_task)
} }, { '$out': tmp_created_collection_per_hour_name }] results = db.logs.aggregate(pipeline) print("Aggregated hour metrics") return 'Whatever you return gets printed in the logs' run_this = PythonOperator(task_id='connect_to_mongodb_and_aggregate_day', provide_context=True, python_callable=connect_to_mongodb_and_aggregate_day, dag=dag) run_this_also = PythonOperator( task_id='connect_to_mongodb_and_aggregate_hour', provide_context=True, python_callable=connect_to_mongodb_and_aggregate_hour, dag=dag) run_this_also.set_upstream(run_this) send_email_notification_flow_successful = EmailOperator( task_id='send_email_notification_flow_successful', to="*****@*****.**", subject='custom email from airflow', html_content="{{ params['foo'](execution_date) }}", params=params, dag=dag) send_email_notification_flow_successful.set_upstream(run_this_also)
simple_search = PythonOperator(task_id='search_twitter', provide_context=True, python_callable=search_twitter, dag=dag, params={'query': '#python'}) move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite', provide_context=True, python_callable=csv_to_sqlite, dag=dag) id_popular = PythonOperator(task_id='identify_popular_links', provide_context=True, python_callable=identify_popular_links, dag=dag) email_links = EmailOperator(task_id='email_best_links', to='*****@*****.**', subject='Latest popular links', html_content='Check out the latest!!', files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)], dag=dag) simple_search.set_downstream(move_tweets_to_sqlite) id_popular.set_upstream(move_tweets_to_sqlite) email_links.set_upstream(id_popular)
html_content='Check out the latest!!', files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)], dag=dag) sub = SubDagOperator(subdag=subdag, task_id='insert_and_id_pop', trigger_rule='one_success', dag=dag) clear_latest = BashOperator(bash_command='rm -rf {}/latest_links.txt'.format( RAW_TWEET_DIR), task_id='clear_latest', dag=dag) gen_search_terms.set_upstream(fill_search_terms) for term in SEARCH_TERMS: term_without_punctuation = re.sub(r'\W+', '', term) simple_search = PythonOperator( task_id='search_{}_twitter'.format(term_without_punctuation), provide_context=True, python_callable=search_twitter, dag=dag, params={'query': term}) simple_search.set_upstream(gen_search_terms) simple_search.set_downstream(sub) sub.set_downstream(email_links) email_links.set_downstream(clear_latest)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("bcftools", default_args=default_args, schedule_interval=None, concurrency=20000, max_active_runs=20000) start_analysis_run_task = PythonOperator(task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) bcftools_task = PythonOperator(task_id="bcftools", python_callable=bcftools, provide_context=True, dag=dag) bcftools_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(bcftools_task)
task_id='setup_jobs', provide_context=True, python_callable=setup_jobs_fn, dag=dag) def collect_results_fn(ds, **kwargs): pprint(kwargs) print(ds) collect_results = PythonOperator( task_id='collect_results', provide_context=True, python_callable=collect_results_fn, dag=dag) for i in range(10): ''' Generating 10 sleeping task, sleeping from 0 to 9 seconds respectively ''' task = PythonOperator( task_id='sleep_for_'+str(i), python_callable=my_sleeping_function, op_kwargs={'random_base': float(i)/10}, dag=dag) task.set_upstream(setup_jobs) task.set_downstream(collect_results)
'email_on_retry': False } # Set concurrency and max_active_runs to 1, preventing more than one dag instance # from being created. dag = DAG(dag_name, default_args=task_args, concurrency=1, max_active_runs=1, schedule_interval=schedule_interval) get_env = PythonOperator( task_id='get-config-from-s3', python_callable=ConfigGetter(), dag=dag) set_variables = PythonOperator( task_id='set-variables', python_callable=BootStrapper(), dag=dag) cleanup = BashOperator( task_id='cleanup', bash_command=rm_config, trigger_rule='all_done', dag=dag) set_variables.set_upstream(get_env) cleanup.set_upstream(set_variables)
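# ConfigGetter and BootStrapper are defined elsewhere; PythonOperator only
# requires python_callable to be callable, so instances of classes implementing
# __call__ work just like plain functions. A hypothetical sketch of such a
# callable class (names and arguments are illustrative, not the real ones):
class ConfigGetter(object):

    def __init__(self, bucket='my-config-bucket', key='airflow/config.json'):
        self.bucket = bucket
        self.key = key

    def __call__(self):
        # fetch the config object from S3 and stash it where later tasks expect it
        print('fetching config from s3://{}/{}'.format(self.bucket, self.key))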
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("msisensor",
          default_args=default_args,
          schedule_interval=None,
          concurrency=10000,
          max_active_runs=2000)

start_analysis_run_task = PythonOperator(task_id="start_analysis_run",
                                         python_callable=start_analysis_run,
                                         provide_context=True,
                                         dag=dag)

msisensor_task = PythonOperator(task_id='msisensor',
                                python_callable=run_msisensor,
                                provide_context=True,
                                dag=dag)
msisensor_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
complete_analysis_run_task.set_upstream(msisensor_task)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("sanger_bwa", default_args=default_args, schedule_interval=None, concurrency=500, max_active_runs=500) start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) run_bwa_task = PythonOperator( task_id="run_bwa", python_callable=run_bwa, provide_context=True, dag=dag) run_bwa_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(run_bwa_task)
if table_exists: sqls = [drop_table, create_table, load_data] for i in sqls: redshift_call(i) else: sqls = [create_table, load_data] for i in sqls: redshift_call(i) postgres_to_local_csv = PythonOperator( task_id='postgres_to_local_csv', provide_context=True, python_callable=get_orders_with_bellhops, dag=dag) local_csv_to_s3 = PythonOperator( task_id='local_csv_to_s3', provide_context=True, python_callable=store_orders_with_bellhops, dag=dag) s3_to_redshift = PythonOperator( task_id='s3_to_redshift', provide_context=True, python_callable=transfer_orders_with_bellhops, dag=dag) local_csv_to_s3.set_upstream(postgres_to_local_csv) s3_to_redshift.set_upstream(local_csv_to_s3)
def print_hello_world(): print('this_should_print_hello_world from python') # Following are defaults which can be overridden later on default_args = { 'owner': 'Jackie G', 'depends_on_past': False, 'start_date': datetime(2016, 4, 15), 'email': ['jackies-email'], 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=1), } dag = DAG('Helloworld', default_args=default_args) t1 = BashOperator( task_id='hello_from_bash', bash_command='echo "Task 1 says hello"', dag=dag) t2 = PythonOperator( task_id='hello_from_python', python_callable=print_hello_world, dag=dag) t2.set_upstream(t1)
cluster_id="{{ ti.xcom_pull(key="emr_cluster_id", task_ids="clean_emr_id") }}" echo $cluster_id aws emr add-steps --cluster-id $cluster_id --steps Type=spark,Name=pyspark_job,\ Jar="command-runner.jar",\ Args=[\ --deploy-mode,client,\ s3://$bc/sparky.py\ ],ActionOnFailure=TERMINATE_CLUSTER ''' # .format(bucket_pyton, driver_cores, driver_memory, executor_memory, executor_cores) start_emr = BashOperator(task_id='start_emr', bash_command=start_emr, provide_context=True, xcom_push=True, params={"bucket_log": bucket_log}, dag=dag) clean_emr_id = PythonOperator(task_id='clean_emr_id', python_callable=parse_emr_id, provide_context=True, dag=dag) add_step = BashOperator(task_id='add_step', bash_command=add_step, provide_context=True, dag=dag) add_step.set_upstream(clean_emr_id) clean_emr_id.set_upstream(start_emr)
t2 = PythonOperator(task_id='Twitter_Authorisation', python_callable=maintts, dag=dag)
t3 = PythonOperator(task_id='Fetching_Data', python_callable=fetchsamples, dag=dag)
t4 = PythonOperator(task_id='RSA_Key_256SHA', python_callable=rsakey, dag=dag)
t5 = PythonOperator(task_id='Directory_List', python_callable=dirlis, dag=dag)
t6 = PythonOperator(task_id='Face_Detection', python_callable=faceid, dag=dag)
t7 = PythonOperator(task_id='Video_Capture', python_callable=vidcap, dag=dag)
t8 = PythonOperator(task_id='Read_Validate_Json', python_callable=read, dag=dag)
t9 = PythonOperator(task_id='TOP_10_Places', python_callable=top_10, dag=dag)
tf = PythonOperator(task_id='End_Point', python_callable=end, dag=dag)

t2.set_upstream(t1)
t4.set_upstream(t1)
t3.set_upstream(t2)
t5.set_upstream(t4)
t8.set_upstream(t3)
t9.set_upstream(t8)
t6.set_upstream(t5)
t7.set_upstream(t6)
# set_upstream takes a single task or a list of tasks, not two positional arguments
tf.set_upstream([t7, t9])
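# The same dependencies expressed with the bitshift operators (supported since
# Airflow 1.8); use either this form or the set_upstream calls above, not both:
t1 >> t2 >> t3 >> t8 >> t9
t1 >> t4 >> t5 >> t6 >> t7
[t7, t9] >> tf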
dag = DAG("example", default_args=default_args, schedule_interval=timedelta(1)) # t1, t2 and t3 are examples of tasks created by instantiating operators t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag) t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag) templated_command = """ {% for i in range(5) %} echo "{{ ds }}" echo "{{ macros.ds_add(ds, 7)}}" echo "{{ params.my_param }}" {% endfor %} """ t3 = BashOperator( task_id="templated", bash_command=templated_command, params={"my_param": "Parameter I passed in"}, dag=dag, ) t4 = PythonOperator(task_id="python_code", python_callable=example_function, dag=dag) t2.set_upstream(t1) t3.set_upstream(t1) t4.set_upstream(t1)
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("pcawg_bwa", default_args=default_args, schedule_interval=None, concurrency=50, max_active_runs=50) start_analysis_run_task = PythonOperator(task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) run_bwa_task = PythonOperator(task_id="run_bwa", python_callable=run_bwa, provide_context=True, dag=dag) run_bwa_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(run_bwa_task)
def connect_to_mongodb_and_aggregate_hour(ds, **kwargs): db = MongoClient().test tmp_created_collection_per_hour_name = 'page_per_hour_hits_tmp'; pipeline = [{"$project":{'page': '$PAGE', 'time': { 'y': {'$year':'$DATE'} , 'm':{'$month':'$DATE'}, 'day':{'$dayOfMonth':'$DATE'}, 'h':{'$hour':'$DATE'}}}}, {'$group':{'_id':{'p':'$page','y':'$time.y','m':'$time.m','d':'$time.day', 'h':'$time.h'}, 'hourly':{'$sum':1}}},{'$out': tmp_created_collection_per_hour_name}] results = db.logs.aggregate(pipeline) print("Aggregated hour metrics") return 'Whatever you return gets printed in the logs' run_this = PythonOperator( task_id='connect_to_mongodb_and_aggregate_day', provide_context=True, python_callable=connect_to_mongodb_and_aggregate_day, dag=dag) run_this_also = PythonOperator( task_id='connect_to_mongodb_and_aggregate_hour', provide_context=True, python_callable=connect_to_mongodb_and_aggregate_hour, dag=dag) run_this_also.set_upstream(run_this) send_email_notification_flow_successful = EmailOperator( task_id='send_email_notification_flow_successful', to="*****@*****.**", subject='custom email from airflow', html_content="{{ params['foo'](execution_date) }}", params=params, dag=dag) send_email_notification_flow_successful.set_upstream(run_this_also)
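# params is defined earlier in this DAG file and is not shown here; for the
# html_content template above to render, it must map 'foo' to a callable that
# Jinja invokes with the execution_date. A hypothetical sketch:
params = {
    'foo': lambda execution_date: 'Aggregation flow succeeded for run {}'.format(execution_date),
}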
""" Simple subdag example """ from airflow import DAG from airflow.operators import PythonOperator from twitter_airflow import csv_to_sqlite, identify_popular_links from datetime import datetime, timedelta default_args = { 'owner': 'admin', 'depends_on_past': False, 'start_date': datetime(2016, 1, 1), 'retries': 1, 'retry_delay': timedelta(minutes=5), } subdag = DAG('generate_twitter_dags.insert_and_id_pop', default_args=default_args) move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite', provide_context=True, python_callable=csv_to_sqlite, dag=subdag) id_popular = PythonOperator(task_id='identify_popular_links', provide_context=True, python_callable=identify_popular_links, dag=subdag, params={'write_mode': 'a'}) id_popular.set_upstream(move_tweets_to_sqlite)
start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) validate_sample_task = PythonOperator( task_id="validate_sample", python_callable=validate_sample, provide_context=True, dag=dag) validate_sample_task.set_upstream(start_analysis_run_task) delly_task = PythonOperator( task_id="delly_genotype", python_callable=run_delly, provide_context=True, dag=dag) delly_task.set_upstream(validate_sample_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag)
def my_sleeping_function(random_base): '''This is a function that will run within the DAG execution''' time.sleep(random_base) def connect_to_mongodb_and_print(ds, **kwargs): db = MongoClient().zips buildinfo = db.command("buildinfo") print(buildinfo) return 'Whatever you return gets printed in the logs' run_this = PythonOperator( task_id='connect_to_mongodb_and_print', provide_context=True, python_callable=connect_to_mongodb_and_print, dag=dag) for i in range(10): ''' Generating 10 sleeping task, sleeping from 0 to 9 seconds respectively ''' task = PythonOperator( task_id='sleep_for_'+str(i), python_callable=my_sleeping_function, op_kwargs={'random_base': i}, dag=dag) task.set_upstream(run_this)
if res is not None and len(res) > 0: category_id = res[0] sql = """ insert into alpha.notecard_categories (notecard_id, category_id) values (%s,%s) """ cur.execute(sql, (notecard_id, category_id)) conn.commit() # move file to process folder upon completion shutil.move(os.path.join(filepath, file), os.path.join(destination, file)) return True populate_task = PythonOperator( task_id='populate_csv', provide_context=True, depends_on_past=True, python_callable=populate_db, dag=dag) # trigger = TriggerDagRunOperator( # task_id='trigger_dag_rerun', # trigger_dag_id=task_name, # dag=dag) populate_task.set_upstream(sensor_task)
'''This is a function that will run within the DAG execution''' time.sleep(random_base) def connect_to_monary_and_print_aggregation(ds, **kwargs): m = Monary() pipeline = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}] states, population = m.aggregate("zips", "data", pipeline, ["_id", "totPop"], ["string:2", "int64"]) strs = list(map(lambda x: x.decode("utf-8"), states)) result = list("%s: %d" % (state, pop) for (state, pop) in zip(strs, population)) print (result) return 'Whatever you return gets printed in the logs' run_this = PythonOperator( task_id='connect_to_monary_and_print_aggregation', provide_context=True, python_callable=connect_to_monary_and_print_aggregation, dag=dag) for i in range(10): ''' Generating 10 sleeping task, sleeping from 0 to 9 seconds respectively ''' task = PythonOperator( task_id='sleep_for_'+str(i), python_callable=my_sleeping_function, op_kwargs={'random_base': i}, dag=dag) task.set_upstream(run_this)
default_args=default_args, schedule_interval=None, concurrency=50, max_active_runs=50) start_analysis_run_task = PythonOperator(task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) metadata_task = PythonOperator(task_id="prepare_metadata", python_callable=prepare_metadata, provide_context=True, dag=dag) metadata_task.set_upstream(start_analysis_run_task) cgsubmit_task = PythonOperator(task_id="submit_metadata", python_callable=submit_metadata, provide_context=True, dag=dag) cgsubmit_task.set_upstream(metadata_task) gtupload_task = PythonOperator(task_id="upload_sample", python_callable=upload_sample, provide_context=True, dag=dag) gtupload_task.set_upstream(cgsubmit_task)
'out_dirpath': './openeo_job/result/' }, { 'name': 'save_raster' }, { 'name': 'get_cube_metadata' }, { 'name': 'to_pickle', 'filepath': './openeo_job/result/save_13.dc;str' }] }, queue='process') nir_2.set_upstream([dc_0]) red_3.set_upstream([dc_0]) blue_4.set_upstream([dc_0]) sub_5.set_upstream([nir_2, red_3]) p1_6.set_upstream([red_3]) p2_7.set_upstream([blue_4]) sum_8.set_upstream([nir_2, p1_6, p2_7]) div_9.set_upstream([sub_5, sum_8])
'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } dag = DAG("sanger_variant_calling", default_args=default_args, schedule_interval=None, concurrency=500, max_active_runs=500) start_analysis_run_task = PythonOperator( task_id="start_analysis_run", python_callable=start_analysis_run, provide_context=True, dag=dag) run_sanger_callers_task = PythonOperator( task_id="run_sanger_callers", python_callable=run_sanger_callers, provide_context=True, dag=dag) run_sanger_callers_task.set_upstream(start_analysis_run_task) complete_analysis_run_task = PythonOperator( task_id="complete_analysis_run", python_callable=complete_analysis_run, provide_context=True, dag=dag) complete_analysis_run_task.set_upstream(run_sanger_callers_task)