default_args = {
    'start_date': datetime.datetime(2020, 1, 1),  # zero-padded literals (01) are a syntax error in Python 3
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("msisensor", default_args=default_args, schedule_interval=None,
          concurrency=10000, max_active_runs=2000)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

msisensor_task = PythonOperator(
    task_id='msisensor',
    python_callable=run_msisensor,
    provide_context=True,
    dag=dag)
msisensor_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
from datetime import datetime

from airflow import DAG
from airflow.operators.python_operator import PythonOperator

dag = DAG(
    dag_id='my_first_dag',
    start_date=datetime(2020, 10, 31),
    schedule_interval='0 2 * * *')

def print_hello():
    print("hello!")
    return "hello!"

def print_goodbye():
    print("goodbye!")
    return "goodbye!"

# Use distinct task names so the operators don't shadow the callables above.
print_hello_task = PythonOperator(
    task_id='print_hello',
    # python_callable param points to the function you want to run
    python_callable=print_hello,
    # dag param points to the DAG that this task is a part of
    dag=dag)

print_goodbye_task = PythonOperator(
    task_id='print_goodbye',
    python_callable=print_goodbye,
    dag=dag)

# Assign the order of the tasks in our DAG
print_hello_task >> print_goodbye_task
dag = DAG(
    dag_id='week5_2',
    start_date=datetime(2020, 11, 8),  # won't run if the start date is in the future
    schedule_interval='@once',  # adjust as appropriate
    max_active_runs=1,
    concurrency=2,
    catchup=False
)

dir_path = os.path.dirname(os.path.realpath(__file__))
print(dir_path)
print(os.getcwd())

prev_task = PythonOperator(
    task_id='create_tables',
    python_callable=create_tables,
    dag=dag)

for table in tables:
    s3_key = s3_key_prefix + '/' + table + '.tsv'
    postgrestos3 = PostgresToS3Operator(
        table="public." + table,
        s3_bucket=s3_bucket,
        s3_key=s3_key,
        data_dir=local_dir,
        dag=dag,
        task_id="Postgres_to_S3" + "_" + table
    )
    s3toredshift = S3ToRedshiftOperator(
task_args = {
    # ...
    'email_on_retry': False
}

# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args, concurrency=1, max_active_runs=1,
          schedule_interval=schedule_interval)

get_env = PythonOperator(
    task_id='get-config-from-s3',
    python_callable=ConfigGetter(),
    dag=dag)

set_variables = PythonOperator(
    task_id='set-variables',
    python_callable=BootStrapper(),
    dag=dag)

cleanup = BashOperator(
    task_id='cleanup',
    bash_command=rm_config,
    trigger_rule='all_done',
    dag=dag)

set_variables.set_upstream(get_env)
cleanup.set_upstream(set_variables)
def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)

def connect_to_monary_and_print_aggregation(ds, **kwargs):
    m = Monary()
    pipeline = [{"$group": {"_id": "$state", "totPop": {"$sum": "$pop"}}}]
    states, population = m.aggregate("zips", "data", pipeline,
                                     ["_id", "totPop"], ["string:2", "int64"])
    strs = list(map(lambda x: x.decode("utf-8"), states))
    result = list("%s: %d" % (state, pop) for (state, pop) in zip(strs, population))
    print(result)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_monary_and_print_aggregation',
    provide_context=True,
    python_callable=connect_to_monary_and_print_aggregation,
    dag=dag)

for i in range(10):
    '''Generating 10 sleeping tasks, sleeping from 0 to 9 seconds respectively'''
    task = PythonOperator(
        task_id='sleep_for_' + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': i},
        dag=dag)
    task.set_upstream(run_this)
                              auth=basic_auth("neo4j", "password"))
    session = driver.session()
    result = session.run(query_code, inputs)
    ret = {x: [] for x in output_vars}  # ret is used below, so it must be initialized (was commented out)
    try:
        for record in result:
            print(record)
            for out in output_vars:
                if isinstance(record[out], bytes):
                    ret[out].append(record[out].decode("utf-8"))
                else:
                    ret[out].append(record[out])
    except Exception:
        print("Came into except")
    if query_type == "mongoDB":
        if query_code == "mp_ht_in_total":
            print(inputs["num"])
            ret = mongoQuery.mp_ht_in_total(limit=inputs["num"])
            print(ret)
    for k, v in ret.items():
        context['task_instance'].xcom_push(k, v)
    print("========================================")
    return ret

task_0 = PythonOperator(
    task_id='node_{}'.format("n1"),
    python_callable=execute_query,
    op_kwargs={'node_name': "n1"},
    provide_context=True,
    dag=dag)
}

dag = DAG("delly", default_args=default_args, schedule_interval=None,
          concurrency=10000, max_active_runs=2000)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

validate_sample_task = PythonOperator(
    task_id="validate_sample",
    python_callable=validate_sample,
    provide_context=True,
    dag=dag)
validate_sample_task.set_upstream(start_analysis_run_task)

delly_task = PythonOperator(
    task_id="delly_genotype",
    python_callable=run_delly,
    provide_context=True,
    dag=dag)
delly_task.set_upstream(validate_sample_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG(
    'example_twitter_dag',
    default_args=default_args,
    schedule_interval="@daily")

# --------------------------------------------------------------------------------
# This task should call the Twitter API and retrieve yesterday's incoming and
# outgoing tweets for the four Twitter users (Twitter_A, ..., Twitter_D). It should
# generate eight CSV output files; the naming convention is
# direction(from or to)_twitterHandle_date.csv
# --------------------------------------------------------------------------------

fetch_tweets = PythonOperator(
    task_id='fetch_tweets',
    python_callable=fetchtweets,
    dag=dag)

# --------------------------------------------------------------------------------
# Clean the eight files. In this step you can drop or cherry-pick columns and
# different parts of the text.
# --------------------------------------------------------------------------------

clean_tweets = PythonOperator(
    task_id='clean_tweets',
    python_callable=cleantweets,
    dag=dag)

clean_tweets.set_upstream(fetch_tweets)

# --------------------------------------------------------------------------------
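# The fetchtweets callable itself is not shown; a minimal sketch of the naming
# convention described above (the handle list and the API call are assumptions):
from datetime import date, timedelta

def fetchtweets():
    handles = ["Twitter_A", "Twitter_B", "Twitter_C", "Twitter_D"]
    yesterday = (date.today() - timedelta(days=1)).isoformat()
    for handle in handles:
        for direction in ("from", "to"):
            filename = "{}_{}_{}.csv".format(direction, handle, yesterday)
            # ...call the Twitter API here and write the rows to `filename`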
print("second") def third(): print("third") with DAG('demo_1', description='123', schedule_interval=None, start_date=datetime(2018, 1, 1), catchup=False) as dag: first = PythonOperator( task_id='first', python_callable=first, dag=dag, ) second = PythonOperator( task_id='second', python_callable=second, dag=dag, ) third = PythonOperator( task_id='third', python_callable=third, dag=dag, )
    dag=dag,
    table="trips",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket="udac-data-pipelines",
    s3_key="divvy/partitioned/{execution_date.year}/{execution_date.month}/divvy_trips.csv"
)

#
# TODO: Replace this data quality check with the HasRowsOperator
#
check_trips = PythonOperator(
    task_id='check_trips_data',
    dag=dag,
    python_callable=check_greater_than_zero,
    provide_context=True,
    params={
        'table': 'trips',
    })

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql_statements.CREATE_STATIONS_TABLE_SQL,
)

copy_stations_task = S3ToRedshiftOperator(
    task_id="load_stations_from_s3_to_redshift",
    dag=dag,
    redshift_conn_id="redshift",
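# The TODO above asks for a HasRowsOperator. A minimal sketch of such a custom
# operator (the class name comes from the TODO; the exact failure behavior is
# an assumption):
from airflow.hooks.postgres_hook import PostgresHook
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults

class HasRowsOperator(BaseOperator):
    @apply_defaults
    def __init__(self, redshift_conn_id="", table="", *args, **kwargs):
        super(HasRowsOperator, self).__init__(*args, **kwargs)
        self.redshift_conn_id = redshift_conn_id
        self.table = table

    def execute(self, context):
        redshift = PostgresHook(self.redshift_conn_id)
        records = redshift.get_records("SELECT COUNT(*) FROM {}".format(self.table))
        if not records or not records[0] or records[0][0] < 1:
            raise ValueError("Data quality check failed: {} has no rows".format(self.table))
        self.log.info("Data quality check on {} passed".format(self.table))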
end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline', dag=dag)

pg_unload = PostgresOperator(
    dag=dag,
    task_id='pg_unload',
    sql=unload_user_purchase,
    postgres_conn_id='postgres_default',
    params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
    depends_on_past=True,
    wait_for_downstream=True)

user_purchase_to_s3_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_s3_stage',
    python_callable=_local_to_s3,
    op_kwargs={
        'filename': temp_filtered_user_purchase,
        'key': temp_filtered_user_purchase_key,
    },
)

remove_local_user_purchase_file = PythonOperator(
    dag=dag,
    task_id='remove_local_user_purchase_file',
    python_callable=remove_local_file,
    op_kwargs={
        'filelocation': temp_filtered_user_purchase,
    },
)

movie_review_to_s3_stage = PythonOperator(
    fetch_logs=True,
    tags='airflow_example_run',
    dag=dag)

t2 = QuboleOperator(
    task_id='hive_s3_location',
    command_type="hivecmd",
    script_location="s3n://dev.canopydata.com/airflow/show_table.hql",
    notify=True,
    tags=['tag1', 'tag2'],
    trigger_rule="all_done",
    dag=dag)

t3 = PythonOperator(
    task_id='compare_result',
    provide_context=True,
    python_callable=compare_result,
    trigger_rule="all_done",
    dag=dag)

t3.set_upstream(t1)
t3.set_upstream(t2)

options = ['hadoop_jar_cmd', 'presto_cmd', 'db_query', 'spark_cmd']

branching = BranchPythonOperator(
    task_id='branching',
    python_callable=lambda: random.choice(options),
    dag=dag)
branching.set_upstream(t3)
""" Simple subdag example """ from airflow import DAG from airflow.operators import PythonOperator from twitter_airflow import csv_to_sqlite, identify_popular_links from datetime import datetime, timedelta default_args = { 'owner': 'admin', 'depends_on_past': False, 'start_date': datetime(2016, 1, 1), 'retries': 1, 'retry_delay': timedelta(minutes=5), } subdag = DAG('generate_twitter_dags.insert_and_id_pop', default_args=default_args) move_tweets_to_sqlite = PythonOperator(task_id='csv_to_sqlite', provide_context=True, python_callable=csv_to_sqlite, dag=subdag) id_popular = PythonOperator(task_id='identify_popular_links', provide_context=True, python_callable=identify_popular_links, dag=subdag, params={'write_mode': 'a'}) id_popular.set_upstream(move_tweets_to_sqlite)
}

dag = DAG("freebayes", default_args=default_args, schedule_interval=None,
          concurrency=10000, max_active_runs=2000)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

validate_sample_task = PythonOperator(
    task_id="validate_sample",
    python_callable=validate_sample,
    provide_context=True,
    dag=dag)
validate_sample_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)

for contig_name in tracker.util.workflow_common.CONTIG_NAMES:
    freebayes_task = PythonOperator(
        task_id="freebayes_" + contig_name,
        python_callable=run_freebayes,
    message = f'''Report for ad 121288 for April 2
Spend: {df.Total_cost[1]} RUB ({Total_cost_diff} %)
Impressions: {df.view[1]} ({view_diff} %)
Clicks: {df.click[1]} ({click_diff} %)
CTR: {df.CTR[1]} ({CTR_diff} %)'''
    token = '66117ac9424a6b67d404d24a1cb0fcfec6a150abeb21fb62b1edea6ddda943c35975ca53f7084347b094c'
    vk_session = vk_api.VkApi(token=token)
    vk = vk_session.get_api()
    vk.messages.send(user_id='7768141', random_id=2, message=message)

t1 = PythonOperator(task_id='send_vk_report_task',
                    python_callable=send_vk_report,
                    dag=dag)

'''
1. Read the csv file at this link using the pandas library:
https://docs.google.com/spreadsheets/d/e/2PACX-1vR-ti6Su94955DZ4Tky8EbwifpgZf_dTjpBdiVH0Ukhsq94jZdqoHuUytZsFZKfwpXEUCKRFteJRc9P/pub?gid=889004448&single=true&output=csv
2. The data describes the events that happened to ad 121288 over two days.
Compute the following metrics for each day:
- number of impressions
- number of clicks
- CTR
- total money spent
That is, for each metric you should get two numbers: one for 2019-04-01 and one for 2019-04-02.
The money spent can be computed with the following formula: divide the value in the
ad_cost column by 1000 and multiply by the number of times the ad was shown.
'''
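# A minimal sketch of the assignment above, assuming the CSV has `date`,
# `event` ('view'/'click') and `ad_cost` columns (the column names are assumptions):
import pandas as pd

URL = ('https://docs.google.com/spreadsheets/d/e/2PACX-1vR-ti6Su94955DZ4Tky8EbwifpgZf'
       '_dTjpBdiVH0Ukhsq94jZdqoHuUytZsFZKfwpXEUCKRFteJRc9P/pub'
       '?gid=889004448&single=true&output=csv')
df = pd.read_csv(URL, parse_dates=['date'])

# Impressions and clicks per day.
daily = df.groupby([df.date.dt.date, 'event']).size().unstack(fill_value=0)
daily['CTR'] = daily['click'] / daily['view']
# Spend per the stated formula: ad_cost / 1000 * number of impressions.
daily['total_cost'] = df.groupby(df.date.dt.date)['ad_cost'].first() / 1000 * daily['view']
print(daily)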
# Task 3 git clone project
bash_git = BashOperator(
    task_id='bash_git',
    bash_command='pushd /home/admin/gta_scripts && ./git_clone.sh '
                 '{{dag_run.conf["git_dire"]}} {{dag_run.conf["git_url"]}} ')

# Task 4 change branch of git
bash_ch_branch = BashOperator(
    task_id='bash_ch_branch',
    bash_command='pushd /home/admin/gta_scripts && ./change_branch.sh '
                 '{{dag_run.conf["pro_dire"]}} {{dag_run.conf["git_branch"]}} ')

# Task 5 give status of git clone
# (the BashOperator version is superseded by the PythonOperator below)
# git_status = BashOperator(
#     task_id='git_status',
#     bash_command='pushd /home/admin/gta_scripts && python db_status_git.py -i '
#                  '{{dag_run.conf["user_id"]}} {{dag_run.conf["state_git"]}} '
#                  '{{dag_run.conf["status_git"]}}')
git_status = PythonOperator(
    task_id='git_status',
    provide_context=True,
    python_callable=write_db_git)

# Task 6 ping the server
ping_ser = BashOperator(
    task_id='ping_ser',
    bash_command='pushd /home/admin/gta_scripts && ./ping.sh ')

# Task 7 params update
params_update = BashOperator(
    task_id='params_update',
    bash_command='pushd /home/admin/gta_scripts && python sutas_params_update.py -i '
                 '{{dag_run.conf["raisebugs"]}} {{dag_run.conf["jiraenv"]}} '
                 '{{dag_run.conf["loglevel"]}} {{dag_run.conf["slack"]}} '
                 '{{dag_run.conf["emailnotifications"]}} {{dag_run.conf["symmetrickey"]}} '
                 '{{dag_run.conf["teamsnotifications"]}} {{dag_run.conf["consolidatedmail"]}} '
                 '{{dag_run.conf["enabledatebase"]}} {{dag_run.conf["enabletestmanagement"]}} '
                 '{{dag_run.conf["enablepushtestartifacts"]}} -d {{dag_run.conf["ref_path"]}} ')

docker_params_update = PythonOperator(task_id='docker_params_update',
dag6_task1 = DummyOperator(
    task_id='test_depends_on_past',
    depends_on_past=True,
    dag=dag6,)
dag6_task2 = DummyOperator(
    task_id='test_depends_on_past_2',
    depends_on_past=True,
    dag=dag6,)
dag6_task2.set_upstream(dag6_task1)

# DAG tests that a deadlocked subdag is properly caught
dag7 = DAG(dag_id='test_subdag_deadlock', default_args=default_args)
subdag7 = DAG(dag_id='test_subdag_deadlock.subdag', default_args=default_args)
subdag7_task1 = PythonOperator(
    task_id='test_subdag_fail',
    dag=subdag7,
    python_callable=fail)
subdag7_task2 = DummyOperator(
    task_id='test_subdag_dummy_1',
    dag=subdag7,)
subdag7_task3 = DummyOperator(
    task_id='test_subdag_dummy_2',
    dag=subdag7)
dag7_subdag1 = SubDagOperator(
    task_id='subdag',
    dag=dag7,
    subdag=subdag7)
subdag7_task1.set_downstream(subdag7_task2)
subdag7_task2.set_downstream(subdag7_task3)

# DAG tests that queued tasks are run
def process_utilization_kpi(
        parent_dag_name, child_dag_name, start_date, schedule_interval,
        celery_queue, ss_tech_sites, hostnames_ss_per_site, ss_name,
        utilization_attributes, config_sites):
    # here config_sites is the list of all sites in the system_config variable
    utilization_kpi_subdag_dag = DAG(
        dag_id="%s.%s" % (parent_dag_name, child_dag_name),
        schedule_interval=schedule_interval,
        start_date=start_date,
    )

    for service in utilization_attributes:
        sv_to_ds_mapping[service.get("service_name")] = {
            "data_source": service.get("data_source"),
            "sector_type": service.get("sector_type")
        }

    def get_calculated_ss_data():
        ss_data = redis_hook_util_10.rget("calculated_ss_utilization_kpi")
        combined_site_data = {}
        for site_data in ss_data:
            site_data = eval(site_data)
            combined_site_data.update(site_data)
        return combined_site_data

    # To create the SS dict
    def format_data(**kwargs):
        device_type = kwargs.get("params").get("technology")
        utilization_attributes = kwargs.get("params").get("attributes")
        machine_name = kwargs.get("params").get("machine_name")
        ss_kpi_dict = {
            'site_name': 'unknown', 'device_name': 'unknown',
            'service_name': 'unknown', 'ip_address': 'unknown',
            'severity': 'unknown', 'age': 'unknown',
            'data_source': 'unknown', 'current_value': 'unknown',
            'warning_threshold': 'unknown', 'critical_threshold': 'unknown',
            'check_timestamp': 'unknown', 'sys_timestamp': 'unknown',
            'refer': 'unknown', 'min_value': 'unknown',
            'max_value': 'unknown', 'avg_value': 'unknown',
            'machine_name': 'unknown'
        }
        ss_data = redis_hook_util_10.rget(
            "calculated_utilization_%s_%s" % (device_type, machine_name))
        # Rewind the time to the previous multiple of 5 minutes so that the
        # KPI can be shown accordingly.
        cur_processing_time = backtrack_x_min(time.time(), 300) + 120
        ss_devices_list = []
        for ss_device in ss_data:
            ss_device = eval(ss_device)
            hostname = ss_device.get('hostname')
            for service in ss_device.get('services'):
                data_source = sv_to_ds_mapping.get(service).get("data_source")
                pmp_type = sv_to_ds_mapping.get(service).get("sector_type")
                thresholds = get_severity_values(service)
                ss_kpi_dict['critical_threshold'] = thresholds[0]
                ss_kpi_dict['data_source'] = data_source
                ss_kpi_dict['site_name'] = ss_device.get('site')
                # TODO: 'ok' and 'unknown' are the only two severities for SS;
                # we can include this in the rules later.
                ss_kpi_dict['service_name'] = service
                ss_kpi_dict['machine_name'] = machine_name
                ss_kpi_dict['check_timestamp'] = cur_processing_time
                ss_kpi_dict['device_name'] = ss_device.get('hostname')
                ss_kpi_dict['sys_timestamp'] = cur_processing_time
                ss_kpi_dict['refer'] = ss_device.get("%s_sector" % (pmp_type))
                ss_kpi_dict['ip_address'] = ss_device.get('ipaddress')
                ss_kpi_dict['warning_threshold'] = thresholds[1]
                if not isinstance(ss_device.get(service), dict):
                    # Cap the current value if it is greater than 100.
                    cur_value = ss_device.get(service)
                    if cur_value:
                        try:
                            # (the original tested the misspelled, undefined `curr_value` here)
                            if isinstance(cur_value, float) and cur_value > 100.00:
                                cur_value = 100
                        except Exception:
                            logging.error("Exception while handling above 100 entries")
                    ss_kpi_dict['severity'] = calculate_severity(service, ss_device.get(service))
                    ss_kpi_dict['age'] = calculate_age(
                        hostname, ss_kpi_dict['severity'], ss_device.get('device_type'),
                        cur_processing_time, service)
                    ss_kpi_dict['current_value'] = cur_value
                    ss_kpi_dict['avg_value'] = cur_value
                    ss_kpi_dict['min_value'] = cur_value
                    ss_kpi_dict['max_value'] = cur_value
                    if ss_kpi_dict['current_value'] is not None:
                        ss_devices_list.append(ss_kpi_dict.copy())
                else:
                    for data_source in ss_device.get(service):
                        ds_values = ss_device.get(service)
                        curr_value = ss_device.get(service).get(data_source)
                        if isinstance(curr_value, str):
                            try:
                                curr_value = float(curr_value)
                                if isinstance(curr_value, float) and curr_value > 100.00:
                                    curr_value = 100
                            except Exception:
                                logging.error("Unable to convert to float")
                        else:
                            if curr_value > 100.00:
                                curr_value = 100
                        ss_kpi_dict['data_source'] = data_source
                        ss_kpi_dict['severity'] = calculate_severity(service, ds_values.get(data_source))
                        ss_kpi_dict['age'] = calculate_age(
                            hostname, ss_kpi_dict['severity'], ss_device.get('device_type'),
                            cur_processing_time, service)
                        ss_kpi_dict['current_value'] = curr_value
                        ss_kpi_dict['avg_value'] = curr_value
                        ss_kpi_dict['min_value'] = curr_value
                        ss_kpi_dict['max_value'] = curr_value
                        if ss_kpi_dict['current_value'] is not None:
                            ss_devices_list.append(ss_kpi_dict.copy())
        try:
            if len(ss_devices_list) > 0:
                redis_hook_util_10.rpush(
                    "formatted_util_%s_%s" % (device_type, machine_name),
                    ss_devices_list)
            else:
                logging.info("No %s device found in %s after formatting"
                             % (device_type, machine_name))
        except Exception:
            logging.error("Unable to push formatted SS data to redis")

    def get_required_data_ss(**kwargs):
        site_name = kwargs.get("params").get("site_name")
        device_type = kwargs.get("params").get("technology")
        utilization_attributes = kwargs.get("params").get("attributes")
        if "vrfprv" in site_name:
            memc_con = vrfprv_memc_con
        elif "pub" in site_name:
            memc_con = pub_memc_con
        else:
            memc_con = memc_con_cluster
        ss_data_dict = {}
        all_ss_data = []
        if site_name not in hostnames_ss_per_site.keys():
            logging.warning("No SS devices found for %s" % (site_name))
            return 1
        for hostnames_dict in hostnames_ss_per_site.get(site_name):
            host_name = hostnames_dict.get("hostname")
            ip_address = hostnames_dict.get("ip_address")
            ss_data_dict['hostname'] = host_name
            ss_data_dict['ipaddress'] = ip_address
            ss_data_dict['site_name'] = site_name
            if host_name not in down_and_unresponsive_devices:
                for service in utilization_attributes:
                    ss_data_dict[service.get('service_name')] = memc_con.get(
                        service.get('utilization_key') % (host_name))
                all_ss_data.append(ss_data_dict.copy())
        if len(all_ss_data) == 0:
            logging.info("No data fetched! Aborting successfully")
            return 0
        try:
            # redis_hook_util_10.rpush("%s_%s" % (device_type, site_name), all_ss_data)
            print("++++++++++++")
            print(site_name.split("_")[0])
            redis_hook_util_10.rpush(
                "%s_%s" % (device_type, site_name.split("_")[0]), all_ss_data)
        except Exception:
            logging.warning("Unable to insert ss data into redis")
        # pprint(all_ss_data)

    def calculate_utilization_data_ss(**kwargs):
        machine_name = kwargs.get("params").get("machine_name")
        device_type = kwargs.get("params").get("technology")
        utilization_attributes = kwargs.get("params").get("attributes")
        devices_data_dict = redis_hook_util_10.rget(
            "%s_%s" % (device_type, machine_name))
        if len(devices_data_dict) == 0:
            logging.info("No data found for ss %s" % (machine_name))
            return 1
        ss_data = []
        for devices in devices_data_dict:
            devices = eval(devices)
            site_name = devices.get("site_name")
            devices['site'] = site_name
            devices['device_type'] = device_type
            for service_attributes in utilization_attributes:
                # loop over all the configured services
                service = service_attributes.get('service_name')
                if service_attributes.get('isKpi'):
                    if 'services' in devices.keys() and devices.get('services') is not None:
                        devices.get('services').append(service)
                    elif service and devices.get('services') is None:
                        devices['services'] = [service]
                    else:
                        devices['services'] = []
                if service_attributes.get('isKpi'):
                    utilization_type = service_attributes.get("utilization_type")
                    capacity = None
                    if "capacity" in service_attributes.keys():
                        capacity = service_attributes.get("capacity")
                    try:
                        formula = kpi_rules.get(service).get('formula')
                        devices[service] = eval(formula)
                    except Exception:
                        print("Exception in calculating data")
                else:
                    continue
            # ip_ul_mapper[devices.get('ipaddress')] = devices
            ss_data.append(devices.copy())
        # ss_utilization_list.append(ip_ul_mapper.copy())
        key = "calculated_utilization_%s_%s" % (device_type, machine_name)
        redis_hook_util_10.rpush(key, ss_data)
        print("Setting .....")
        print("calculated_utilization_%s_%s" % (device_type, machine_name))
        # redis_hook_util_10.rpush("calculated_ss_utilization_kpi", ss_utilization_list)

    def aggregate_utilization_data(*args, **kwargs):
        print("Aggregating Data")
        machine_name = kwargs.get("params").get("machine_name")
        device_type = kwargs.get("params").get("technology")
        # device_type = kwargs.get("params").get("device_type")
        formatted_data = redis_hook_util_10.rget(
            "formatted_util_%s_%s" % (device_type, machine_name))
        machine_data = []
        for site_data in formatted_data:
            machine_data.append(eval(site_data))
        redis_hook_util_10.set(
            "aggregated_utilization_%s_%s" % (machine_name, device_type),
            str(machine_data))

    machine_names = set([site.split("_")[0] for site in ss_tech_sites])
    config_machines = set([site.split("_")[0] for site in config_sites])
    aggregate_dependency_ss = {}
    aggregate_dependency_bs = {}
    calculate_task_list = {}
    format_task_list = {}
    # TODO: remove this if ss >> bs task
    # calculate_utilization_lost_ss_bs_task = PythonOperator(
    #     task_id="calculate_bs_utilization_lost_ss",
    #     provide_context=True,
    #     python_callable=calculate_utilization_data_bs,
    #     params={"lost_n_found": True},
    #     dag=utilization_kpi_subdag_dag
    # )
    for each_machine_name in machine_names:
        if each_machine_name in config_machines:
            aggregate_utilization_data_ss_task = PythonOperator(
                task_id="aggregate_utilization_ss_%s" % each_machine_name,
                provide_context=True,
                python_callable=aggregate_utilization_data,
                params={
                    "machine_name": each_machine_name,
                    "technology": ss_name
                },
                dag=utilization_kpi_subdag_dag,
                queue=O7_CALC_Q,
                trigger_rule='all_done')
            aggregate_dependency_ss[each_machine_name] = aggregate_utilization_data_ss_task
            calculate_utilization_data_ss_task = PythonOperator(
                task_id="calculate_ss_utilization_kpi_of_%s" % each_machine_name,
                provide_context=True,
                trigger_rule='all_done',
                python_callable=calculate_utilization_data_ss,
                params={
                    "machine_name": each_machine_name,
                    "technology": ss_name,
                    'attributes': utilization_attributes
                },
                dag=utilization_kpi_subdag_dag,
                queue=O7_CALC_Q,
            )
            format_data_ss_task = PythonOperator(
                task_id="format_data_of_ss_%s" % each_machine_name,
                provide_context=True,
                python_callable=format_data,
                trigger_rule='all_done',
                params={
                    "machine_name": each_machine_name,
                    "technology": ss_name,
                    'attributes': utilization_attributes
                },
                dag=utilization_kpi_subdag_dag,
                queue=celery_queue,
            )
            calculate_task_list[each_machine_name] = calculate_utilization_data_ss_task
            calculate_utilization_data_ss_task >> format_data_ss_task
            format_data_ss_task >> aggregate_utilization_data_ss_task
            # Build the per-machine queries; note this is unsafe, string-built SQL.
            INSERT_QUERY = INSERT_HEADER % ("nocout_" + each_machine_name) + INSERT_TAIL
            UPDATE_QUERY = UPDATE_HEADER % ("nocout_" + each_machine_name) + UPDATE_TAIL
            INSERT_QUERY = INSERT_QUERY.replace('\n', '')
            UPDATE_QUERY = UPDATE_QUERY.replace('\n', '')
            # ss_name == device_type
            if not DEBUG:
                insert_data_in_mysql = MySqlLoaderOperator(
                    task_id="upload_data_%s" % (each_machine_name),
                    dag=utilization_kpi_subdag_dag,
                    query=INSERT_QUERY,
                    # data="",
                    redis_key="aggregated_utilization_%s_%s" % (each_machine_name, ss_name),
                    redis_conn_id="redis_hook_util_10",
                    mysql_conn_id='mysql_uat',
                    queue=O7_CALC_Q,
                    trigger_rule='all_done')
                update_data_in_mysql = MySqlLoaderOperator(
                    task_id="update_data_%s" % (each_machine_name),
                    query=UPDATE_QUERY,
                    # data="",
                    redis_key="aggregated_utilization_%s_%s" % (each_machine_name, ss_name),
                    redis_conn_id="redis_hook_util_10",
                    mysql_conn_id='mysql_uat',
                    dag=utilization_kpi_subdag_dag,
                    queue=O7_CALC_Q,
                    trigger_rule='all_done')
                update_data_in_mysql << aggregate_utilization_data_ss_task
                insert_data_in_mysql << aggregate_utilization_data_ss_task

    db_list = []
    for each_site_name in ss_tech_sites:
        if each_site_name in config_sites:
            machine = each_site_name.split("_")[0]
            get_required_data_ss_task = PythonOperator(
                task_id="get_utilization_data_of_ss_%s" % each_site_name,
                provide_context=True,
                trigger_rule='all_done',
                python_callable=get_required_data_ss,
                params={
                    "site_name": each_site_name,
                    "technology": ss_name,
                    'attributes': utilization_attributes
                },
                dag=utilization_kpi_subdag_dag,
                queue=celery_queue)
            get_required_data_ss_task >> calculate_task_list.get(machine)
            # calculate_utilization_data_ss_task >> format_data_ss_task
            # calculate_utilization_data_ss_task >> calculate_utilization_data_bs_task
            # try:
            #     aggregate_dependency_ss[machine_name] << format_data_ss_task
            # except Exception:
            #     logging.info("Site Not Found %s" % (machine_name))
        else:
            logging.info("Skipping %s" % (each_site_name))
    return utilization_kpi_subdag_dag
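# backtrack_x_min is not shown above; going by the comment ("rewind the time
# to the previous multiple of 5 minutes"), a plausible implementation is:
def backtrack_x_min(timestamp, seconds):
    # Floor a Unix timestamp to the previous multiple of `seconds` (300 s = 5 min).
    return int(timestamp - (timestamp % seconds))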
if table_exists:
    sqls = [drop_table, create_table, load_data]
else:
    sqls = [create_table, load_data]
for i in sqls:
    redshift_call(i)

postgres_to_local_csv = PythonOperator(
    task_id='postgres_to_local_csv',
    provide_context=True,
    python_callable=get_orders_with_bellhops,
    dag=dag)

local_csv_to_s3 = PythonOperator(
    task_id='local_csv_to_s3',
    provide_context=True,
    python_callable=store_orders_with_bellhops,
    dag=dag)

s3_to_redshift = PythonOperator(
    task_id='s3_to_redshift',
    provide_context=True,
    python_callable=transfer_orders_with_bellhops,
    dag=dag)

local_csv_to_s3.set_upstream(postgres_to_local_csv)
s3_to_redshift.set_upstream(local_csv_to_s3)
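# redshift_call is not shown; a plausible helper using Airflow's PostgresHook
# (the connection id "redshift" is an assumption):
from airflow.hooks.postgres_hook import PostgresHook

def redshift_call(sql):
    # Run one SQL statement against Redshift through the Postgres connection.
    PostgresHook(postgres_conn_id='redshift').run(sql)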
                'h': '$time.h'
            },
            'hourly': {'$sum': 1}
        }
    }, {'$out': tmp_created_collection_per_hour_name}]
    results = db.logs.aggregate(pipeline)
    print("Aggregated hour metrics")
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_day',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_day,
    dag=dag)

run_this_also = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_hour',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_hour,
    dag=dag)

run_this_also.set_upstream(run_this)

send_email_notification_flow_successful = EmailOperator(
    task_id='send_email_notification_flow_successful',
    to="*****@*****.**",
    subject='custom email from airflow',
    html_content="{{ params['foo'](execution_date) }}",
    sms_result = context['task_instance'].xcom_pull(task_ids='send_sms')
    mail_result = context['task_instance'].xcom_pull(task_ids='send_mail')
    call_result = context['task_instance'].xcom_pull(task_ids='send_call')
    # pdb.set_trace()
    print('success')

# p0 to p3 are examples of tasks created by instantiating operators
p0 = PythonOperator(
    task_id='set_group',
    python_callable=set_group,
    dag=dag)

p1 = PythonOperator(
    task_id='set_call',
    provide_context=True,
    python_callable=set_call,
    dag=dag)

p2 = PythonOperator(
    task_id='set_mail',
    provide_context=True,
    python_callable=set_mail,
    dag=dag)

p3 = PythonOperator(
    task_id='set_sms',
    provide_context=True,
    python_callable=set_sms,
    dag=dag)
# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id="print_date", bash_command="date", dag=dag)

t2 = BashOperator(task_id="sleep", bash_command="sleep 5", retries=3, dag=dag)

templated_command = """
{% for i in range(5) %}
    echo "{{ ds }}"
    echo "{{ macros.ds_add(ds, 7) }}"
    echo "{{ params.my_param }}"
{% endfor %}
"""

t3 = BashOperator(
    task_id="templated",
    bash_command=templated_command,
    params={"my_param": "Parameter I passed in"},
    dag=dag,
)

t4 = PythonOperator(
    task_id="python_code",
    python_callable=some_function,
    dag=dag
)

t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t1)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_bwa", default_args=default_args, schedule_interval=None,
          concurrency=500, max_active_runs=500)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

run_bwa_task = PythonOperator(
    task_id="run_bwa",
    python_callable=run_bwa,
    provide_context=True,
    dag=dag)
run_bwa_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
complete_analysis_run_task.set_upstream(run_bwa_task)
    schedule_interval='25 10 * * *',
)

run_file_1 = BashOperator(
    task_id='run_file_1',
    bash_command='python /home/airflow/gcs/data/file_1.py',
    email_on_failure=False,
    dag=dag)

run_file_2 = BashOperator(
    task_id='run_file_2',
    bash_command='python /home/airflow/gcs/data/file_2.py',
    email_on_failure=False,
    dag=dag)

push_1 = PythonOperator(
    task_id='xom_push_try',
    provide_context=True,
    python_callable=push,
    dag=dag)

pull_1 = PythonOperator(
    task_id='xom_pull_try',
    provide_context=True,
    python_callable=pull,
    dag=dag)

x_com_push_try = BashOperator(
    task_id='bigquery_ls',
    bash_command="bq ls",
    email_on_failure=False,
    dag=dag)

# Chaining `push_1 >> pull_1` inside a dependency list does not do what it
# looks like; set the two dependencies on separate lines instead.
x_com_push_try >> run_file_1 >> [run_file_2, push_1]
push_1 >> pull_1
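# The push/pull callables are not shown; a minimal sketch using the XCom API
# (the key name is an assumption):
def push(**context):
    context['ti'].xcom_push(key='demo_key', value='demo_value')

def pull(**context):
    value = context['ti'].xcom_pull(task_ids='xom_push_try', key='demo_key')
    print('pulled:', value)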
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("filter-vcf", default_args=default_args, schedule_interval=None,
          concurrency=20000, max_active_runs=20000)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

filter_task = PythonOperator(
    task_id="filter_variants",
    python_callable=filter_variants,
    provide_context=True,
    dag=dag)
filter_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
complete_analysis_run_task.set_upstream(filter_task)
def waitfor_gatk(run_id='runid2', task_id='taskid2'):
    update_key = run_id + '.' + task_id
    # Poll Redis every 30 seconds until the pod reports completion.
    while not r.get(update_key):
        print("Job in progress")
        time.sleep(30)
    else:
        print("Completed task GATK")

t1 = BashOperator(
    task_id='bwa_cc',
    bash_command='export KUBECONFIG=/root/.kube/kind-config-kind && kubectl apply -f /tmp/bwapod.yaml',
    dag=dag)

t2 = PythonOperator(
    task_id='bwa_wait',
    python_callable=waitfor_bwa,
    op_kwargs={},
    dag=dag)

t3 = BashOperator(
    task_id='gatk_cc',
    bash_command='export KUBECONFIG=/root/.kube/kind-config-kind && kubectl apply -f /tmp/gatkpod.yaml',
    dag=dag)

t4 = PythonOperator(
    task_id='gatk_wait',
    python_callable=waitfor_gatk,
    op_kwargs={},
    dag=dag)

t1 >> t2 >> t3 >> t4
    # clean the entire bucket
    private = [(private_bucket, name) for name in hook.list(private_bucket)]
    shared = [(shared_bucket, name) for name in hook.list(shared_bucket)]
    for bucket_name, object_name in private + shared:
        logging.info("Deleting gs://{}/{}".format(bucket_name, object_name))
        hook.delete(bucket_name, object_name)
        total += 1
    logging.info("Deleted {} objects".format(total))

clean_processor_a = PythonOperator(
    task_id="clean_processor_a",
    python_callable=clean_buckets,
    op_kwargs={
        "private_bucket": BUCKET_PRIVATE_A,
        "shared_bucket": BUCKET_SHARED_A,
        "google_cloud_storage_conn_id": PRIO_A_CONN,
    },
    dag=dag,
)

clean_processor_b = PythonOperator(
    task_id="clean_processor_b",
    python_callable=clean_buckets,
    op_kwargs={
        "private_bucket": BUCKET_PRIVATE_B,
        "shared_bucket": BUCKET_SHARED_B,
        "google_cloud_storage_conn_id": PRIO_B_CONN,
    },
    dag=dag,
)
    html_content='Check out the latest!!',
    files=['{}/latest_links.txt'.format(RAW_TWEET_DIR)],
    dag=dag)

sub = SubDagOperator(
    subdag=subdag,
    task_id='insert_and_id_pop',
    trigger_rule='one_success',
    dag=dag)

clear_latest = BashOperator(
    bash_command='rm -rf {}/latest_links.txt'.format(RAW_TWEET_DIR),
    task_id='clear_latest',
    dag=dag)

gen_search_terms.set_upstream(fill_search_terms)

for term in SEARCH_TERMS:
    term_without_punctuation = re.sub(r'\W+', '', term)
    simple_search = PythonOperator(
        task_id='search_{}_twitter'.format(term_without_punctuation),
        provide_context=True,
        python_callable=search_twitter,
        dag=dag,
        params={'query': term})
    simple_search.set_upstream(gen_search_terms)
    simple_search.set_downstream(sub)

sub.set_downstream(email_links)
email_links.set_downstream(clear_latest)
args = {
    'owner': 'ryan',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Run at the top of the hour Monday to Friday.
# Note: This doesn't line up with the market hours of
# 10PM Sunday till 10PM Friday GMT.
dag = DAG(dag_id='stocks',
          default_args=args,
          schedule_interval='0 * * * 1,2,3,4,5',
          dagrun_timeout=timedelta(seconds=30))

# Loop through the stocks we want to use to build up our DAG. Suffix the task
# ids with the stock so each iteration creates unique tasks.
for stock in stocks:
    get_stocks_task = PythonOperator(
        task_id='get_stocks_{}'.format(stock),
        provide_context=True,
        op_kwargs={"stock": stock},
        python_callable=get_stocks,
        dag=dag)

    cache_latest_stocks_task = PythonOperator(
        task_id='cache_latest_stocks_{}'.format(stock),
        provide_context=True,
        python_callable=cache_latest_stocks,
        dag=dag)

    get_stocks_task.set_downstream(cache_latest_stocks_task)
default_args = {
    'start_date': datetime.now(),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

# DAG is scheduled to run every 8 hours
dag = DAG('PostTweet', schedule_interval=timedelta(hours=8),
          default_args=default_args)

# This task will stage all the tweets to the csv file
t1 = PythonOperator(task_id='stage_tweets', python_callable=stage_tweets, dag=dag)

# This task will commit all the tweets to the csv file
t2 = PythonOperator(task_id='commit_tweets', python_callable=commit_tweets, dag=dag)

# This task is used to send the tweets to twitter
t3 = PythonOperator(task_id='post_status', python_callable=post_tweet, dag=dag)

# Backup all the files and tweets to google drive
t4 = PythonOperator(task_id='backup', python_callable=upload.main, dag=dag)

t1.set_downstream(t2)
t2.set_downstream(t3)
    start_date=datetime.datetime(2018, 1, 1, 0, 0, 0, 0),
    end_date=datetime.datetime(2018, 12, 1, 0, 0, 0, 0),
    schedule_interval="@monthly",
    max_active_runs=1
)

create_trips_table = PostgresOperator(
    task_id="create_trips_table",
    dag=dag,
    postgres_conn_id="redshift",
    sql=sql.CREATE_TRIPS_TABLE_SQL
)

copy_trips_task = PythonOperator(
    task_id='load_trips_from_s3_to_redshift',
    dag=dag,
    python_callable=load_trip_data_to_redshift,
    provide_context=True,
)

check_trips = PythonOperator(
    task_id='check_trips_data',
    dag=dag,
    python_callable=check_greater_than_zero,
    provide_context=True,
    params={
        'table': 'trips',
    }
)

create_stations_table = PostgresOperator(
    task_id="create_stations_table",
default_args = {
    'depends_on_past': False,
    'start_date': dt.datetime.strptime('2020-03-24T00:00:00', '%Y-%m-%dT%H:%M:%S'),
    'provide_context': True
}

# creating a new dag
dag = DAG('dataflow_process_dag',
          default_args=default_args,
          schedule_interval='0 0 * * 2',
          max_active_runs=1)

# Integrating the different operator tasks into the airflow dag

# Integrating the read_data operator into the airflow dag
read_table = PythonOperator(task_id='read_table',
                            python_callable=read_data,
                            op_kwargs={'fig_path': fig_path},
                            dag=dag)

# Integrating the data_report operator into the airflow dag
# (note: this rebinds the name of the data_report callable defined earlier)
data_report = PythonOperator(task_id='data_report',
                             python_callable=data_report,
                             op_kwargs={'fig_path': fig_path},
                             dag=dag)

# Integrating the plots operator into the airflow dag
plots = PythonOperator(task_id='var_dist_plots',
                       python_callable=plot_var_distributions,
                       op_kwargs={'fig_path': fig_path},
                       dag=dag)

# Integrating the train_test operator into the airflow dag
train_test = PythonOperator(task_id='train_test',
                            python_callable=make_train_test,
                            op_kwargs={'fig_path': fig_path},
for dag in dags:
    if not os.path.exists(dag.fileloc):
        logging.info("After checking DAG '" + str(dag) + "', the Python definition file DOES NOT exist.")
        entries_to_delete.append(dag)
    else:
        logging.info("After checking DAG '" + str(dag) + "', the Python definition file does exist.")

logging.info("Process will be deleting the DAG(s) from the DB:")
for entry in entries_to_delete:
    logging.info("\tEntry: " + str(entry))
logging.info("Process will be deleting " + str(len(entries_to_delete)) + " DAG(s)")

if ENABLE_DELETE:
    logging.info("Performing Delete...")
    for entry in entries_to_delete:
        session.delete(entry)
    logging.info("Finished Performing Delete")
else:
    logging.warning("You opted to skip deleting the DAG entries!!!")  # logging.warn is deprecated

logging.info("Finished Running Clear Process")

clear_missing_dags = PythonOperator(
    task_id='clear_missing_dags',
    python_callable=clear_missing_dags_fn,
    provide_context=True,
    dag=dag)
import logging
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators import PythonOperator

args = {
    'owner': 'airflow',
    'start_date': datetime(2019, 4, 1),
    'provide_context': True
}

dag = DAG('spark_count_lines',
          start_date=datetime(2019, 4, 1),
          schedule_interval='@monthly',
          dagrun_timeout=timedelta(minutes=60),
          default_args=args)

def run_spark(**kwargs):
    import pyspark
    sc = pyspark.SparkContext()
    file_path = 'file:///Users/kajariverma/airflow/dags/test.py'
    df = sc.textFile(file_path)
    # Log the file we actually counted (the original message said people.txt).
    logging.info('Number of lines in {0} = {1}'.format(file_path, df.count()))
    sc.stop()

t_main = PythonOperator(
    task_id='call_spark',
    dag=dag,
    python_callable=run_spark)
""" Example with PythonOperator """ from airflow import DAG from airflow.operators import PythonOperator from datetime import datetime, timedelta default_args = { 'owner': 'Samarth', 'start_date': datetime(2016, 03, 15, 12), } # "schedule_interval" is your cron expression you can write any cron expression like unix cron. dag = DAG('airflow_with_python_operator', default_args=default_args, schedule_interval="1 * * * *") def my_function(): '''This is a function that will run within the DAG execution''' return "Check me in the logs" # Note that unlike other example we are using PythonOperator here. # 'python_callable' parameter determines which python functions to # execute. run_this = PythonOperator(task_id='run_my_function', python_callable=my_function, dag=dag)
    task_id='setup_jobs',
    provide_context=True,
    python_callable=setup_jobs_fn,
    dag=dag)

def collect_results_fn(ds, **kwargs):
    pprint(kwargs)
    print(ds)

collect_results = PythonOperator(
    task_id='collect_results',
    provide_context=True,
    python_callable=collect_results_fn,
    dag=dag)

for i in range(10):
    '''Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds respectively'''
    task = PythonOperator(
        task_id='sleep_for_' + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i) / 10},
        dag=dag)
    task.set_upstream(setup_jobs)
    task.set_downstream(collect_results)
def pusher_dynamic(my_task_id, **kwargs):
    print("pushing | my task id: " + str(my_task_id) + ". Notice the task operator id is also pushed, implicitly")
    print(kwargs)
    print(kwargs['ti'])
    kwargs['ti'].xcom_push(key='value from pusher dynamic', value=int(my_task_id))
    return 'Whatever you return gets printed in the logs'

def puller_dynamic(my_task_id, **kwargs):
    ti = kwargs['ti']
    pulled_value = ti.xcom_pull(key='value from pusher dynamic',
                                task_ids='push_' + str(my_task_id))
    print("pulled value based on pusher_id: " + str(pulled_value))

i = 1
push1 = PythonOperator(task_id='push_1', provide_context=True, dag=dag, python_callable=pusher)
pull1 = PythonOperator(task_id='pull_1', provide_context=True, dag=dag, python_callable=puller)
# Notice I am pulling based on the push_1 id; the expected value to push is 2
# and to pull is 1, because we are sending the push_1 id...
i = i + 1
push2 = PythonOperator(task_id='push_2', provide_context=True, dag=dag, python_callable=pusher)
pull2 = PythonOperator(task_id='pull_2', provide_context=True, dag=dag, python_callable=puller)

# A dynamic pusher, pusher_dynamic, accepting a counter and pushing it via XCom.
i = i + 1
my_task_id = i
push3 = PythonOperator(task_id='push_' + str(i), provide_context=True,
                       python_callable=pusher_dynamic,
                       op_kwargs={'my_task_id': my_task_id}, dag=dag)
pull3 = PythonOperator(task_id='pull_' + str(i), provide_context=True,
                       python_callable=puller_dynamic,
                       op_kwargs={'my_task_id': my_task_id}, dag=dag)
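# The plain pusher/puller callables referenced above are not shown; a minimal
# sketch using the same XCom API (the key name is an assumption):
def pusher(**kwargs):
    # value=i reads the module-level counter at run time, which is the gotcha
    # the comments above are pointing at.
    kwargs['ti'].xcom_push(key='value from pusher', value=i)

def puller(**kwargs):
    pulled_value = kwargs['ti'].xcom_pull(key='value from pusher', task_ids='push_1')
    print("pulled value: " + str(pulled_value))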
    # list files in the ftp server
    ftp_files = set(key for key in sftp.listdir("upload")
                    if key.lower().endswith(('.pp', '.nc')))
    print(ftp_files)

    # find the files in ftp but not in s3
    upload_files = ftp_files - s3_files
    print(upload_files)

    # upload each file in the list of files to be uploaded
    for key in upload_files:
        print('Uploading ' + key)
        upload(key)

dag = DAG(dag_id='ftp-to-s3',
          default_args={
              'owner': 'airflow',
              'start_date': airflow.utils.dates.days_ago(2)
          },
          schedule_interval='* * * * *',
          dagrun_timeout=timedelta(minutes=60),
          catchup=False,
          max_active_runs=1)

task = PythonOperator(task_id='ftp', python_callable=ftp, dag=dag)

if __name__ == "__main__":
    dag.cli()
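# The upload helper is not shown; a plausible sketch that copies one file from
# the SFTP server to S3 via a temporary file (the bucket name and connection
# id are assumptions):
import tempfile
from airflow.hooks.S3_hook import S3Hook

def upload(key):
    s3 = S3Hook(aws_conn_id='aws_default')
    with tempfile.NamedTemporaryFile() as tmp:
        sftp.get('upload/' + key, tmp.name)  # download from the FTP server
        s3.load_file(tmp.name, key=key, bucket_name='my-target-bucket', replace=True)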
def connect_to_mongodb_and_aggregate_hour(ds, **kwargs):
    db = MongoClient().test
    tmp_created_collection_per_hour_name = 'page_per_hour_hits_tmp'
    pipeline = [
        {"$project": {
            'page': '$PAGE',
            'time': {
                'y': {'$year': '$DATE'},
                'm': {'$month': '$DATE'},
                'day': {'$dayOfMonth': '$DATE'},
                'h': {'$hour': '$DATE'}
            }
        }},
        {'$group': {
            '_id': {
                'p': '$page',
                'y': '$time.y',
                'm': '$time.m',
                'd': '$time.day',
                'h': '$time.h'
            },
            'hourly': {'$sum': 1}
        }},
        {'$out': tmp_created_collection_per_hour_name}
    ]
    results = db.logs.aggregate(pipeline)
    print("Aggregated hour metrics")
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_day',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_day,
    dag=dag)

run_this_also = PythonOperator(
    task_id='connect_to_mongodb_and_aggregate_hour',
    provide_context=True,
    python_callable=connect_to_mongodb_and_aggregate_hour,
    dag=dag)

run_this_also.set_upstream(run_this)

send_email_notification_flow_successful = EmailOperator(
    task_id='send_email_notification_flow_successful',
    to="*****@*****.**",
    subject='custom email from airflow',
    html_content="{{ params['foo'](execution_date) }}",
    params=params,
    dag=dag)

send_email_notification_flow_successful.set_upstream(run_this_also)
dag = DAG(dag_id='example_python_operator', default_args=args)

def my_sleeping_function(random_base):
    '''This is a function that will run within the DAG execution'''
    time.sleep(random_base)

def print_context(ds, **kwargs):
    pprint(kwargs)
    print(ds)
    return 'Whatever you return gets printed in the logs'

run_this = PythonOperator(
    task_id='print_the_context',
    provide_context=True,
    python_callable=print_context,
    dag=dag)

for i in range(10):
    '''Generating 10 sleeping tasks, sleeping from 0 to 0.9 seconds respectively'''
    task = PythonOperator(
        task_id='sleep_for_' + str(i),
        python_callable=my_sleeping_function,
        op_kwargs={'random_base': float(i) / 10},
        dag=dag)
    task.set_upstream(run_this)
    conn = sqlite.get_conn()
    query = """select * from tweets where
               created > date('now', '-1 days') and urls is not null
               order by favorite_count"""
    df = pd.read_sql_query(query, conn)
    df.urls = df.urls.map(ast.literal_eval)
    cntr = Counter(itertools.chain.from_iterable(df.urls.values))
    with open('{}/latest_links.txt'.format(directory), write_mode) as latest:
        wrtr = writer(latest)
        wrtr.writerow(['url', 'count'])
        wrtr.writerows(cntr.most_common(5))

simple_search = PythonOperator(
    task_id='search_twitter',
    provide_context=True,
    python_callable=search_twitter,
    dag=dag,
    params={'query': '#python'})

move_tweets_to_sqlite = PythonOperator(
    task_id='csv_to_sqlite',
    provide_context=True,
    python_callable=csv_to_sqlite,
    dag=dag)

id_popular = PythonOperator(
    task_id='identify_popular_links',
    provide_context=True,
    python_callable=identify_popular_links,
    dag=dag)
end_of_data_pipeline = DummyOperator(task_id='end_of_data_pipeline', dag=dag)

pg_unload = PostgresOperator(
    dag=dag,
    task_id='pg_unload',
    sql=unload_user_purchase,
    postgres_conn_id='postgres_default',
    params={'temp_filtered_user_purchase': temp_filtered_user_purchase},
    depends_on_past=True,
    wait_for_downstream=True)

user_purchase_to_s3_stage = PythonOperator(
    dag=dag,
    task_id='user_purchase_to_s3_stage',
    python_callable=_local_to_s3,
    op_kwargs={
        'filename': temp_filtered_user_purchase,
        'key': temp_filtered_user_purchase_key,
    },
)

# remove_local_user_purchase_file = PythonOperator(
#     dag=dag,
#     task_id='remove_local_user_purchase_file',
#     python_callable=remove_local_file,
#     op_kwargs={
#         'filelocation': temp_filtered_user_purchase,
#     },
# )

# movie_review_to_s3_stage = PythonOperator(
    attachment.add_header("Content-Disposition", "attachment", filename=filename)
    msg = MIMEMultipart()
    msg.attach(attachment)
    msg["Subject"] = str("Resultado Analise Notebook")
    msg["From"] = "*****@*****.**"
    msg["Reply-to"] = "*****@*****.**"
    server = smtplib.SMTP("smtp.gmail.com:587")
    server.ehlo()
    server.starttls()
    server.login("*****@*****.**", "a.24423242")
    server.sendmail(msg["From"], "*****@*****.**", msg.as_string())
    server.quit()

def acessNotebook():
    print("Acessando Notebook no Swift")  # "Accessing the notebook in Swift"

runZero = PythonOperator(task_id="Acess_Notebook", provide_context=False,
                         python_callable=acessNotebook, dag=dag)
runFirst = PythonOperator(task_id="ExecNotebook", provide_context=False,
                          python_callable=execNotebook, dag=dag)
runSecond = PythonOperator(task_id="Get_Output", provide_context=False,
                           python_callable=getOutput, dag=dag)
runThird = PythonOperator(task_id="send_email", provide_context=False,
                          python_callable=send_email, dag=dag)

runZero.set_downstream(runFirst)
runFirst.set_downstream(runSecond)
runSecond.set_downstream(runThird)
    response = elastic.index(index=INDEX_NAME, doc_type=TYPE_NAME, id=uuid, body=Doc_Source)
    print('uuid is', str(uuid))
    uuid += 1
    Variable.set("uuid", uuid)

with DAG(dag_id='ETL_Import_Yearly',
         description='Trade',
         start_date=datetime(2020, 6, 15),
         end_date=None,
         schedule_interval='@yearly',
         default_args=args) as dag:
    for countrycode, countryname in country_dict.items():
        for hslevelcode in hslevel_list:
            for currency in currency_list:
                task1 = SeleniumOperator(
                    script=get_df,
                    script_args=[data_folder, countrycode, hslevelcode, currency],
                    task_id='Extract_Data_' + countrycode + '_' + hslevelcode + '_' + currency)
                task2 = PythonOperator(
                    task_id='Transform_Data_' + countrycode + '_' + hslevelcode + '_' + currency,
                    # op_kwargs={'countrycode': countrycode},
                    python_callable=transform_data)
                task3 = PythonOperator(
                    task_id='Load_Data_' + countrycode + '_' + hslevelcode + '_' + currency,
                    op_kwargs={'countryname': countryname,
                               'hslevel': hslevelcode,
                               'currency': currency,
                               'countrycode': countrycode},
                    python_callable=load_data)
                # Set dependencies
                task1 >> task2 >> task3
default_args = {
    'start_date': datetime(2015, 8, 1),
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('etl_daily',
          start_date=datetime(2016, 5, 1),  # zero-padded literals (05) are invalid in Python 3
          schedule_interval="0 0 14 * MON-FRI",
          default_args=default_args)

t1 = PythonOperator(task_id='test_airflow', python_callable=test_airflow, dag=dag)
t2 = PythonOperator(task_id='daily_equity_price_ingest',
                    python_callable=daily_equity_price_ingest,
                    dag=dag)
run_this_last = DummyOperator(task_id='run_this_last', dag=dag)

t2.set_upstream(t1)
run_this_last.set_upstream(t2)
default_args = {
    # ...
    'retries': 1,
    'retry_delay': timedelta(minutes=1),
}

dag = DAG('sparkify_dag',
          default_args=default_args,
          description='Load and transform data in Redshift with Airflow',
          schedule_interval='0 * * * *',
          catchup=False)  # catchup is a DAG argument, not a task default_arg

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

create_tables_in_redshift = PythonOperator(
    task_id="create_tables_in_redshift",
    dag=dag,
    provide_context=True,
    python_callable=create_table,
    # PythonOperator doesn't accept arbitrary operator kwargs; hand the
    # connection ids to the callable through op_kwargs instead.
    op_kwargs={
        "redshift_conn_id": "redshift",
        "aws_credentials_id": "aws_credentials",
    })

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    table="staging_events",
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    s3_bucket=s3_bucket,
    s3_key=log_s3_key,
    copy_json_option="s3://udacity-dend/log_json_path.json",
)
task_args = {
    # ...
    'email': [alert_email],
    'email_on_failure': True,
    'email_on_retry': False,
}

# Set concurrency and max_active_runs to 1, preventing more than one dag instance
# from being created.
dag = DAG(dag_name, default_args=task_args, concurrency=1, max_active_runs=1,
          schedule_interval=schedule_interval)

get_file = PythonOperator(
    task_id='get-file-from-s3',
    python_callable=FileGetter(),
    dag=dag)

hello_world_docker_write_logs = BashOperator(
    task_id='hello-world',
    bash_command=start_hello_world,
    trigger_rule=TriggerRule.ALL_SUCCESS,
    dag=dag)

check_read_logs = PythonOperator(
    task_id='check_read_logs',
    python_callable=CheckReadLogs(),
    dag=dag)

put_file = PythonOperator(
    task_id='put-file-to-s3',
args = {
    'owner': 'mark',
    'depends_on_past': False,
    'start_date': datetime.utcnow(),
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Run at the top of the hour Monday to Friday.
# Note: This doesn't line up with the market hours of
# 10PM Sunday till 10PM Friday GMT.
dag = DAG(dag_id='rates',
          default_args=args,
          schedule_interval='0 * * * 1,2,3,4,5',
          dagrun_timeout=timedelta(seconds=30))

get_rates_task = PythonOperator(
    task_id='get_rates',
    provide_context=True,
    python_callable=get_rates,
    dag=dag)

cache_latest_rates_task = PythonOperator(
    task_id='cache_latest_rates',
    provide_context=True,
    python_callable=cache_latest_rates,
    dag=dag)

get_rates_task.set_downstream(cache_latest_rates_task)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("bcftools", default_args=default_args, schedule_interval=None,
          concurrency=20000, max_active_runs=20000)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

bcftools_task = PythonOperator(
    task_id="bcftools",
    python_callable=bcftools,
    provide_context=True,
    dag=dag)
bcftools_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
complete_analysis_run_task.set_upstream(bcftools_task)
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG("sanger_variant_calling", default_args=default_args, schedule_interval=None,
          concurrency=500, max_active_runs=500)

start_analysis_run_task = PythonOperator(
    task_id="start_analysis_run",
    python_callable=start_analysis_run,
    provide_context=True,
    dag=dag)

run_sanger_callers_task = PythonOperator(
    task_id="run_sanger_callers",
    python_callable=run_sanger_callers,
    provide_context=True,
    dag=dag)
run_sanger_callers_task.set_upstream(start_analysis_run_task)

complete_analysis_run_task = PythonOperator(
    task_id="complete_analysis_run",
    python_callable=complete_analysis_run,
    provide_context=True,
    dag=dag)
complete_analysis_run_task.set_upstream(run_sanger_callers_task)