def test_skipping(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task.set_upstream(latest_task)

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        datetime.datetime(2016, 1, 1): 'success',
        datetime.datetime(2016, 1, 1, 12): 'success',
        datetime.datetime(2016, 1, 2): 'success',
    }, exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        datetime.datetime(2016, 1, 1): 'skipped',
        datetime.datetime(2016, 1, 1, 12): 'skipped',
        datetime.datetime(2016, 1, 2): 'success',
    }, exec_date_to_downstream_state)
def test_skipping(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task.set_upstream(latest_task)

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    assert exec_date_to_latest_state == {
        datetime.datetime(2016, 1, 1): 'success',
        datetime.datetime(2016, 1, 1, 12): 'success',
        datetime.datetime(2016, 1, 2): 'success',
    }

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    assert exec_date_to_downstream_state == {
        datetime.datetime(2016, 1, 1): 'skipped',
        datetime.datetime(2016, 1, 1, 12): 'skipped',
        datetime.datetime(2016, 1, 2): 'success',
    }
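# Hypothetical sketch of the get_task_instances helper these tests rely on; its
# definition is not shown in the snippets above. It simply queries all
# TaskInstance rows for a task_id, ordered by execution date. Import paths
# assume an Airflow 1.10.x test environment.
from airflow import settings
from airflow.models import TaskInstance as TI


def get_task_instances(task_id):
    session = settings.Session()
    return (
        session.query(TI)
        .filter(TI.task_id == task_id)
        .order_by(TI.execution_date)
        .all()
    )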
def test_skipping_dagrun(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2',
        dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    self.dag_file_processor._process_task_instances(
        self.dag, task_instances_list=latest_instances)
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    self.dag_file_processor._process_task_instances(
        self.dag, task_instances_list=downstream_instances)
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    self.dag_file_processor._process_task_instances(
        self.dag, task_instances_list=downstream_instances)
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
def load_dm():
    loo = LatestOnlyOperator(task_id="dm_latest_only", dag=dag)
    dm_tmp_begin = DummyOperator(task_id="dm_tmp_begin", dag=dag)
    dm_tmp_end_dm_dims_begin = DummyOperator(task_id="dm_tmp_end_dm_dims_begin", dag=dag)
    dm_dims_end_dm_facts_begin = DummyOperator(task_id="dm_dims_end_dm_facts", dag=dag)

    c.dds_sats_end_dm_begin >> loo >> dm_tmp_begin

    for table, sql in c.dm_tmp.items():
        po = PostgresOperator(
            dag=dag,
            task_id='dm_tmp_' + table + '_recreate',
            sql=sql
        )
        dm_tmp_begin >> po >> dm_tmp_end_dm_dims_begin

    for table, sql in c.dm_dims.items():
        po = PostgresOperator(
            dag=dag,
            task_id='dm_dim_' + table + '_recreate',
            sql=sql
        )
        dm_tmp_end_dm_dims_begin >> po >> dm_dims_end_dm_facts_begin

    for table, sql in c.dm_facts.items():
        po = PostgresOperator(
            dag=dag,
            task_id='dm_fact_' + table + '_recreate',
            sql=sql
        )
        dm_dims_end_dm_facts_begin >> po

    return
def create_dag(report, default_args):
    dag = DAG(
        report.dag_id,
        schedule_interval=report.schedule,
        default_args=default_args
    )

    with dag:
        test_prefix = "test_"
        start = LatestOnlyOperator(task_id="start_dag")
        send_email = PythonOperator(
            task_id="call_email_function",
            python_callable=report_notify_email,
            trigger_rule="all_done",
            op_kwargs={
                "report": report,
                "email_template_location": SINGLE_EMAIL_TEMPLATE,
            },
            provide_context=True,
        )

        for test in report.tests:
            t1 = StatusSensor(
                task_id=test_prefix + test,
                test_dag_id=test.split(".")[0],
                test_task_id=test.split(".")[1],
            )
            start >> t1 >> send_email

    return dag
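# Hypothetical usage sketch (not taken from the source): register one DAG per
# report at module level so the scheduler can discover them. The `reports`
# iterable, its attributes, and DEFAULT_ARGS are assumed to exist in the
# calling module.
for report in reports:
    globals()[report.dag_id] = create_dag(report, DEFAULT_ARGS)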
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2016, 1, 1),
}

dag = DAG('ats_hourly', default_args=default_args,
          schedule_interval='0 3,7,10,13,15,17,19,21-23 * * *', catchup=False)

python_executable = '~/venv/bin/python3.7'
python_script_path = '~/PycharmProjects/TwitterStats'

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag,
                                 trigger_rule=TriggerRule.ALL_DONE)

# t1, t2 and t3 are examples of tasks created by instantiating operators
t1 = BashOperator(task_id='words_trends',
                  bash_command='cd {};{} words.py trends'.format(
                      python_script_path, python_executable),
                  dag=dag,
                  trigger_rule=TriggerRule.ALL_DONE)

t2 = BashOperator(task_id='draft_trends',
                  bash_command='cd {};{} drafttrends.py'.format(
                      python_script_path, python_executable),
                  dag=dag,
                  trigger_rule=TriggerRule.ALL_DONE)
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag

# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['streets']
start_date = general.start_date['streets']

#: Dag spec
dag = DAG(dag_id='sidewalk',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for sdif
sidewalk_latest_only = LatestOnlyOperator(task_id='sidewalk_latest_only', dag=dag)

#: Get sidewalk data from DB
get_sidewalk_data = PythonOperator(task_id='get_sidewalk_oci',
                                   python_callable=get_sidewalk_data,
                                   on_failure_callback=notify,
                                   on_retry_callback=notify,
                                   on_success_callback=notify,
                                   dag=dag)

#: Get sidewalks shapefile from Atlas
get_sw_shapefiles = PythonOperator(task_id='get_sidewalk_gis',
                                   python_callable=get_sidewalk_gis,
                                   on_failure_callback=notify,
                                   on_retry_callback=notify,
                                   on_success_callback=notify,
from dags.water_tests.indicator_bacteria_jobs import get_indicator_bacteria_tests
from dags.water_tests.indicator_bacteria_jobs import get_latest_bac_tests
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

args = general.args
conf = general.config
start_date = general.start_date['indicator_bacteria_tests']

dag = DAG(dag_id='indicator_bacteria_tests',
          default_args=args,
          start_date=start_date,
          schedule_interval=general.schedule['indicator_bacteria_tests'])

#: Latest Only Operator for traffic_counts
wtr_latest_only = LatestOnlyOperator(task_id='water_latest_only', dag=dag)

# TODO - teach me how to be yearly
# Pull out all indicator bac tests.
get_indicator_bac_tests = PythonOperator(
    task_id='get_indicator_bac_tests',
    python_callable=get_indicator_bacteria_tests,
    op_kwargs={
        'date_start': '01-JUN-2014',
        'date_end': (datetime.now() + timedelta(days=5)).strftime('%d-%b-%Y')
    },
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)
def create_dag(
    *,
    dag_id: str,
    cdr_type: str,
    start_date: datetime,
    extract_sql: str,
    end_date: Optional[datetime] = None,
    retries: int = 10,
    retry_delay: timedelta = timedelta(days=1),
    schedule_interval: Union[str, Interval] = "@daily",
    indexes: Iterable[str] = ("msisdn_counterpart", "location_id", "datetime", "tac"),
    data_present_poke_interval: int = 60,
    data_present_timeout: int = 60 * 60 * 24 * 7,
    flux_check_poke_interval: int = 60,
    flux_check_wait_interval: int = 60,
    flux_check_timeout: int = 60 * 60 * 24 * 7,
    source_table: Optional[str] = None,
    staging_view_sql: Optional[str] = None,
    cluster_field: Optional[str] = None,
    program: Optional[str] = None,
    filename: Optional[str] = None,
    fields: Optional[Dict[str, str]] = None,
    null: str = "",
    additional_macros: Dict[str, Union[str, Callable]] = dict(),
    header: bool = True,
    delimiter: str = ",",
    quote: str = '"',
    escape: str = '"',
    encoding: Optional[str] = None,
) -> "DAG":
    """
    Create an ETL DAG that will load data from files, or a table within the database.

    Parameters
    ----------
    dag_id : str
        Name of the dag
    cdr_type : {"calls", "sms", "mds", "topups"}
        Type of CDR data
    start_date : datetime
        First date the dag should run for
    extract_sql : str
        SQL template. May be an SQL string, or the name of a file in the dags folder.
        The SQL should output a table with fields matching the corresponding cdr type
        schema. Where the source data is missing a field, the field must be introduced
        using NULL::<field_type> as <field_name>.
    end_date : datetime or None
        Optionally specify the final day the day should run on
    retries : int, default 10
        Number of times to retry the dag if it fails
    retry_delay : timedelta, default timedelta(days=1)
        Delay between retries
    schedule_interval : str or Interval, default "@daily"
        Time interval between execution dates.
    indexes : iterable of str, default ("msisdn_counterpart", "location_id", "datetime", "tac")
        Fields to create indexes on.
    data_present_poke_interval : int, default 60
        Number of seconds to wait between runs for the data present check
    data_present_timeout : int, default 604800
        Maximum number of seconds to keep checking before failing
    flux_check_poke_interval : int, default 60
        Number of seconds to wait between runs for the data in flux check
    flux_check_wait_interval : int, default 60
        Number of seconds to monitor data when checking for flux
    flux_check_timeout : int, default 604800
        Maximum number of seconds to keep checking before failing
    source_table : str or None
        If extracting from a table within the database (e.g. when using a FDW to
        connect to another db), the schema qualified name of the table.
    staging_view_sql : str or None
        If extracting from a table within the database (e.g. when using a FDW to
        connect to another db), the sql template or name of the template which will
        be used to create a date limited view of the data.
    cluster_field : str or None
        Optionally require that the data tables be 'clustered' on a field, which
        improves the performance of queries which need to subset based on that field
        at the cost of a significant increase in ETL time.
    program : str or None
        When loading data from files, set to the name of a program to be used when
        reading them (e.g. zcat to load from compressed csv files).
    filename : str or None
        When loading data from files, the filename pattern to be used - may include
        Airflow macros.
    fields : dict or None
        When loading data from files, a mapping of field names to postgres types.
    null : str, default ""
        When loading data from files, optionally specify a null value character
    additional_macros : dict or None
        Optionally provide additional macros to be available in SQL templates.
    header : bool, default True
        Set to False when loading files if the files do not have a header row.
    delimiter : str, default ","
        When loading from files, you may specify the delimiter character
    quote : str, default '"'
        When loading from files, you may specify the quote character
    escape : str, default '"'
        When loading from files, you may specify the escape character
    encoding : str or None
        Optionally specify file encoding when loading from files.

    Returns
    -------
    DAG
    """
    from airflow import DAG
    from airflow.operators.latest_only_operator import LatestOnlyOperator
    from flowetl.operators.add_constraints_operator import AddConstraintsOperator
    from flowetl.operators.analyze_operator import AnalyzeOperator
    from flowetl.operators.attach_operator import AttachOperator
    from flowetl.operators.cluster_operator import ClusterOperator
    from flowetl.operators.create_foreign_staging_table_operator import (
        CreateForeignStagingTableOperator,
    )
    from flowetl.operators.create_indexes_operator import CreateIndexesOperator
    from flowetl.operators.create_staging_view_operator import CreateStagingViewOperator
    from flowetl.operators.extract_from_foreign_table_operator import (
        ExtractFromForeignTableOperator,
    )
    from flowetl.operators.extract_from_view_operator import ExtractFromViewOperator
    from flowetl.operators.update_etl_table_operator import UpdateETLTableOperator
    from flowetl.sensors.data_present_sensor import DataPresentSensor
    from flowetl.sensors.file_flux_sensor import FileFluxSensor
    from flowetl.sensors.table_flux_sensor import TableFluxSensor

    args = {
        "owner": "airflow",
        "retries": retries,
        "retry_delay": retry_delay,
        "postgres_conn_id": "flowdb",
        "conn_id": "flowdb",
        "start_date": start_date,
        "end_date": end_date,
    }

    macros = dict(**additional_macros)
    if source_table is not None:
        macros["source_table"] = source_table

    with DAG(
        dag_id=dag_id,
        schedule_interval=schedule_interval,
        default_args=args,
        user_defined_macros=macros,
        params=dict(cdr_type=cdr_type),
    ) as dag:
        if staging_view_sql is not None and source_table is not None:
            create_staging_view = CreateStagingViewOperator(
                task_id="create_staging_view",
                sql=staging_view_sql,
            )
            extract = ExtractFromViewOperator(
                task_id="extract", sql=extract_sql, pool="postgres_etl"
            )
        elif filename is not None and len(fields) > 0:
            create_staging_view = CreateForeignStagingTableOperator(
                task_id="create_staging_view",
                program=program,
                filename=filename,
                fields=fields,
                null=null,
                header=header,
                delimiter=delimiter,
                quote=quote,
                escape=escape,
                encoding=encoding,
            )
            extract = ExtractFromForeignTableOperator(
                task_id="extract", sql=extract_sql, pool="postgres_etl"
            )
        else:
            raise TypeError(
                "Either staging_view_sql and source_table, or filename and fields must be provided."
            )

        check_not_empty = DataPresentSensor(
            task_id="wait_for_data",
            mode="reschedule",
            poke_interval=data_present_poke_interval,
            timeout=data_present_timeout,
        )

        if filename is not None:
            check_not_in_flux = FileFluxSensor(
                task_id="check_not_in_flux",
                filename=filename,
                mode="reschedule",
                poke_interval=flux_check_poke_interval,
                flux_check_interval=flux_check_wait_interval,
                timeout=flux_check_timeout,
            )
        else:
            check_not_in_flux = TableFluxSensor(
                task_id="check_not_in_flux",
                mode="reschedule",
                poke_interval=flux_check_poke_interval,
                flux_check_interval=flux_check_wait_interval,
                timeout=flux_check_timeout,
            )

        add_constraints = AddConstraintsOperator(
            task_id="add_constraints", pool="postgres_etl"
        )
        add_indexes = CreateIndexesOperator(
            task_id="add_indexes",
            index_columns=indexes,
            pool="postgres_etl",
        )
        attach = AttachOperator(task_id="attach")
        analyze = AnalyzeOperator(
            task_id="analyze",
            target="{{ extract_table }}",
            pool="postgres_etl",
        )
        latest_only = LatestOnlyOperator(task_id="analyze_parent_only_for_new")
        analyze_parent = AnalyzeOperator(
            task_id="analyze_parent",
            target="{{ parent_table }}",
            pool="postgres_etl",
        )
        update_records = UpdateETLTableOperator(task_id="update_records")

        create_staging_view >> check_not_empty >> check_not_in_flux >> extract

        from_stage = extract
        if cluster_field is not None:
            cluster = ClusterOperator(
                task_id="cluster", cluster_field=cluster_field, pool="postgres_etl"
            )
            extract >> cluster
            from_stage = cluster

        from_stage >> [
            add_constraints,
            add_indexes,
        ] >> analyze >> attach >> latest_only >> analyze_parent
        attach >> [update_records, *get_qa_checks()]

    globals()[dag_id] = dag
    return dag
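# A minimal, hypothetical invocation of the factory above, loading call records
# from compressed CSV files. The dag_id, file pattern, field mapping, and SQL
# template name are illustrative assumptions, not taken from the source.
calls_dag = create_dag(
    dag_id="etl_calls",
    cdr_type="calls",
    start_date=datetime(2020, 1, 1),
    extract_sql="extract_calls.sql",
    program="zcat",
    filename="calls_{{ ds_nodash }}.csv.gz",
    fields={"msisdn": "TEXT", "datetime": "TIMESTAMPTZ", "location_id": "TEXT"},
)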
from dags.netfile.netfile2_jobs import *

# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['campaign_fin']
start_date = general.start_date['campaign_fin']
cur_yr = general.get_year()

#: Dag spec
dag = DAG(dag_id='campaign_fin_reports',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

campaign_fin_latest_only = LatestOnlyOperator(
    task_id='campaign_fin_latest_only', dag=dag)

#: Get 460A transactions
schedule_460A = PythonOperator(task_id='get_transactions_a',
                               python_callable=get_transactions_a,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
                               dag=dag)

#: Get 460B1 transactions
schedule_460B1 = PythonOperator(task_id='get_transactions_b',
                                python_callable=get_transactions_b,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
def test_run(self):
    task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    task.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)
# This might need some refactoring (filenaming dates)
flist = {
    'full': 'treas_parking_payments_{}_datasd_v1.csv'.format(cur_yr),
    'by_month': 'treas_meters_{}_pole_by_month_datasd_v1.csv'.format(cur_yr),
    'by_day': 'treas_meters_{}_pole_by_mo_day_datasd_v1.csv'.format(cur_yr)
}

dag = DAG(
    dag_id='parking_meters',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)

#: Latest Only Operator for parking meters
parking_meters_latest_only = LatestOnlyOperator(
    task_id='parking_meters_latest_only', dag=dag)

#: Downloads all parking files from FTP
get_parking_files = BashOperator(
    task_id='get_parking_files',
    bash_command=ftp_download_wget(),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Joins downloaded files from ftp to production
build_prod_file = PythonOperator(
    task_id='build_prod_file',
    python_callable=build_prod_file,
args = general.args
conf = general.config
schedule = general.schedule['budget']
start_date = general.start_date['budget']
budget_fy = general.get_FY_short() + 1

dag = DAG(
    dag_id='budget',
    default_args=args,
    start_date=start_date,
    schedule_interval=schedule)

#: Latest Only Operator for budget
budget_latest_only = LatestOnlyOperator(
    task_id='budget_latest_only', dag=dag)

get_accounts = PythonOperator(
    task_id='get_chart_of_accounts',
    python_callable=get_accounts_chart,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

get_capital_ptd = PythonOperator(
    task_id='get_capital_ptd',
    python_callable=get_capital_ptd,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['documentum_hr_30']
start_date = general.start_date['documentum_hr_30']

#: Dag spec
dag = DAG(dag_id='documentum_hourly_30',
          catchup=False,
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

prod_data = conf['prod_data_dir']
schedule_mode = 'schedule_hourly_30'

documentum_docs_latest_only = LatestOnlyOperator(
    task_id='documentum_others_docs_latest_only', dag=dag)

#: Get documentum tables
get_doc_tables = PythonOperator(task_id='get_documentum_tables',
                                python_callable=get_documentum,
                                op_kwargs={'mode': schedule_mode},
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

div_doc_table = PythonOperator(task_id='divide_doc_table',
                               python_callable=latest_res_ords,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
#get_stale_cmd = "python /data-portal-monitoring/late_updated_datasets.py"
get_stale_cmd = BASEPYTHON + BASEDIR + "late_updated_datasets.py"
t4 = BashOperator(task_id='stale_delayed_datasets', bash_command=get_stale_cmd, dag=dag)

#digest = SubDagOperator(
#    subdag=dag2,
#    task_id='data_monitoring_workflow_dag.digest_dag',
#    dag=dag,
#)

#dag >> t1 #>> t2 >> t3 >> t4
latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)
t1.set_upstream(latest_only)
t2.set_upstream(t1)
t3.set_upstream(t1)
t4.set_upstream(t1)
#t1 >> digest

#run the digest every 12 hours
dag2 = DAG(
    dag_id='data_monitoring_late_updated_digest_dag',
    default_args=WORKFLOW_DEFAULT_ARGS,
    start_date=WORKFLOW_START_DATE,
    schedule_interval='30 */12 * * *',
)

#stale_delayed_datasets_digest_cmd = "python2 /Users/j9/Desktop/data-portal-monitoring/digest_late_updated_datasets.py"
from trident.util import general
from trident.util.notifications import notify
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

args = general.args
conf = general.config
schedule = general.schedule['traffic_counts']
start_date = general.start_date['traffic_counts']

dag = DAG(dag_id='traffic_counts',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for traffic_counts
tc_latest_only = LatestOnlyOperator(task_id='traffic_counts_latest_only', dag=dag)

#: Downloads traffic counts xlsx from share
get_traffic_counts = PythonOperator(task_id='get_traffic_counts',
                                    python_callable=get_traffic_counts,
                                    on_failure_callback=notify,
                                    on_retry_callback=notify,
                                    on_success_callback=notify,
                                    dag=dag)

#: Cleans the downloaded XLSX file, converts it to CSV data.
clean_traffic_counts = PythonOperator(task_id='clean_traffic_counts',
                                      python_callable=clean_traffic_counts,
                                      on_failure_callback=notify,
                                      on_retry_callback=notify,
                                      on_success_callback=notify,
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag

conf = general.config
args = general.args
schedule = general.schedule['dsd_approvals']
start_date = general.start_date['dsd_approvals']
year = general.get_year()

#: Dag spec for dsd permits
dag = DAG(dag_id='dsd_permits',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for dsd permits.
dsd_permits_latest_only = LatestOnlyOperator(
    task_id='dsd_permits_latest_only', dag=dag)

#: Get permits reports
get_permits_files = BashOperator(
    task_id='get_permits_files',
    bash_command=get_permits_files(),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Clean permits reports
clean_data = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    on_failure_callback=notify,
from dags.public_art.public_art_jobs import *
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['public_art']
start_date = general.start_date['public_art']

#: Dag spec
dag = DAG(dag_id='public_art',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

public_art_latest_only = LatestOnlyOperator(task_id='public_art_latest_only', dag=dag)

#: Get public art from NetX, process, output prod file
get_public_art = PythonOperator(task_id='get_public_art',
                                python_callable=get_public_art,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

process_public_art = PythonOperator(task_id='process_public_art',
                                    python_callable=process_public_art,
                                    on_failure_callback=notify,
                                    on_retry_callback=notify,
                                    on_success_callback=notify,
                                    dag=dag)
from dags.tsw_integration.tsw_integration_jobs import *

# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['tsw_integration']
start_date = general.start_date['tsw_integration']

#: Dag spec
dag = DAG(dag_id='tsw_integration',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

violations_latest_only = LatestOnlyOperator(task_id='violations_latest_only', dag=dag)

# VPM Extraction Support Tasks

#: Download VPM dump from FTP
get_vpm_violations = BashOperator(task_id='get_vpm_violations',
                                  bash_command=get_vpm_violations_wget(),
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Download VPM dump from FTP
#get_vpm_dump = BashOperator(
#    task_id='get_vpm_dump',
#    bash_command=ftp_download_wget(),
        main_task = PythonOperator(
            task_id=dag_id,
            python_callable=wrapper(operator, dag_id),
            op_args=[pipeline['name'], pipeline['params'], pipeline],
            dag=dag)

        if description and (match := depends_on.match(description)):
            parent_dag_id = match.group(1)
            t0 = ExternalTaskSensor(task_id=dag_id + '__trigger',
                                    external_dag_id=parent_dag_id,
                                    external_task_id=parent_dag_id,
                                    mode='reschedule',
                                    dag=dag)
            t0 >> main_task
        elif schedule is not None:
            t0 = LatestOnlyOperator(task_id=dag_id + '__latest_only', dag=dag)
            t0 >> main_task

        globals()[dag_id] = dag
    except Exception as e:
        logging.error(
            f'Failed to create a DAG with id {dag_id}, schedule {schedule} because {e}'
        )

task_id = '_clean_scheduler_logs'
dag_id = task_id + '_dag'
schedule = '0 * * * *'
args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    'start_date': datetime.datetime.now(),
    'is_paused_upon_creation': False,
from trident.util.seaboard_updates import *
from dags.streets.streets_jobs import *

# All times in Airflow UTC. Set Start Time in PST?
args = general.args
conf = general.config
schedule = general.schedule['streets']
start_date = general.start_date['streets']

#: Dag spec
dag = DAG(dag_id='streets',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for imcat
streets_latest_only = LatestOnlyOperator(task_id='streets_latest_only', dag=dag)

#: Get streets data from DB
get_streets_data = PythonOperator(
    task_id='get_streets_paving_data',
    python_callable=get_streets_paving_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Process data for public
process_data_sdif = PythonOperator(
    task_id='process_sdif',
    python_callable=process_paving_data,
    op_kwargs={'mode': 'sdif'},
    'email_on_retry': True,
    'retries': 3,
    'retry_delay': timedelta(minutes=15),
}

DASHBOARD_DAG_ID = 'dashboard_aggregation'

dashboard_dag = DAG(DASHBOARD_DAG_ID, default_args=default_args,
                    schedule_interval='0 18 * * *')

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dashboard_dag,
                                 depends_on_past=True)

prev_month = SubDagOperator(subdag=monthly_subdag(
    DASHBOARD_DAG_ID,
    'prev_month',
    dashboard_dag.default_args,
    dashboard_dag.schedule_interval,
    interval=-1),
    task_id='prev_month',
    dag=dashboard_dag)

current_month = SubDagOperator(subdag=monthly_subdag(
    DASHBOARD_DAG_ID,
    'current_month',
    dashboard_dag.default_args,
from trident.util.notifications import notify
from dags.inventory.inv_jobs import *

conf = general.config
args = general.args
schedule = general.schedule['inventory']
start_date = general.start_date['inventory']

dag = DAG(dag_id='inventory',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

inv_latest_only = LatestOnlyOperator(task_id='inventory_latest_only', dag=dag)

#: Inventory Doc To CSV
inventory_to_csv = PythonOperator(task_id='inventory_to_csv',
                                  python_callable=inventory_to_csv,
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Upload Inventory CSV to S3
upload_inventory = S3FileTransferOperator(
    task_id='upload_inventory',
    source_base_path=conf['prod_data_dir'],
    source_key='inventory_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
            continue
        try:
            response = requests.get(QUEUE_URL.format(queue_name),
                                    auth=(QUEUE_USERNAME, QUEUE_PASSWORD))
            stats = json.loads(response.text)
            size = stats['messages_ready'] + stats['messages_unacknowledged']
            queue_sizes[queue_name] = size
        except Exception:
            logger.exception('No tasks found for %s', queue_name)
            queue_sizes[queue_name] = 0
    return queue_sizes


latest = LatestOnlyOperator(task_id='latest_only', queue='manager', dag=dag)

queue_sizes_task = PythonOperator(task_id=QUEUE_SIZES_TASK_ID,
                                  python_callable=get_queue_sizes,
                                  queue="manager",
                                  dag=dag)

rescale_task = BashOperator(task_id=RESCALE_TASK_ID,
                            bash_command=templated_resize_command,
                            queue="manager",
                            params={'task_id': QUEUE_SIZES_TASK_ID},
                            dag=dag)

latest.set_downstream(queue_sizes_task)
queue_sizes_task.set_downstream(rescale_task)
def test_not_skipping_external(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2',
        dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="manual__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
        external_trigger=True,
    )

    self.dag.create_dagrun(
        run_id="manual__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING,
        external_trigger=True,
    )

    self.dag.create_dagrun(
        run_id="manual__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING,
        external_trigger=True,
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
from trident.util import general
from trident.util.notifications import notify
from trident.util.seaboard_updates import update_seaboard_date, get_seaboard_update_dag, update_json_date

args = general.args
conf = general.config
schedule = general.schedule
start_date = general.start_date['pd_col']

dag = DAG(dag_id='pd_col',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['pd_col'])

#: Latest Only Operator for pd_col
pd_col_latest_only = LatestOnlyOperator(task_id='pd_col_latest_only', dag=dag)

#: Get collisions data from FTP and save to temp folder
get_collisions_data = PythonOperator(task_id='get_collisions_data',
                                     python_callable=get_collisions_data,
                                     on_failure_callback=notify,
                                     on_retry_callback=notify,
                                     on_success_callback=notify,
                                     dag=dag)

#: Process collisions data and save result to prod folder
process_collisions_data = PythonOperator(
    task_id='process_collisions_data',
    python_callable=process_collisions_data,
    on_failure_callback=notify,
    on_retry_callback=notify,
def test_skipping_non_latest(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2',
        dag=self.dag)
    downstream_task3 = DummyOperator(
        task_id='downstream_3',
        trigger_rule=TriggerRule.NONE_FAILED,
        dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)
    downstream_task3.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="scheduled__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING,
    )

    self.dag.create_dagrun(
        run_id="scheduled__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING,
    )

    self.dag.create_dagrun(
        run_id="scheduled__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING,
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task3.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): None,
        timezone.datetime(2016, 1, 1, 12): None,
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_3')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
import glob

args = general.args
conf = general.config
schedule = general.schedule
start_date = general.start_date['claims_stat']
email_recips = conf['mail_notify_claims']

#: Dag definition
dag = DAG(dag_id='claims_stat',
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule['claims_stat'])

#: Latest Only Operator for claims
claims_stat_latest_only = LatestOnlyOperator(task_id='claims_stat_latest_only', dag=dag)

#: Pull claims data from oracle
get_claims_data = PythonOperator(task_id='get_claims_data',
                                 python_callable=get_claims_data,
                                 on_failure_callback=notify,
                                 on_retry_callback=notify,
                                 on_success_callback=notify,
                                 dag=dag)

#: Upload clean and geocode claims data
clean_geocode = PythonOperator(task_id='clean_geocode_claims',
                               python_callable=clean_geocode_claims,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Example LatestOnlyOperator and TriggerRule interactions
"""
import datetime as dt

import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.latest_only_operator import LatestOnlyOperator
from airflow.utils.trigger_rule import TriggerRule

dag = DAG(
    dag_id='latest_only_with_trigger',
    schedule_interval=dt.timedelta(hours=4),
    start_date=airflow.utils.dates.days_ago(2),
)

latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)

task1 = DummyOperator(task_id='task1', dag=dag)
task2 = DummyOperator(task_id='task2', dag=dag)
task3 = DummyOperator(task_id='task3', dag=dag)
task4 = DummyOperator(task_id='task4', dag=dag,
                      trigger_rule=TriggerRule.ALL_DONE)

latest_only >> task1 >> [task3, task4]
task2 >> [task3, task4]
def test_skipping_dagrun(self):
    latest_task = LatestOnlyOperator(
        task_id='latest',
        dag=self.dag)
    downstream_task = DummyOperator(
        task_id='downstream',
        dag=self.dag)
    downstream_task2 = DummyOperator(
        task_id='downstream_2',
        dag=self.dag)
    downstream_task.set_upstream(latest_task)
    downstream_task2.set_upstream(downstream_task)

    self.dag.create_dagrun(
        run_id="manual__1",
        start_date=timezone.utcnow(),
        execution_date=DEFAULT_DATE,
        state=State.RUNNING
    )

    self.dag.create_dagrun(
        run_id="manual__2",
        start_date=timezone.utcnow(),
        execution_date=timezone.datetime(2016, 1, 1, 12),
        state=State.RUNNING
    )

    self.dag.create_dagrun(
        run_id="manual__3",
        start_date=timezone.utcnow(),
        execution_date=END_DATE,
        state=State.RUNNING
    )

    latest_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task.run(start_date=DEFAULT_DATE, end_date=END_DATE)
    downstream_task2.run(start_date=DEFAULT_DATE, end_date=END_DATE)

    latest_instances = get_task_instances('latest')
    exec_date_to_latest_state = {
        ti.execution_date: ti.state for ti in latest_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'success',
        timezone.datetime(2016, 1, 1, 12): 'success',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_latest_state)

    downstream_instances = get_task_instances('downstream')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)

    downstream_instances = get_task_instances('downstream_2')
    exec_date_to_downstream_state = {
        ti.execution_date: ti.state for ti in downstream_instances}
    self.assertEqual({
        timezone.datetime(2016, 1, 1): 'skipped',
        timezone.datetime(2016, 1, 1, 12): 'skipped',
        timezone.datetime(2016, 1, 2): 'success'},
        exec_date_to_downstream_state)
def create_new_dags(dag_id, database):

    def f_print_log():
        return '{} start processing tables in database: {}'.format(
            dag_id, database)

    def f_check_table_exists(table_name):
        connect = PostgresHook(postgres_conn_id=database)
        query = """
            select count(1)
            from information_schema.tables
            where table_schema not like %s
              and table_name = %s
        """
        res = connect.get_first(query, parameters=('', table_name))
        if res[0] == 0:
            return 'create_table'
        else:
            return 'table_exists'

    def f_create_table(table_name):
        connect = PostgresHook(postgres_conn_id=database)
        query = """
            create table {}(
                id integer not null,
                "user" varchar(50) not null,
                timestamp timestamp not null
            )""".format(table_name)
        connect.run(query)

    def f_insert_row(table_name, **context):
        connect = PostgresHook(postgres_conn_id=database)
        user = context['ti'].xcom_pull(task_ids='get_current_user',
                                       key='return_value')
        # do not do this in production, sql injection is possible
        query = """
            insert into {} values(%s, %s, %s)
        """.format(table_name)
        connect.run(query, parameters=(uuid.uuid4().int % 123456789,
                                       user, datetime.now()))

    # for every table replicate records
    for key in config:
        table_name = config[key].get('table_name')
        name = '{}_table_{}'.format(dag_id, table_name)

        with DAG(name,
                 schedule_interval=config[key].get('schedule_interval'),
                 start_date=config[key].get('start_date')) as dag:

            # ignore the previous tasks, no backfilling
            latest_only = LatestOnlyOperator(task_id='latest_only', dag=dag)

            # logging the dag
            print_the_context = PythonOperator(task_id='print_the_context',
                                               python_callable=f_print_log,
                                               dag=dag)

            # write the current username to xcom
            get_current_user = BashOperator(task_id='get_current_user',
                                            bash_command='whoami',
                                            xcom_push=True,
                                            dag=dag)

            # check table exists
            check_table_exists = BranchPythonOperator(
                task_id='check_table_exists',
                python_callable=f_check_table_exists,
                op_args=[table_name],
                dag=dag)

            # create table
            create_table = PythonOperator(task_id='create_table',
                                          python_callable=f_create_table,
                                          op_args=[table_name],
                                          dag=dag)

            # skip table generation
            table_exists = DummyOperator(task_id='table_exists', dag=dag)

            # insert to a table
            insert_new_rows = PythonOperator(
                task_id='insert_new_rows',
                python_callable=f_insert_row,
                op_kwargs={'table_name': table_name},
                provide_context=True,
                dag=dag,
                trigger_rule='none_failed')

            # query a table
            query_the_table = PostgreSQLCountRows(task_id='query_the_table',
                                                  table_name=table_name,
                                                  connection_id=database,
                                                  dag=dag)

            latest_only >> print_the_context >> get_current_user >> \
                check_table_exists >> (create_table, table_exists) >> \
                insert_new_rows >> query_the_table

        yield name, dag
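# Hypothetical consumption sketch (not part of the source): the factory above
# yields (name, dag) pairs, so a DAG definition module can register each
# generated DAG at module level for the scheduler to pick up. The dag_id prefix
# and connection id below are illustrative assumptions.
for dag_name, generated_dag in create_new_dags('replicate', 'postgres_default'):
    globals()[dag_name] = generated_dag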
args = general.args
conf = general.config
schedule = general.schedule['gis_tree_canopy']
start_date = general.start_date['gis_tree_canopy']
folder = 'trees'
layer = 'tree_canopy'
datasd_name = 'tree_canopy_datasd'
path_to_file = conf['prod_data_dir'] + '/' + datasd_name

dag = DAG(dag_id='gis_{layer}'.format(layer=layer),
          default_args=args,
          start_date=start_date,
          schedule_interval=schedule)

#: Latest Only Operator for sdif
treecan_latest_only = LatestOnlyOperator(task_id='tree_canopy_latest_only', dag=dag)

#: Get tree canopy shapefile from Atlas
get_shapefiles = PythonOperator(task_id='get_tree_canopy_gis',
                                python_callable=sde_to_shp,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

#: Convert shp to geojson
shp_to_geojson = BashOperator(task_id='tree_canopy_to_geojson',
                              bash_command=shp_to_geojson(),
                              on_failure_callback=notify,
                              on_retry_callback=notify,
                              on_success_callback=notify,