def test_duplicate_group_id():
    """Verify that colliding task/group ids raise ``DuplicateTaskIdFound``.

    Covers four collision flavours: group-id vs existing task-id, nested group
    vs unprefixed outer group, task-id vs unprefixed group-id, and tasks named
    after a group's `upstream_join_id` / `downstream_join_id` proxy tasks.
    """
    from airflow.exceptions import DuplicateTaskIdFound

    logical_date = pendulum.parse("20200101")

    # A new group may not reuse an existing task's id.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'task1' .*"):
        with DAG("test_duplicate_group_id", start_date=logical_date):
            DummyOperator(task_id="task1")
            with TaskGroup("task1"):
                pass

    # A nested group may not collide with an unprefixed outer group's id.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG("test_duplicate_group_id", start_date=logical_date):
            DummyOperator(task_id="task1")
            with TaskGroup("group1", prefix_group_id=False):
                with TaskGroup("group1"):
                    pass

    # A task inside an unprefixed group may not reuse the group's own id.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG("test_duplicate_group_id", start_date=logical_date):
            with TaskGroup("group1", prefix_group_id=False):
                DummyOperator(task_id="group1")

    # 'downstream_join_id' is reserved for the group's join proxy task.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1.downstream_join_id' .*"):
        with DAG("test_duplicate_group_id", start_date=logical_date):
            DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                DummyOperator(task_id="downstream_join_id")

    # 'upstream_join_id' is likewise reserved.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1.upstream_join_id' .*"):
        with DAG("test_duplicate_group_id", start_date=logical_date):
            DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                DummyOperator(task_id="upstream_join_id")
def get_benchmarks(self):
    """Build the benchmark tasks and attach indexers under an "Index Results" group.

    Reads the benchmark definitions from ``self.vars["benchmarks"]``, creates
    the corresponding tasks via ``self._get_benchmarks``, and wraps the
    indexer tasks in a TaskGroup so they render as one node in the Graph view.
    Returns the benchmark tasks.
    """
    bench_tasks = self._get_benchmarks(self.vars["benchmarks"])
    # prefix_group_id=False keeps the indexer task ids unprefixed.
    with TaskGroup("Index Results", prefix_group_id=False, dag=self.dag):
        self._add_indexers(bench_tasks)
    return bench_tasks
# under the License. """Example DAG demonstrating the usage of the TaskGroup.""" from airflow.models.dag import DAG from airflow.operators.bash import BashOperator from airflow.operators.dummy import DummyOperator from airflow.utils.dates import days_ago from airflow.utils.task_group import TaskGroup # [START howto_task_group] with DAG(dag_id="example_task_group", start_date=days_ago(2), tags=["example"]) as dag: start = DummyOperator(task_id="start") # [START howto_task_group_section_1] with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1: task_1 = DummyOperator(task_id="task_1") task_2 = BashOperator(task_id="task_2", bash_command='echo 1') task_3 = DummyOperator(task_id="task_3") task_1 >> [task_2, task_3] # [END howto_task_group_section_1] # [START howto_task_group_section_2] with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2: task_1 = DummyOperator(task_id="task_1") # [START howto_task_group_inner_section_2] with TaskGroup("inner_section_2", tooltip="Tasks for inner_section2") as inner_section_2: task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
def taskflow_dag():
    """Define the MOVE ETL pipeline as a single DAG of TaskGroups.

    Each ``with TaskGroup(...)`` block below wraps one stage of the pipeline
    (schema updates, GIS/shapefile copies, centreline conflation, geocoding,
    vector-tile builds, open-data exports); the ``>>`` wiring at the end of
    this function chains the stages together.
    """
    # Update replicator tables
    # This task group will take the new tables created by the on-prem replicator under the schema TRAFFIC_NEW
    # and alter the schema to TRAFFIC_INTER and then create the materialized view TRAFFIC on top
    # but it will only do this if the TRAFFIC_NEW table exists (ie. a new dataset was written)
    with TaskGroup(group_id="replicator_update_schema") as replicator_update_schema:
        group_id = "replicator_update_schema"
        ACC = create_bash_task_nested(group_id, 'acc')
        ARC_LINK = create_bash_task_nested(group_id, 'arc_link')
        ARTERYDATA = create_bash_task_nested(group_id, 'arterydata')
        CATEGORY = create_bash_task_nested(group_id, "category")
        CNT_DET = create_bash_task_nested(group_id, 'cnt_det')
        CNT_SPD = create_bash_task_nested(group_id, 'cnt_spd')
        COUTNINFO = create_bash_task_nested(group_id, 'countinfo')
        COUNTINFOMICS = create_bash_task_nested(group_id, 'countinfomics')
        DET = create_bash_task_nested(group_id, 'det')
        NODE = create_bash_task_nested(group_id, 'node')
        # Tables are updated strictly one after another.
        ACC >> ARC_LINK
        ARC_LINK >> ARTERYDATA
        ARTERYDATA >> CATEGORY
        CATEGORY >> CNT_DET
        CNT_DET >> CNT_SPD
        CNT_SPD >> COUTNINFO
        COUTNINFO >> COUNTINFOMICS
        COUNTINFOMICS >> DET
        DET >> NODE
        NODE  # no-op expression; NODE is the tail of the chain

    # GCC's ArcGIS REST API server exposes a series of "services", each with a name like
    # `cot_geospatial2`. Within those services, individual layers have an ID
    # (in parentheses, after the layer name).
    with TaskGroup(group_id="copy_gis_layers") as copy_gis_layers:
        # task_id -> (mapserver_name, layer_id) on the GCC ArcGIS server.
        TASKS = {
            'bikeway': ('cot_geospatial2', 2),
            'accessible_signal': ('cot_geospatial2', 4),
            'pedestrian_crossover': ('cot_geospatial2', 7),
            'traffic_signal': ('cot_geospatial2', 9),
            'hospital': ('cot_geospatial10', 21),
            'toinview_program_point': ('cot_geospatial12', 46),
            'toinview_program_line': ('cot_geospatial12', 47),
            'toinview_program_polygon': ('cot_geospatial12', 48),
            'school': ('cot_geospatial28', 17)
        }
        # One independent copy task per layer; no intra-group dependencies.
        for task_id, task_args in TASKS.items():
            mapserver_name, layer_id = task_args
            params = {'mapserver_name': mapserver_name, 'layer_id': layer_id}
            bash_task = BashOperator(task_id=task_id, bash_command='/copy_gis_layer.sh', params=params)
            bash_task  # no-op expression statement

    # The Open Data Portal (i.e. CKAN) stores resources at URLs of format
    # `${BASE_URL}/dataset/${DATASET_ID}/resource/${RESOURCE_ID}/download/${FILENAME}`.
    #
    # To find these resource URLs:
    #
    # - find the dataset in the Open Data Portal (for instance, the Toronto Centreline
    #   is at https://open.toronto.ca/dataset/toronto-centreline-tcl/);
    # - open the "For Developers" tab in the carousel;
    # - find the dataset ID listed in `params`;
    # - use this to request `${BASE_URL}/action/package_show?id=${DATASET_ID}`;
    # - in there, look for the URL under `result.resources[].url`.
    with TaskGroup(group_id="copy_opendata_shapefiles") as copy_opendata_shapefiles:
        group_id = "copy_opendata_shapefiles"
        # task_id -> CKAN resource URL and source SRID for reprojection.
        TASK_GROUP = {
            'centreline': {
                'resource_url': 'https://ckanadmin0.intra.prod-toronto.ca/dataset/1d079757-377b-4564-82df-eb5638583bfb/resource/7209841e-e59c-49e4-9205-3b0587f2eea9/download/centreline_wgs84_v2.zip',
                'source_srid': 3857
            },
            'centreline_intersection': {
                'resource_url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2c83f641-7808-49ba-b80f-7011851d4e27/resource/c2fc0db0-7dcd-4c13-a54c-f39debc441bd/download/intersection-file-wgs84.zip',
                'source_srid': 4326
            }
        }
        for task_id, params in TASK_GROUP.items():
            task_id_extract = '{0}_extract'.format(task_id)
            # One nested sub-group per shapefile: extract -> load -> index.
            with TaskGroup(group_id=f'{task_id}'):
                # NOTE(review): this bash_command has no {task_id} placeholder,
                # so .format() is a no-op here — confirm intent.
                INDEX_OPENDATA = BashOperator(
                    task_id='index_opendata',
                    bash_command='/copy_opendata_shapefiles/index_opendata.sh'.format(task_id=task_id))
                EXTRACT_OPENDATA_SHAPEFILE = BashOperator(
                    task_id=task_id_extract,
                    bash_command='/copy_opendata_shapefiles/extract_opendata_shapefile.sh',
                    params={
                        'name': task_id,
                        'resource_url': params['resource_url']
                    })
                task_id_load = '{0}_load'.format(task_id)
                LOAD_SHAPEFILE = BashOperator(
                    task_id=task_id_load,
                    bash_command='/copy_opendata_shapefiles/load_shapefile.sh',
                    params={
                        'name': task_id,
                        'source_srid': params['source_srid']
                    })
                EXTRACT_OPENDATA_SHAPEFILE >> LOAD_SHAPEFILE >> INDEX_OPENDATA

    # centreline_conflation_target
    #
    # Normalize the Toronto Centreline into common _conflation target_ and _routing target_
    # views, for use by other pipelines.
    #
    # The conflation target consists of two views `centreline.midblocks`, `centreline.intersections`.
    # The midblocks and intersections in these views are shown on MOVE's map. When we conflate
    # collisions and traffic studies to the centreline, we only conflate those to centreline features
    # that are in this conflation target.
    #
    # The routing target consists of two views `centreline.routing_vertices`, `centreline.routing_edges`
    # and is a superset of the conflation target. This exists because the conflation target is not a
    # valid graph (in the graph theory sense); some midblock endpoints refer to intersection IDs that do
    # not correspond to actual intersections. To fix this, the routing target fills in vertices for
    # those intersection IDs. When routing corridors between centreline features, we use the routing
    # target, then filter the result down to only those features in the conflation target.
    #
    # This is intended to run after `copy_opendata_shapefiles`.
    with TaskGroup(group_id="centreline_conflation_target") as centreline_conflation_target:
        group_id = "centreline_conflation_target"
        A0_INTERSECTIONS_BASE = create_bash_task_nested(group_id, 'A0_intersections_base')
        A0_MIDBLOCKS_BASE = create_bash_task_nested(group_id, 'A0_midblocks_base')
        A1_INTERSECTION_IDS = create_bash_task_nested(group_id, 'A1_intersection_ids')
        A2_INTERSECTIONS = create_bash_task_nested(group_id, "A2_intersections")
        A3_MIDBLOCK_NAMES = create_bash_task_nested(group_id, 'A3_midblock_names')
        A4_MIDBLOCKS = create_bash_task_nested(group_id, 'A4_midblocks')
        A5_ROUTING_VERTICES = create_bash_task_nested(group_id, 'A5_routing_vertices')
        A6_ROUTING_EDGES = create_bash_task_nested(group_id, 'A6_routing_edges')
        # Both base views fan in to the intersection-id step, then a linear chain.
        [A0_INTERSECTIONS_BASE, A0_MIDBLOCKS_BASE] >> A1_INTERSECTION_IDS
        A1_INTERSECTION_IDS >> A2_INTERSECTIONS
        A2_INTERSECTIONS >> A3_MIDBLOCK_NAMES
        A3_MIDBLOCK_NAMES >> A4_MIDBLOCKS
        A4_MIDBLOCKS >> A5_ROUTING_VERTICES
        A5_ROUTING_VERTICES >> A6_ROUTING_EDGES

    # """
    # gis_layers_vector_tiles
    #
    # Generates vector tiles from GIS layers provided by GCC, which are loaded into our database by
    # the `copy_gis_layers` DAG. These are stored in `/data/tiles`, and are served from `/tiles` on
    # our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render information
    # about schools, hospitals, and other points of interest when zoomed in.
    #
    # This is intended to run after `copy_gis_layers`.
    # """
    with TaskGroup(group_id="gis_layers_vector_tiles") as gis_layers_vector_tiles:
        BUILD_GIS_LAYERS_TILES = create_bash_task('build_gis_layers_tiles')
        EXTRACT_GIS_LAYERS_TILES = create_bash_task('extract_gis_layers_tiles')
        BUILD_GIS_LAYERS_TILES >> EXTRACT_GIS_LAYERS_TILES

    """
    location_search_index

    Builds the views and indexes that support location search, and also builds an index of
    midblock names.

    This is intended to run after `centreline_conflation_target` and `copy_gis_layers`
    """
    with TaskGroup(group_id="location_search_index") as location_search_index:
        group_id = "location_search_index"
        TRANSFORM_CENTRELINE_INDEX = create_bash_task_nested(group_id, 'transform_centreline_index')
        TRANSFORM_INTERSECTIONS_INDEX = create_bash_task_nested(group_id, 'transform_intersections_index')
        TRANSFORM_TRAFFIC_SIGNAL = create_bash_task_nested(group_id, 'transform_traffic_signal')
        TRANSFORM_TRAFFIC_SIGNAL  # no-op expression; task has no dependencies
        TRANSFORM_CENTRELINE_INDEX >> TRANSFORM_INTERSECTIONS_INDEX

    """
    centreline_vector_tiles

    Generates vector tiles from the MOVE conflation target, which is built by the
    `centreline_conflation_target` DAG. These are stored in `/data/tiles`, and are served from
    `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render
    interactive centreline features.

    This is intended to run after `centreline_conflation_target`.
    """
    with TaskGroup(group_id='centreline_vector_tiles') as centreline_vector_tiles:
        group_id = 'centreline_vector_tiles'
        LOAD_VOLUME = create_bash_task_nested(group_id, 'load_volume')
        BUILD_VECTOR_TILES = create_bash_task_nested(group_id, 'build_vector_tiles')
        EXTRACT_VECTOR_TILES = create_bash_task_nested(group_id, 'extract_vector_tiles')
        LOAD_VOLUME >> BUILD_VECTOR_TILES
        BUILD_VECTOR_TILES >> EXTRACT_VECTOR_TILES

    """
    arteries_geocoding

    Uses arterycode matching information and processes as originally developed by Data + Analytics
    to link counts with the Toronto centreline.

    The legacy FLOW system was not based on the Toronto Centreline, but rather used a legacy map
    layer that is no longer supported. In FLOW, arterycodes identified locations in that legacy
    map layer. To use these with the Toronto Centreline, we apply a series of heuristics developed
    by Data + Analytics: ID matching on `LINKID`, spatial matches, etc.

    This is the first step in our FLOW geocoding cascade, which continues with the DAGs
    `group_multidirection_arteries` and `group_multiday_counts`. All three DAGs must run before
    MOVE is considered to have updated its copy of FLOW data.

    This is intended to run after `replicator_transfer_flow` and `centreline_conflation_target`.
    """
    with TaskGroup(group_id="arteries_geocoding") as arteries_geocoding:
        group_id = "arteries_geocoding"
        A1_ARTERIES_MANUAL_CORR = create_bash_task_nested(group_id, 'A1_arteries_manual_corr')
        A1_NODES_CORRECTED = create_bash_task_nested(group_id, 'A1_nodes_corrected')
        A2_NODES_CENTRELINE = create_bash_task_nested(group_id, 'A2_nodes_centreline')
        B1_ARTERIES_PX_CENTRELINE = create_bash_task_nested(group_id, 'B1_arteries_px_centreline')
        B2_ARTERIES_MANUAL_CORR_NORMALIZED = create_bash_task_nested(group_id, 'B2_arteries_manual_corr_normalized')
        C1_ARTERIES_LINKS = create_bash_task_nested(group_id, 'C1_arteries_links')
        C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS = create_bash_task_nested(group_id, 'C2_arteries_double_link_midblocks')
        C2_ARTERIES_DOUBLE_NODE = create_bash_task_nested(group_id, 'C2_arteries_double_node')
        C2_ARTERIES_SINGLE_NODE = create_bash_task_nested(group_id, 'C2_arteries_single_node')
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS = create_bash_task_nested(group_id, 'C3_arteries_double_node_midblocks')
        C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST = create_bash_task_nested(group_id, 'C4_arteries_double_node_midblocks_multi_best')
        D1_ARTERIES_CENTRELINE_TABLE = create_bash_task_nested(group_id, 'D1_arteries_centreline_table')
        D2_ARTERY_GEOCODING = create_bash_task_nested(group_id, 'D2_artery_geocoding')
        D3_ARTERIES_CENTRELINE_VIEW = create_bash_task_nested(group_id, 'D3_arteries_centreline_view')
        A1_NODES_CORRECTED >> A2_NODES_CENTRELINE
        A1_ARTERIES_MANUAL_CORR >> B2_ARTERIES_MANUAL_CORR_NORMALIZED
        A2_NODES_CENTRELINE >> C2_ARTERIES_DOUBLE_NODE
        C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_NODE
        A2_NODES_CENTRELINE >> C2_ARTERIES_SINGLE_NODE
        C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS
        C1_ARTERIES_LINKS >> C2_ARTERIES_SINGLE_NODE
        C2_ARTERIES_DOUBLE_NODE >> C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST
        # All upstream matching heuristics fan in to the D1 combined table.
        A2_NODES_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE
        B1_ARTERIES_PX_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE
        B2_ARTERIES_MANUAL_CORR_NORMALIZED >> D1_ARTERIES_CENTRELINE_TABLE
        C1_ARTERIES_LINKS >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_SINGLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_DOUBLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE
        C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST >> D1_ARTERIES_CENTRELINE_TABLE
        D1_ARTERIES_CENTRELINE_TABLE >> D2_ARTERY_GEOCODING
        D2_ARTERY_GEOCODING >> D3_ARTERIES_CENTRELINE_VIEW

    """
    crash_geocoding

    Normalizes CRASH data into collision _events_ and collision _involved persons_, then matches
    collision events to the centreline conflation target that was created by
    `centreline_conflation_target`.

    Our legacy schema in Oracle stores both event-related and involved-person-related information
    in a single table, `TRAFFIC.ACC`. That table has one record per involved person, with
    event-level details copied across all persons involved in a collision. To make this easier to
    work with in MOVE, we transform `TRAFFIC.ACC` into a normalized representation.

    To match collisions to the centreline, we use the following heuristic:

    - if there are any intersections within 20m, match to the closest such intersection;
    - otherwise, if there are any midblocks within 20m, match to the closest such midblock;
    - otherwise, do not match.

    This same heuristic was used by the legacy CRASH system to assign collisions to intersections
    and midblocks. (However, CRASH did not use the Toronto Centreline, but instead used a legacy
    map layer that has been deprecated and is no longer maintained by the City.)

    This is intended to run after `replicator_transfer_crash` and `centreline_conflation_target`.
    """
    with TaskGroup(group_id="crash_geocoding") as crash_geocoding:
        group_id = 'crash_geocoding'
        A1_EVENTS_FIELDS_RAW = create_bash_task_nested(group_id, 'A1_events_fields_raw')
        A2_EVENTS_FIELDS_NORM = create_bash_task_nested(group_id, 'A2_events_fields_norm')
        A2_INVOLVED_FIELDS_RAW = create_bash_task_nested(group_id, 'A2_involved_fields_raw')
        A3_INVOLVED_FIELDS_NORM = create_bash_task_nested(group_id, 'A3_involved_fields_norm')
        A4_INVOLVED = create_bash_task_nested(group_id, 'A4_involved')
        A5_EVENTS = create_bash_task_nested(group_id, 'A5_events')
        A6_EVENTS_INTERSECTIONS = create_bash_task_nested(group_id, 'A6_events_intersections')
        A6_EVENTS_SEGMENTS = create_bash_task_nested(group_id, 'A6_events_segments')
        A7_EVENTS_CENTRELINE = create_bash_task_nested(group_id, 'A7_events_centreline')
        A1_EVENTS_FIELDS_RAW >> A2_EVENTS_FIELDS_NORM
        A1_EVENTS_FIELDS_RAW >> A2_INVOLVED_FIELDS_RAW
        A2_EVENTS_FIELDS_NORM >> A3_INVOLVED_FIELDS_NORM
        A2_INVOLVED_FIELDS_RAW >> A3_INVOLVED_FIELDS_NORM
        A3_INVOLVED_FIELDS_NORM >> A4_INVOLVED
        A4_INVOLVED >> A5_EVENTS
        A5_EVENTS >> A6_EVENTS_INTERSECTIONS
        A5_EVENTS >> A6_EVENTS_SEGMENTS
        A6_EVENTS_INTERSECTIONS >> A7_EVENTS_CENTRELINE
        A6_EVENTS_SEGMENTS >> A7_EVENTS_CENTRELINE

    """
    collisions_vector_tiles

    Generates vector tiles from collisions data, which is built by the `crash_geocoding` DAG.
    These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances;
    they are used by `FcPaneMap` in the web frontend to render collisions heatmaps when zoomed
    out.

    This is intended to run after `crash_geocoding`.
    """
    with TaskGroup(group_id="collisions_vector_tiles") as collisions_vector_tiles:
        group_id = "collisions_vector_tiles"
        BUILD_COLLISIONS_TILES = create_bash_task_nested(group_id, 'build_collisions_tiles')
        EXTRACT_COLLISIONS_TILES = create_bash_task_nested(group_id, 'extract_collisions_tiles')
        BUILD_COLLISIONS_TILES >> EXTRACT_COLLISIONS_TILES

    """
    group_multidirection_arteries

    Continues the FLOW geocoding process started by `arteries_geocoding`, by identifying
    arterycodes that refer to different directions of travel in the same location and grouping
    them together.

    When a traffic study is requested, it might ask for 3 days of data collection on a 2-way
    street; someone requesting this study would want to see all 3 days in both directions of
    travel. However, the legacy FLOW schema uses separate arterycodes for different directions
    of travel, and also uses separate `COUNT_INFO_ID`s for each day of a traffic study.

    As a first step towards delivering all data for this study at once, we need to identify the
    arterycodes that correspond to these two directions of travel, and group them together. Once
    that's done, the DAG `group_multiday_counts` then takes care of grouping together the 3 days
    of the traffic study, so that we can get all six relevant counts in database.

    Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we
    have no reliable way to visualize that much data at once.

    This is intended to run after `arteries_geocoding`.
    """
    with TaskGroup(group_id='group_multidirection_arteries') as group_multidirection_arteries:
        group_id = 'group_multidirection_arteries'
        A1_ARTERIES_DOUBLE_LINK_PAIRS = create_bash_task_nested(group_id, 'A1_arteries_double_link_pairs')
        A1_ARTERIES_MIDBLOCK_SOLO = create_bash_task_nested(group_id, 'A1_arteries_midblock_solo')
        A2_ARTERIES_GROUPS_PRE = create_bash_task_nested(group_id, 'A2_arteries_groups_pre')
        A3_ARTERIES_GROUPS_RANKED = create_bash_task_nested(group_id, 'A3_arteries_groups_ranked')
        A4_ARTERIES_GROUPS_POST = create_bash_task_nested(group_id, 'A4_arteries_groups_post')
        A1_ARTERIES_DOUBLE_LINK_PAIRS >> A2_ARTERIES_GROUPS_PRE
        A1_ARTERIES_MIDBLOCK_SOLO >> A2_ARTERIES_GROUPS_PRE
        A2_ARTERIES_GROUPS_PRE >> A3_ARTERIES_GROUPS_RANKED
        A3_ARTERIES_GROUPS_RANKED >> A4_ARTERIES_GROUPS_POST

    """
    group_multiday_counts

    Finishes the FLOW geocoding process started by `arteries_geocoding` and continued by
    `group_multidirection_arteries`, by identifying consecutive days of data collection from the
    same arterycode group and grouping those together into a single study.

    When a traffic study is requested, it might ask for 3 days of data collection on a 2-way
    street; someone requesting this study would want to see all 3 days in both directions of
    travel. However, the legacy FLOW schema uses separate arterycodes for different directions
    of travel, and also uses separate `COUNT_INFO_ID`s for each day of a traffic study.

    Once `group_multidirection_arteries` has completed, we've identified the arterycodes that
    correspond to these two directions of travel. To find all data for the study, we now need to
    group together the 3 days over which data was collected at these two arterycodes. However,
    not all studies are of the same duration. To detect studies, we use runs of consecutive days
    at the same arterycode group.

    Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we
    have no reliable way to visualize that much data at once.

    This is intended to run after `group_multidirection_arteries`.
    """
    with TaskGroup(group_id='group_multiday_counts') as group_multiday_counts:
        group_id = 'group_multiday_counts'
        A1_COUNTS_MULTIDAY_RUNS = create_bash_task_nested(group_id, 'A1_counts_multiday_runs')
        A2_ARTERIES_COUNTS_GROUPS = create_bash_task_nested(group_id, 'A2_arteries_counts_groups')
        A3_STUDIES = create_bash_task_nested(group_id, 'A3_studies')
        A4_COUNTS2_STUDIES = create_bash_task_nested(group_id, 'A4_counts2_studies')
        A1_COUNTS_MULTIDAY_RUNS >> A2_ARTERIES_COUNTS_GROUPS
        A2_ARTERIES_COUNTS_GROUPS >> A3_STUDIES
        A3_STUDIES >> A4_COUNTS2_STUDIES

    """
    open_data_tmcs

    Builds the [Traffic Volumes at Intersections for All Modes](https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/)
    dataset for the City of Toronto Open Data Portal.

    The dataset is exposed in two ways: via database, and via HTTP. We store the dataset as a
    series of views in the `open_data` schema. We also dump those views to CSV files at
    `/data/open_data`, which is served from `/open_data` on our ETL EC2 instances.

    This is intended to run after `group_multiday_counts`.
    """
    with TaskGroup(group_id='open_data_tmcs') as open_data_tmcs:
        group_id = 'open_data_tmcs'
        A1_TMCS_COUNT_DATA = create_bash_task_nested(group_id, 'A1_tmcs_count_data')
        A1_TMCS_COUNT_METADATA = create_bash_task_nested(group_id, 'A1_tmcs_count_metadata')
        A2_TMCS_LOCATIONS = create_bash_task_nested(group_id, 'A2_tmcs_locations')
        A3_TMCS_JOINED = create_bash_task_nested(group_id, 'A3_tmcs_joined')
        A4_TMCS_DECADES = create_bash_task_nested(group_id, 'A4_tmcs_decades')
        A4_TMCS_PREVIEW = create_bash_task_nested(group_id, 'A4_tmcs_preview')
        A1_TMCS_COUNT_DATA >> A2_TMCS_LOCATIONS
        A1_TMCS_COUNT_METADATA >> A2_TMCS_LOCATIONS
        A2_TMCS_LOCATIONS >> A3_TMCS_JOINED
        A3_TMCS_JOINED >> A4_TMCS_DECADES
        A3_TMCS_JOINED >> A4_TMCS_PREVIEW

    # Top-level wiring between the task groups defined above.
    replicator_update_schema >> copy_gis_layers
    replicator_update_schema >> copy_opendata_shapefiles
    [copy_gis_layers, copy_opendata_shapefiles] >> centreline_conflation_target
    [copy_gis_layers, copy_opendata_shapefiles] >> gis_layers_vector_tiles
    centreline_conflation_target >> location_search_index
    centreline_conflation_target >> centreline_vector_tiles
    centreline_conflation_target >> arteries_geocoding
    centreline_conflation_target >> crash_geocoding
    crash_geocoding >> collisions_vector_tiles
    arteries_geocoding >> group_multidirection_arteries
    group_multidirection_arteries >> group_multiday_counts
    group_multiday_counts >> open_data_tmcs
max_active_runs=3, schedule_interval="@daily", default_args={ "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=1), }, catchup=False, template_searchpath="/usr/local/airflow/include", ) as dag: t0 = DummyOperator(task_id="start") # Define Task Group with Postgres Queries with TaskGroup("covid_table_queries") as covid_table_queries: for state in states: generate_files = PostgresOperator( task_id="covid_query_{0}".format(state), postgres_conn_id="gpdb", sql="covid_state_query.sql", params={"state": "'" + state + "'"}, ) # Define task to send email send_email = EmailOperator( task_id="send_email", to=email_to, subject="Covid Greenplum Queries DAG", html_content= "<p>The Covid queries were run on Greenplum successfully. <p>",
default_args=args, schedule_interval=None, tags=['trigger'] ) task_start = DummyOperator(task_id='start_task', dag=dag) task_list = [DummyOperator(task_id='task_success_' + str(option), dag=dag) for option in range(1,5)] def make_skip(**kwargs): raise AirflowSkipException("Skip this task and individual downstream tasks while respecting trigger rules.") def make_fail(**kwargs): raise ValueError('Make Error Force') with TaskGroup("case_group", dag=dag) as case_group: task_skipped = PythonOperator( task_id='task_skipped', provide_context=True, python_callable=make_skip, dag=dag ) task_failed = PythonOperator( task_id='task_failed', provide_context=True, python_callable=make_fail, dag=dag ) task_all_success = DummyOperator(
if accaracy > 2: return ['accurate', 'in_accurate'] return 'in_accurate' with DAG('xcom_dag', schedule_interval='@daily', default_args=default_args, catchup=False) as dag: downloading_data = BashOperator(task_id='downloading_data', bash_command='sleep 3', do_xcom_push=False) with TaskGroup('processing_tasks') as processing_tasks: training_model_a = PythonOperator(task_id='training_model_a', python_callable=_training_model) training_model_b = PythonOperator(task_id='training_model_b', python_callable=_training_model) training_model_c = PythonOperator(task_id='training_model_c', python_callable=_training_model) choose_model = BranchPythonOperator(task_id='task_4', python_callable=_choose_best_model) accurate = DummyOperator(task_id='accurate') in_accurate = DummyOperator(task_id='in_accurate')
def test_sub_dag_task_group():
    """
    Tests dag.sub_dag() updates task_group correctly.
    """
    execution_date = pendulum.parse("20200101")
    with DAG("test_test_task_group_sub_dag", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group234") as group234:
            _ = DummyOperator(task_id="task2")
            with TaskGroup("group34") as group34:
                _ = DummyOperator(task_id="task3")
                _ = DummyOperator(task_id="task4")
        with TaskGroup("group6") as group6:
            _ = DummyOperator(task_id="task6")
        task7 = DummyOperator(task_id="task7")
        task5 = DummyOperator(task_id="task5")

        task1 >> group234
        group34 >> task5
        group234 >> group6
        group234 >> task7

    # Take the sub-DAG upstream of task5: task1 -> group234(.group34) -> task5.
    subdag = dag.sub_dag(task_ids_or_regex="task5", include_upstream=True, include_downstream=False)

    # group6, task6 and task7 are downstream-only, so they must be filtered out;
    # task2 is not upstream of task5 and is likewise dropped from group234.
    assert extract_node_id(task_group_to_dict(subdag.task_group)) == {
        'id': None,
        'children': [
            {
                'id': 'group234',
                'children': [
                    {
                        'id': 'group234.group34',
                        'children': [
                            {'id': 'group234.group34.task3'},
                            {'id': 'group234.group34.task4'},
                            {'id': 'group234.group34.downstream_join_id'},
                        ],
                    },
                    {'id': 'group234.upstream_join_id'},
                ],
            },
            {'id': 'task1'},
            {'id': 'task5'},
        ],
    }

    # Edges must route through the groups' upstream/downstream join proxy tasks.
    edges = dag_edges(subdag)
    assert sorted((e["source_id"], e["target_id"]) for e in edges) == [
        ('group234.group34.downstream_join_id', 'task5'),
        ('group234.group34.task3', 'group234.group34.downstream_join_id'),
        ('group234.group34.task4', 'group234.group34.downstream_join_id'),
        ('group234.upstream_join_id', 'group234.group34.task3'),
        ('group234.upstream_join_id', 'group234.group34.task4'),
        ('task1', 'group234.upstream_join_id'),
    ]

    subdag_task_groups = subdag.task_group.get_task_group_dict()
    assert subdag_task_groups.keys() == {None, "group234", "group234.group34"}

    included_group_ids = {"group234", "group234.group34"}
    included_task_ids = {
        'group234.group34.task3', 'group234.group34.task4', 'task1', 'task5'
    }

    # Every group's and task's relations must reference only ids kept in the sub-DAG.
    for task_group in subdag_task_groups.values():
        assert task_group.upstream_group_ids.issubset(included_group_ids)
        assert task_group.downstream_group_ids.issubset(included_group_ids)
        assert task_group.upstream_task_ids.issubset(included_task_ids)
        assert task_group.downstream_task_ids.issubset(included_task_ids)

    for task in subdag.task_group:
        assert task.upstream_task_ids.issubset(included_task_ids)
        assert task.downstream_task_ids.issubset(included_task_ids)
def methodPrint(n): print('This is odd ::' + str(n)) def the_end(): print('The End') with DAG(dag_id='TaskGroup_BranchPythonOperator', schedule_interval=None, start_date=days_ago(2)) as dag: task_1 = PythonOperator(task_id='task_1', python_callable=method1) task_2 = BranchPythonOperator(task_id='task_2', python_callable=method2) with TaskGroup('group1') as group1: task_x = PythonOperator(task_id='task_x', python_callable=printMethod, op_kwargs={'n': 1}) task_n = [ PythonOperator(task_id=f'task_{i}', python_callable=printMethod, op_kwargs={'n': i}) for i in range(2, 6) ] task_x >> task_n with TaskGroup('group2') as group2: task_x = PythonOperator(task_id='task_x', python_callable=methodPrint,
dih_stg 2. Creates all tables in filter database. linqdm_filter 3. Creates all tables in Fact database. linqdm_fdn 4. Creates all tables in Base database for now this database will have only DIH tables. dih """ # Set the batch id from Airflow dag run setbatch = getpythonoperator("BatchId", getBatchId) batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}" for db in database: with TaskGroup(group_id="{}_Tab".format(db)) as run_stage0: stagetaskgrp = [] with TaskGroup(group_id="{}_S2HS".format(db)) as run_stage1: for tabname in database[db]["tabname"]: taskname = "CRT_{}_{}".format(db, tabname) taskid = 'TA_' + taskname commands = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -oGSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "{} {} {} {} {} {}".format(scriptpaths["hiveload"], tabname , batchid, 'ddl', db, database[db]["type"])) ssh_create_stage = getbashoperator(taskname, False, commands) ssh_create_stage stagetaskgrp.append(run_stage1) run_stage1 group.append(run_stage0) dummyop = DummyOperator(task_id='NoOP')
# DAG: fan out one upload task per endpoint, grouped, then notify by email.
with DAG(
    'covid_data_to_s3',
    start_date=datetime(2019, 1, 1),
    max_active_runs=1,
    # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs
    schedule_interval='@daily',
    default_args=default_args,
    catchup=False  # enable if you don't want historical dag runs to run
) as dag:
    t0 = DummyOperator(task_id='start')

    # Success notification, sent once the whole task group has finished.
    send_email = EmailOperator(
        task_id='send_email',
        to=email_to,
        subject='Covid to S3 DAG',
        html_content='<p>The Covid to S3 DAG completed successfully. Files can now be found on S3. <p>'
    )

    # One independent upload task per API endpoint.
    with TaskGroup('covid_task_group') as covid_group:
        for endpoint in endpoints:
            generate_files = PythonOperator(
                task_id='generate_file_{0}'.format(endpoint),
                python_callable=upload_to_s3,
                op_kwargs={'endpoint': endpoint, 'date': date})

    t0 >> covid_group >> send_email
# DAG id is derived from this file's name (without the .py extension).
DAG_ID = os.path.basename(__file__).replace('.py', '')

DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
}

dag_args = {
    "dag_id": DAG_ID,
    "description": 'Run built-in Spark app on Amazon EMR',
    "default_args": DEFAULT_ARGS,
    "dagrun_timeout": timedelta(hours=2),
    "start_date": datetime(2020, 1, 1),
    "schedule_interval": '@once',
    "tags": ['emr'],
}

with DAG(**dag_args) as dag:
    task_1 = BashOperator(task_id='task_1', bash_command='sleep 3')

    # task_2 and task_3 run in parallel inside the group.
    with TaskGroup('processing_tasks') as parallel_task:
        task_2 = BashOperator(task_id='task_2', bash_command='sleep 3')
        task_3 = BashOperator(task_id='task_3', bash_command='sleep 3')

    # NOTE(review): source formatting was collapsed; task_4 is placed outside
    # the group because the wiring below makes it downstream of the whole
    # group — confirm against the original file.
    task_4 = BashOperator(task_id='task_4', bash_command='sleep 3')

    task_1 >> parallel_task >> task_4
} with DAG( dag_id="example_from_home_estate_nybolig_boliga", description="Populate data from home.dk estate.dk and nybolig.dk", default_args=args, schedule_interval="@daily", start_date=datetime(2021, 5, 1, 22, 45), catchup=False, max_active_runs=4, tags=["estate_data"], ) as dag: start = DummyOperator(task_id="start") with TaskGroup("home", tooltip="Tasks for Home") as home: home_scraper_section() estate = ScrapEstateOperator( task_id="estate", url="https://www.estate.dk/Services/PropertySearch/Search", api_name="estate.dk", scraper_cls=Estate, params=params, ) nybolig = ScrapEstateOperator( task_id="nybolig", url="https://www.nybolig.dk/Services/PropertySearch/Search", api_name="nybolig.dk", scraper_cls=Nybolig,
# DAG "dag01": a start task, then a TaskGroup that runs a load/dump step
# followed by two query tasks fanning out in parallel.
from airflow.models.dag import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup

from dags.dag01.model import load_dump, querys_no_return, querys_with_return

with DAG(dag_id="dag01", start_date=days_ago(1), tags=["test"]) as dag:
    start = PythonOperator(
        task_id="start",
        trigger_rule='all_success',
        python_callable=querys_with_return)

    with TaskGroup("section_1", tooltip="pipeline xpto") as section_1:
        task_load_dump = PythonOperator(
            task_id="task_load_dump",
            trigger_rule='all_success',
            python_callable=load_dump)
        task_querys_with_return = PythonOperator(
            task_id="task_querys_with_return",
            trigger_rule='all_success',
            python_callable=querys_with_return)
        task_querys_no_return = PythonOperator(
            task_id="task_querys_no_return",
            trigger_rule='all_success',
            python_callable=querys_no_return)

        # dump first, then both query tasks run in parallel
        task_load_dump >> [task_querys_with_return, task_querys_no_return]
# DAG demonstrating XCom pushes/pulls between tasks.
# NOTE(review): nesting reconstructed from a collapsed source line — task_5
# is assumed to follow the group since it is chained after it; confirm.
with DAG(
    dag_id="demo_xcom",
    description="This is a DAG to demo how x_com works",
    catchup=False,
    max_active_runs=1,
    schedule_interval=timedelta(days=1),
    default_args=default_args,
) as dag:
    task_1 = BashOperator(
        dag=dag,
        task_id="task_1",
        do_xcom_push=False,  # bash output deliberately not pushed to XCom
        bash_command="sleep 2; echo This is Task 1",
    )

    with TaskGroup("processing_tasks") as processing:
        task_2 = PythonOperator(
            dag=dag,
            task_id="task_2",
            python_callable=_return_in_default
        )
        task_3 = PythonOperator(
            dag=dag,
            task_id="task_3",
            python_callable=_return_via_ti
        )
        task_4 = PythonOperator(
            dag=dag,
            task_id="task_4",
            python_callable=_return_via_ti
        )

    task_5 = PythonOperator(
        dag=dag,
        task_id="task_5",
        python_callable=_pick_out_smaller
    )

    task_1 >> processing >> task_5
def test_build_task_group_with_prefix():
    """
    Tests that prefix_group_id turns on/off prefixing of task_id with group_id.
    """
    execution_date = pendulum.parse("20200101")
    with DAG("test_build_task_group_with_prefix", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")

        # group234 does not prefix its children; the nested group34 does;
        # the innermost group4 again does not.
        with TaskGroup("group234", prefix_group_id=False) as group234:
            task2 = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                task3 = DummyOperator(task_id="task3")

                with TaskGroup("group4", prefix_group_id=False) as group4:
                    task4 = DummyOperator(task_id="task4")

        task5 = DummyOperator(task_id="task5")
        task1 >> group234
        group34 >> task5

    # Prefixing applies per-group: only group34 contributes to child ids.
    assert task2.task_id == "task2"
    assert group34.group_id == "group34"
    assert task3.task_id == "group34.task3"
    assert group4.group_id == "group34.group4"
    assert task4.task_id == "task4"
    assert task5.task_id == "task5"
    # Lookup by label (un-prefixed name) still works regardless of prefixing.
    assert group234.get_child_by_label("task2") == task2
    assert group234.get_child_by_label("group34") == group34
    assert group4.get_child_by_label("task4") == task4

    assert extract_node_id(task_group_to_dict(dag.task_group), include_label=True) == {
        'id': None,
        'label': None,
        'children': [
            {
                'id': 'group234',
                'label': 'group234',
                'children': [
                    {
                        'id': 'group34',
                        'label': 'group34',
                        'children': [
                            {
                                'id': 'group34.group4',
                                'label': 'group4',
                                'children': [{'id': 'task4', 'label': 'task4'}],
                            },
                            {'id': 'group34.task3', 'label': 'task3'},
                            {'id': 'group34.downstream_join_id', 'label': ''},
                        ],
                    },
                    {'id': 'task2', 'label': 'task2'},
                    {'id': 'group234.upstream_join_id', 'label': ''},
                ],
            },
            {'id': 'task1', 'label': 'task1'},
            {'id': 'task5', 'label': 'task5'},
        ],
    }
def test_build_task_group_with_task_decorator():
    """
    Test that TaskGroup can be used with the @task decorator.
    """
    from airflow.operators.python import task

    @task
    def task_1():
        print("task_1")

    @task
    def task_2():
        return "task_2"

    @task
    def task_3():
        return "task_3"

    @task
    def task_4(task_2_output, task_3_output):
        print(task_2_output, task_3_output)

    @task
    def task_5():
        print("task_5")

    execution_date = pendulum.parse("20200101")
    with DAG("test_build_task_group_with_task_decorator", start_date=execution_date) as dag:
        tsk_1 = task_1()

        with TaskGroup("group234") as group234:
            tsk_2 = task_2()
            tsk_3 = task_3()
            # XCom-style argument passing creates task_2/3 -> task_4 edges.
            tsk_4 = task_4(tsk_2, tsk_3)

        tsk_5 = task_5()

        tsk_1 >> group234 >> tsk_5

    # Chaining through the group wires the whole group between tsk_1 and tsk_5.
    # pylint: disable=no-member
    assert tsk_1.operator in tsk_2.operator.upstream_list
    assert tsk_1.operator in tsk_3.operator.upstream_list
    assert tsk_5.operator in tsk_4.operator.downstream_list
    # pylint: enable=no-member

    assert extract_node_id(task_group_to_dict(dag.task_group)) == {
        'id': None,
        'children': [
            {
                'id': 'group234',
                'children': [
                    {'id': 'group234.task_2'},
                    {'id': 'group234.task_3'},
                    {'id': 'group234.task_4'},
                    {'id': 'group234.upstream_join_id'},
                    {'id': 'group234.downstream_join_id'},
                ],
            },
            {'id': 'task_1'},
            {'id': 'task_5'},
        ],
    }

    edges = dag_edges(dag)
    assert sorted((e["source_id"], e["target_id"]) for e in edges) == [
        ('group234.downstream_join_id', 'task_5'),
        ('group234.task_2', 'group234.task_4'),
        ('group234.task_3', 'group234.task_4'),
        ('group234.task_4', 'group234.downstream_join_id'),
        ('group234.upstream_join_id', 'group234.task_2'),
        ('group234.upstream_join_id', 'group234.task_3'),
        ('task_1', 'group234.upstream_join_id'),
    ]
"""DAG 'taskgroup': extracting -> processing_task_group (A/B/C) -> loading."""
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.task_group import TaskGroup
from airflow.utils.dates import days_ago

from subdag_factory import subdag_factory

default_args = {'start_date': days_ago(1)}

with DAG('taskgroup',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    extracting = DummyOperator(task_id='extracting')

    # One 'ls' task per suffix; grouping keeps the graph view compact.
    with TaskGroup('processing_task_group') as processing_group:
        for suffix in ('A', 'B', 'C'):
            BashOperator(task_id=f'processing_{suffix}', bash_command='ls')

    loading = DummyOperator(task_id='loading')

    extracting >> processing_group >> loading
def test_dag_edges():
    """Verify the node tree and edge list computed for nested TaskGroups."""
    execution_date = pendulum.parse("20200101")
    with DAG("test_dag_edges", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")

        with TaskGroup("group_a") as group_a:
            with TaskGroup("group_b") as group_b:
                task2 = DummyOperator(task_id="task2")
                task3 = DummyOperator(task_id="task3")
                task4 = DummyOperator(task_id="task4")
                task2 >> [task3, task4]

            task5 = DummyOperator(task_id="task5")
            # dependency from a whole group to a sibling task
            task5 << group_b

        task1 >> group_a

        with TaskGroup("group_c") as group_c:
            task6 = DummyOperator(task_id="task6")
            task7 = DummyOperator(task_id="task7")
            task8 = DummyOperator(task_id="task8")
            [task6, task7] >> task8

        group_a >> group_c
        # direct task-to-task edge that crosses group boundaries
        task5 >> task8

        task9 = DummyOperator(task_id="task9")
        task10 = DummyOperator(task_id="task10")

        group_c >> [task9, task10]

        with TaskGroup("group_d") as group_d:
            task11 = DummyOperator(task_id="task11")
            task12 = DummyOperator(task_id="task12")
            task11 >> task12

        group_d << group_c

    nodes = task_group_to_dict(dag.task_group)
    edges = dag_edges(dag)

    # Join nodes only appear on the sides of a group that have external edges.
    assert extract_node_id(nodes) == {
        'id': None,
        'children': [
            {
                'id': 'group_a',
                'children': [
                    {
                        'id': 'group_a.group_b',
                        'children': [
                            {'id': 'group_a.group_b.task2'},
                            {'id': 'group_a.group_b.task3'},
                            {'id': 'group_a.group_b.task4'},
                            {'id': 'group_a.group_b.downstream_join_id'},
                        ],
                    },
                    {'id': 'group_a.task5'},
                    {'id': 'group_a.upstream_join_id'},
                    {'id': 'group_a.downstream_join_id'},
                ],
            },
            {
                'id': 'group_c',
                'children': [
                    {'id': 'group_c.task6'},
                    {'id': 'group_c.task7'},
                    {'id': 'group_c.task8'},
                    {'id': 'group_c.upstream_join_id'},
                    {'id': 'group_c.downstream_join_id'},
                ],
            },
            {
                'id': 'group_d',
                'children': [
                    {'id': 'group_d.task11'},
                    {'id': 'group_d.task12'},
                    {'id': 'group_d.upstream_join_id'},
                ],
            },
            {'id': 'task1'},
            {'id': 'task10'},
            {'id': 'task9'},
        ],
    }

    assert sorted((e["source_id"], e["target_id"]) for e in edges) == [
        ('group_a.downstream_join_id', 'group_c.upstream_join_id'),
        ('group_a.group_b.downstream_join_id', 'group_a.task5'),
        ('group_a.group_b.task2', 'group_a.group_b.task3'),
        ('group_a.group_b.task2', 'group_a.group_b.task4'),
        ('group_a.group_b.task3', 'group_a.group_b.downstream_join_id'),
        ('group_a.group_b.task4', 'group_a.group_b.downstream_join_id'),
        ('group_a.task5', 'group_a.downstream_join_id'),
        ('group_a.task5', 'group_c.task8'),
        ('group_a.upstream_join_id', 'group_a.group_b.task2'),
        ('group_c.downstream_join_id', 'group_d.upstream_join_id'),
        ('group_c.downstream_join_id', 'task10'),
        ('group_c.downstream_join_id', 'task9'),
        ('group_c.task6', 'group_c.task8'),
        ('group_c.task7', 'group_c.task8'),
        ('group_c.task8', 'group_c.downstream_join_id'),
        ('group_c.upstream_join_id', 'group_c.task6'),
        ('group_c.upstream_join_id', 'group_c.task7'),
        ('group_d.task11', 'group_d.task12'),
        ('group_d.upstream_join_id', 'group_d.task11'),
        ('task1', 'group_a.upstream_join_id'),
    ]
def create_dag(dag_id, schedule, window, default_args):
    """Build a sliding-window selection-analysis DAG for the given month window.

    Pipeline per gene: bealign export -> cleanup -> duplicate detection ->
    compressor filtering -> tree inference -> SLAC/FEL/MEME selection
    analyses -> annotation copy -> per-gene summary.

    NOTE(review): indentation reconstructed from a collapsed source line —
    confirm nesting (group membership of the per-gene tasks) against the
    original file.
    """
    with DAG(
        dag_id,
        default_args=default_args,
        description='creates sliding windows based on months',
        schedule_interval=schedule,
        start_date=datetime.datetime(2021, 4, 30),
        on_failure_callback=dag_fail_slack_alert,
        on_success_callback=dag_success_slack_alert,
        tags=['selection', 'sliding'],
    ) as dag:

        # All outputs for this window go under a window-specific directory.
        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows-bealign/" + '_'.join(window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"]["meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": default_args['params']},
            pool='mongo',  # serialize Mongo access through a pool
            dag=dag,
        )
        export_meta_task.set_upstream(mk_dir_task)

        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )
        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():
            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            uniques_fn = filepath_prefix + '_nuc.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'
            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'
            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'
            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'
            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'
            summary_output_fn = filepath_prefix + '.json'

            # NOTE(review): overwritten on every loop pass — only the last
            # gene's values remain in default_args after the loop.
            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:
                export_bealign_task = PythonOperator(
                    task_id=f'export_bealign',
                    python_callable=export_bealign_sequences,
                    op_kwargs={
                        "config": default_args['params'],
                        'nuc_output_fn': nuc_sequence_output,
                        'gene': gene
                    },
                    dag=dag,
                )

                # Occasional errors when cleaning up tmp files, so or'ing true
                cleanup_task = BashOperator(
                    task_id=f'cleanup',
                    bash_command="sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
                    env={'NUC_OUTPUT_FN': nuc_sequence_output, **os.environ},
                    dag=dag)

                export_bealign_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:
                compute_duplicates_task = PythonOperator(
                    task_id=f'write_raw_duplicates',
                    python_callable=write_nuc_raw_duplicates,
                    op_kwargs={
                        "input": nuc_sequence_output,
                        "duplicate_output": duplicate_output,
                        'uniques_output': uniques_fn
                    },
                    dag=dag,
                )
                # no-op expression statement; the task is already registered
                compute_duplicates_task

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json
            with TaskGroup(f"filter_{gene}") as filter:
                COMPRESSOR = """ {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $FASTA_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT """
                compressor_task = BashOperator(
                    task_id=f'compressor',
                    bash_command=COMPRESSOR,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': duplicate_output,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out,
                        **os.environ
                    },
                    dag=dag)

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """ {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $FASTA_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS} """
                compressor_two_task = BashOperator(
                    task_id=f'compressor_two',
                    bash_command=COMPRESSOR2,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ
                    },
                    dag=dag)

                compressor_task >> compressor_two_task

            # NOTE(review): line breaks inside this multi-command script were
            # lost in the collapsed source; reconstructed one command per line.
            INFER_TREE = """ seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT; """
            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=INFER_TREE,
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'STO_OUTPUT': sto_output,
                    'TREE_OUTPUT': tree_output,
                    **os.environ
                },
                dag=dag)

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ
                },
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command="{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command='cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={
                    'annotation_file': annotation_file,
                    'working_dir': WORKING_DIR
                },
                dag=dag)

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command='{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={
                    'python': default_args['params']['python'],
                    'working_dir': WORKING_DIR
                },
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)

            export_by_gene.append(
                alignment >> duplicates_group >> filter >> infer_tree_task >>
                [slac_task, fel_task, meme_task] >> copy_annotation_task >>
                summarize_gene_task)

        dag.doc_md = __doc__

        # Add export meta and export sequence tasks to be executed in parallel
        cross_downstream([export_meta_task, export_sequences_task], export_by_gene)

        return dag
from airflow.operators.bash import BashOperator
from airflow.operators.subdag import SubDagOperator
from datetime import datetime
from airflow.utils.task_group import TaskGroup
import json

from subdags.subdag_parallel_dag import subdag_parallel_dag

default_args = {'start_date': datetime(2020, 1, 1)}

# NOTE(review): `DAG` is not imported in this visible snippet — confirm
# `from airflow import DAG` exists earlier in the original file. Nesting
# below is reconstructed from a collapsed source line; the Spark_task and
# flink_task groups are assumed to be nested inside processing_task.
with DAG('parallel_task_group',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    task_1 = BashOperator(task_id='task_1', bash_command="sleep 3")

    with TaskGroup("processing_task") as processing_task:
        task_2 = BashOperator(task_id='task_2', bash_command="sleep 3")

        with TaskGroup("Spark_task") as Spark_task:
            # Rebinding `task_2` shadows the variable above; the task ids
            # stay unique because each group prefixes its children.
            task_2 = BashOperator(task_id='task_2', bash_command="sleep 3")

        with TaskGroup("flink_task") as flink_task:
            task_3 = BashOperator(task_id='task_3', bash_command="sleep 3")

    task_4 = BashOperator(task_id='task_4', bash_command="sleep 3")

    task_1 >> processing_task >> task_4
# NOTE(review): fragment — this code starts mid-function (filepath_prefix,
# month_str, gene, nuc_sequence_output and dag are defined above the visible
# snippet) and is truncated at the end: the final `with TaskGroup(` has no
# body in the visible source.
uniques_fn = filepath_prefix + '.uniques.fas'
duplicate_output = filepath_prefix + '.duplicates.json'
variants_csv_output = filepath_prefix + '.variants.csv'
variants_json_output = filepath_prefix + '.variants.json'
filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
filtered_json_output = filepath_prefix + '.filtered.json'
output_edits_fn = filepath_prefix + '.filtered.edits.json'
tn93_output = filepath_prefix + '.tn93.csv'
compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'
tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
sto_output = filepath_prefix + '.compressed.filtered.sto'

with TaskGroup(f"alignment_{month_str}_{gene}") as alignment:
    # Occasional errors when cleaning up tmp files, so or'ing true
    cleanup_task = BashOperator(
        task_id=f'cleanup',
        bash_command="sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
        env={'NUC_OUTPUT_FN': nuc_sequence_output, **os.environ},
        dag=dag)
    # no-op expression statement; the task is already registered
    cleanup_task

with TaskGroup(
}

# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------
with models.DAG('delete_tables_dag',
                default_args=default_args,
                schedule_interval=None) as dag:

    start = dummy.DummyOperator(task_id='start', trigger_rule='all_success')
    end = dummy.DummyOperator(task_id='end', trigger_rule='all_success')

    # BigQuery tables are deleted here for demo purposes only.
    # Consider a dedicated pipeline or tool for a real life scenario.
    with TaskGroup('delete_table') as delte_table:
        delete_table_customers = BigQueryDeleteTableOperator(
            task_id="delete_table_customers",
            deletion_dataset_table=DWH_LAND_PRJ + "." + DWH_LAND_BQ_DATASET + ".customers",
            impersonation_chain=[TRF_SA_DF])
        delete_table_purchases = BigQueryDeleteTableOperator(
            task_id="delete_table_purchases",
            deletion_dataset_table=DWH_LAND_PRJ + "." + DWH_LAND_BQ_DATASET + ".purchases",
            impersonation_chain=[TRF_SA_DF])
        # NOTE(review): snippet truncated mid-expression in the source.
        delete_table_customer_purchase_curated = BigQueryDeleteTableOperator(
            task_id="delete_table_customer_purchase_curated",
            deletion_dataset_table=DWH_CURATED_PRJ + "." +
from subdags.subdag_parallel_dag import subdag_parallel_dag
from airflow.utils.task_group import TaskGroup

default_args = {
    'start_date': datetime(2020, 1, 1)
}

# NOTE(review): `DAG`, `BashOperator` and `datetime` are not imported in this
# visible snippet — confirm those imports exist earlier in the original file.
with DAG('parallel_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    task1 = BashOperator(
        task_id='task1',
        bash_command='sleep 3'
    )

    with TaskGroup('processing_tasks') as processing_tasks:
        task2 = BashOperator(
            task_id='task2',
            bash_command='sleep 3'
        )

        with TaskGroup('spark_tasks') as spark_tasks:
            task3 = BashOperator(
                task_id='task3',
                bash_command='sleep 3'
            )

    # Superseded SubDagOperator wiring, kept for reference:
    # processing = SubDagOperator(
    #     task_id = 'processing_tasks',
    #     subdag=subdag_parallel_dag('parallel_dag','processing_tasks',default_args)
    # )
def deserialize_dag(cls, encoded_dag: Dict[str, Any]) -> 'SerializedDAG':
    """Deserializes a DAG from a JSON object."""
    dag = SerializedDAG(dag_id=encoded_dag['_dag_id'])

    # Rebuild each serialized field with its type-specific decoder.
    for k, v in encoded_dag.items():
        if k == "_downstream_task_ids":
            v = set(v)
        elif k == "tasks":
            SerializedBaseOperator._load_operator_extra_links = cls._load_operator_extra_links
            v = {task["task_id"]: SerializedBaseOperator.deserialize_operator(task) for task in v}
            k = "task_dict"
        elif k == "timezone":
            v = cls._deserialize_timezone(v)
        elif k == "dagrun_timeout":
            v = cls._deserialize_timedelta(v)
        elif k.endswith("_date"):
            v = cls._deserialize_datetime(v)
        elif k == "edge_info":
            # Value structure matches exactly
            pass
        elif k == "timetable":
            v = _decode_timetable(v)
        elif k in cls._decorated_fields:
            v = cls._deserialize(v)
        elif k == "params":
            v = cls._deserialize_params_dict(v)
        # else use v as it is
        setattr(dag, k, v)

    # A DAG is always serialized with only one of schedule_interval and
    # timetable. This back-populates the other to ensure the two attributes
    # line up correctly on the DAG instance.
    if "timetable" in encoded_dag:
        dag.schedule_interval = dag.timetable.summary
    else:
        dag.timetable = create_timetable(dag.schedule_interval, dag.timezone)

    # Set _task_group
    if "_task_group" in encoded_dag:
        dag._task_group = SerializedTaskGroup.deserialize_task_group(  # type: ignore
            encoded_dag["_task_group"], None, dag.task_dict)
    else:
        # This must be old data that had no task_group. Create a root TaskGroup and add
        # all tasks to it.
        dag._task_group = TaskGroup.create_root(dag)
        for task in dag.tasks:
            dag.task_group.add(task)

    # Set has_on_*_callbacks to True if they exist in Serialized blob as False is the default
    if "has_on_success_callback" in encoded_dag:
        dag.has_on_success_callback = True
    if "has_on_failure_callback" in encoded_dag:
        dag.has_on_failure_callback = True

    # Fields that were omitted from the blob (and are not constructor params)
    # are explicitly reset to None rather than left at class defaults.
    keys_to_set_none = dag.get_serialized_fields() - encoded_dag.keys() - cls._CONSTRUCTOR_PARAMS.keys()
    for k in keys_to_set_none:
        setattr(dag, k, None)

    # Re-link tasks to the DAG and restore inter-task wiring.
    for task in dag.task_dict.values():
        task.dag = dag
        serializable_task: BaseOperator = task

        # Tasks without explicit dates inherit the DAG's.
        for date_attr in ["start_date", "end_date"]:
            if getattr(serializable_task, date_attr) is None:
                setattr(serializable_task, date_attr, getattr(dag, date_attr))

        if serializable_task.subdag is not None:
            setattr(serializable_task.subdag, 'parent_dag', dag)
            serializable_task.subdag.is_subdag = True

        for task_id in serializable_task.downstream_task_ids:
            # Bypass set_upstream etc here - it does more than we want
            dag.task_dict[task_id]._upstream_task_ids.add(serializable_task.task_id)

    return dag
def _add_benchmarks(self, task_group):
    """Create a TaskGroup named *task_group* holding the chained e2e benchmark tasks."""
    with TaskGroup(task_group, prefix_group_id=True, dag=self.dag) as group:
        tasks = self._get_e2e_benchmarks(task_group).get_benchmarks()
        # wire the benchmark tasks into a linear sequence
        chain(*tasks)
    return group
def deserialize_dag(cls, encoded_dag: Dict[str, Any]) -> 'SerializedDAG':
    """Deserializes a DAG from a JSON object."""
    dag = SerializedDAG(dag_id=encoded_dag['_dag_id'])

    # Rebuild each serialized field with its type-specific decoder.
    for k, v in encoded_dag.items():
        if k == "_downstream_task_ids":
            v = set(v)
        elif k == "tasks":
            # pylint: disable=protected-access
            SerializedBaseOperator._load_operator_extra_links = cls._load_operator_extra_links
            # pylint: enable=protected-access
            v = {task["task_id"]: SerializedBaseOperator.deserialize_operator(task) for task in v}
            k = "task_dict"
        elif k == "timezone":
            v = cls._deserialize_timezone(v)
        elif k in {"dagrun_timeout"}:
            v = cls._deserialize_timedelta(v)
        elif k.endswith("_date"):
            v = cls._deserialize_datetime(v)
        elif k == "edge_info":
            # Value structure matches exactly
            pass
        elif k in cls._decorated_fields:
            v = cls._deserialize(v)
        # else use v as it is
        setattr(dag, k, v)

    # Set _task_group
    # pylint: disable=protected-access
    if "_task_group" in encoded_dag:
        dag._task_group = SerializedTaskGroup.deserialize_task_group(  # type: ignore
            encoded_dag["_task_group"], None, dag.task_dict)
    else:
        # This must be old data that had no task_group. Create a root TaskGroup and add
        # all tasks to it.
        dag._task_group = TaskGroup.create_root(dag)
        for task in dag.tasks:
            dag.task_group.add(task)
    # pylint: enable=protected-access

    # Set has_on_*_callbacks to True if they exist in Serialized blob as False is the default
    if "has_on_success_callback" in encoded_dag:
        dag.has_on_success_callback = True
    if "has_on_failure_callback" in encoded_dag:
        dag.has_on_failure_callback = True

    # Fields omitted from the blob (and not constructor params) are
    # explicitly reset to None rather than left at class defaults.
    keys_to_set_none = dag.get_serialized_fields() - encoded_dag.keys() - cls._CONSTRUCTOR_PARAMS.keys()
    for k in keys_to_set_none:
        setattr(dag, k, None)

    setattr(dag, 'full_filepath', dag.fileloc)

    # Re-link tasks to the DAG and restore inter-task wiring.
    for task in dag.task_dict.values():
        task.dag = dag
        serializable_task: BaseOperator = task

        # Tasks without explicit dates inherit the DAG's.
        for date_attr in ["start_date", "end_date"]:
            if getattr(serializable_task, date_attr) is None:
                setattr(serializable_task, date_attr, getattr(dag, date_attr))

        if serializable_task.subdag is not None:
            setattr(serializable_task.subdag, 'parent_dag', dag)
            serializable_task.subdag.is_subdag = True

        for task_id in serializable_task.downstream_task_ids:
            # Bypass set_upstream etc here - it does more than we want  # noqa: E501
            # pylint: disable=protected-access
            dag.task_dict[task_id]._upstream_task_ids.add(serializable_task.task_id)

    return dag
def _make_task_group(self, **kwargs) -> TaskGroup:
    """Factory seam: build and return a TaskGroup from the given kwargs."""
    group = TaskGroup(**kwargs)
    return group
default_args = {
    'owner': 'teste',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'retries': 0,
}

# Iris ML pipeline: download the dataset, then run pre-processing.
with DAG('dag-pipeline-iris-aula-v1',
         schedule_interval=timedelta(minutes=10),
         catchup=False,
         default_args=default_args) as dag:

    start = DummyOperator(task_id="start")

    with TaskGroup("etl", tooltip="etl") as etl:
        t1 = BashOperator(dag=dag,
                          task_id='download_dataset',
                          bash_command="""
        cd {0}/featurestore
        curl -o iris.txt https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
        """.format(pathScript))
        # no-op expression statement; the task is already registered
        [t1]

    # NOTE(review): snippet truncated mid bash_command in the source — the
    # triple-quoted string below is unterminated in the visible text.
    with TaskGroup("preProcessing", tooltip="preProcessing") as preProcessing:
        t2 = BashOperator(dag=dag,
                          task_id='encoder_dataset',
                          bash_command="""
        cd {0}
with open(tempfile, 'r') as f: cursor = conn.cursor() cursor.copy_expert(query, f) conn.commit() finally: conn.close() os.remove(tempfile) with DAG(dag_id=dag_id, schedule_interval=None, catchup=False, start_date=days_ago(1)) as dag: pause_dags_t = PythonOperator( task_id="pause_dags", python_callable=pause_dags ) with TaskGroup(group_id='import') as import_t: for x in OBJECTS_TO_IMPORT: load_task = PythonOperator( task_id=x[1], python_callable=load_data, op_kwargs={'query': x[0], 'file': x[1]}, provide_context=True ) load_variable_t = PythonOperator( task_id="variable", python_callable=importVariable ) load_task_instance_t = PythonOperator( task_id="load_ti", op_kwargs={'query': TASK_INSTANCE_IMPORT, 'file': 'task_instance.csv'},