Example #1
def test_duplicate_group_id():
    from airflow.exceptions import DuplicateTaskIdFound

    execution_date = pendulum.parse("20200101")

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'task1' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("task1"):
                pass

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("group1", prefix_group_id=False):
                with TaskGroup("group1"):
                    pass

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            with TaskGroup("group1", prefix_group_id=False):
                _ = DummyOperator(task_id="group1")

    with pytest.raises(DuplicateTaskIdFound,
                       match=r".* 'group1.downstream_join_id' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                _ = DummyOperator(task_id="downstream_join_id")

    with pytest.raises(DuplicateTaskIdFound,
                       match=r".* 'group1.upstream_join_id' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                _ = DummyOperator(task_id="upstream_join_id")
Example #2
def get_benchmarks(self):
    benchmarks = self._get_benchmarks(self.vars["benchmarks"])
    with TaskGroup("Index Results", prefix_group_id=False,
                   dag=self.dag) as post_steps:
        indexers = self._add_indexers(benchmarks)
    return benchmarks
Example #3
# under the License.
"""Example DAG demonstrating the usage of the TaskGroup."""

from airflow.models.dag import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup

# [START howto_task_group]
with DAG(dag_id="example_task_group", start_date=days_ago(2),
         tags=["example"]) as dag:
    start = DummyOperator(task_id="start")

    # [START howto_task_group_section_1]
    with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1:
        task_1 = DummyOperator(task_id="task_1")
        task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
        task_3 = DummyOperator(task_id="task_3")

        task_1 >> [task_2, task_3]
    # [END howto_task_group_section_1]

    # [START howto_task_group_section_2]
    with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2:
        task_1 = DummyOperator(task_id="task_1")

        # [START howto_task_group_inner_section_2]
        with TaskGroup("inner_section_2",
                       tooltip="Tasks for inner_section2") as inner_section_2:
            task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
Example #4
def taskflow_dag():
    # Update replicator tables
    # This task group will take the new tables created by the on-prem replicator under the schema TRAFFIC_NEW,
    # alter the schema to TRAFFIC_INTER, and then create the materialized view TRAFFIC on top,
    # but it will only do this if the TRAFFIC_NEW table exists (i.e. a new dataset was written)
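    # The shell scripts behind these tasks are not shown in this snippet.  Purely as an
    # illustrative sketch (assuming a PostgreSQL-style target; the table name ACC is
    # only an example), the kind of SQL implied by the comment above would be:
    _EXAMPLE_UPDATE_SCHEMA_SQL = """
    -- only proceed if the replicator actually wrote a new dataset
    SELECT to_regclass('traffic_new.acc') IS NOT NULL AS has_new_data;
    -- move the new table out of the staging schema ...
    ALTER TABLE traffic_new.acc SET SCHEMA traffic_inter;
    -- ... and rebuild the materialized view that sits on top of it
    CREATE MATERIALIZED VIEW IF NOT EXISTS traffic.acc AS
        SELECT * FROM traffic_inter.acc;
    """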
    with TaskGroup(
            group_id="replicator_update_schema") as replicator_update_schema:
        group_id = "replicator_update_schema"

        ACC = create_bash_task_nested(group_id, 'acc')
        ARC_LINK = create_bash_task_nested(group_id, 'arc_link')
        ARTERYDATA = create_bash_task_nested(group_id, 'arterydata')
        CATEGORY = create_bash_task_nested(group_id, "category")
        CNT_DET = create_bash_task_nested(group_id, 'cnt_det')
        CNT_SPD = create_bash_task_nested(group_id, 'cnt_spd')
        COUNTINFO = create_bash_task_nested(group_id, 'countinfo')
        COUNTINFOMICS = create_bash_task_nested(group_id, 'countinfomics')
        DET = create_bash_task_nested(group_id, 'det')
        NODE = create_bash_task_nested(group_id, 'node')

        ACC >> ARC_LINK
        ARC_LINK >> ARTERYDATA
        ARTERYDATA >> CATEGORY
        CATEGORY >> CNT_DET
        CNT_DET >> CNT_SPD
        CNT_SPD >> COUNTINFO
        COUNTINFO >> COUNTINFOMICS
        COUNTINFOMICS >> DET
        DET >> NODE
        NODE

    # GCC's ArcGIS REST API server exposes a series of "services", each with a name like
    # `cot_geospatial2`.  Within those services, individual layers have an ID
    # (in parentheses, after the layer name).
    with TaskGroup(group_id="copy_gis_layers") as copy_gis_layers:
        TASKS = {
            'bikeway': ('cot_geospatial2', 2),
            'accessible_signal': ('cot_geospatial2', 4),
            'pedestrian_crossover': ('cot_geospatial2', 7),
            'traffic_signal': ('cot_geospatial2', 9),
            'hospital': ('cot_geospatial10', 21),
            'toinview_program_point': ('cot_geospatial12', 46),
            'toinview_program_line': ('cot_geospatial12', 47),
            'toinview_program_polygon': ('cot_geospatial12', 48),
            'school': ('cot_geospatial28', 17)
        }
        for task_id, task_args in TASKS.items():
            mapserver_name, layer_id = task_args
            params = {'mapserver_name': mapserver_name, 'layer_id': layer_id}
            bash_task = BashOperator(task_id=task_id,
                                     bash_command='/copy_gis_layer.sh',
                                     params=params)
            bash_task

    # The Open Data Portal (i.e. CKAN) stores resources at URLs of format
    # `${BASE_URL}/dataset/${DATASET_ID}/resource/${RESOURCE_ID}/download/${FILENAME}`.
    #
    # To find these resource URLs:
    #
    # - find the dataset in the Open Data Portal (for instance, the Toronto Centreline
    #   is at https://open.toronto.ca/dataset/toronto-centreline-tcl/);
    # - open the "For Developers" tab in the carousel;
    # - find the dataset ID listed in `params`;
    # - use this to request `${BASE_URL}/action/package_show?id=${DATASET_ID}`;
    # - in there, look for the URL under `result.resources[].url`.
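    # The lookup described above can also be scripted.  A minimal sketch (not part of
    # the original DAG), assuming the `requests` library and CKAN's standard
    # `package_show` action; the base URL default below mirrors the host used in
    # TASK_GROUP and is an assumption:
    def _example_lookup_resource_urls(
            dataset_id,
            base_url='https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action'):
        import requests

        # CKAN returns the dataset metadata with one entry per resource;
        # the download URLs live under result.resources[].url.
        resp = requests.get(f'{base_url}/package_show', params={'id': dataset_id})
        resp.raise_for_status()
        return [resource['url'] for resource in resp.json()['result']['resources']]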
    with TaskGroup(
            group_id="copy_opendata_shapefiles") as copy_opendata_shapefiles:
        group_id = "copy_opendata_shapefiles"

        TASK_GROUP = {
            'centreline': {
                'resource_url':
                'https://ckanadmin0.intra.prod-toronto.ca/dataset/1d079757-377b-4564-82df-eb5638583bfb/resource/7209841e-e59c-49e4-9205-3b0587f2eea9/download/centreline_wgs84_v2.zip',
                'source_srid': 3857
            },
            'centreline_intersection': {
                'resource_url':
                'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2c83f641-7808-49ba-b80f-7011851d4e27/resource/c2fc0db0-7dcd-4c13-a54c-f39debc441bd/download/intersection-file-wgs84.zip',
                'source_srid': 4326
            }
        }

        for task_id, params in TASK_GROUP.items():
            task_id_extract = '{0}_extract'.format(task_id)

            with TaskGroup(group_id=f'{task_id}'):

                INDEX_OPENDATA = BashOperator(
                    task_id='index_opendata',
                    bash_command='/copy_opendata_shapefiles/index_opendata.sh')

                EXTRACT_OPENDATA_SHAPEFILE = BashOperator(
                    task_id=task_id_extract,
                    bash_command=
                    '/copy_opendata_shapefiles/extract_opendata_shapefile.sh',
                    params={
                        'name': task_id,
                        'resource_url': params['resource_url']
                    })

                task_id_load = '{0}_load'.format(task_id)

                LOAD_SHAPEFILE = BashOperator(
                    task_id=task_id_load,
                    bash_command='/copy_opendata_shapefiles/load_shapefile.sh',
                    params={
                        'name': task_id,
                        'source_srid': params['source_srid']
                    })

                EXTRACT_OPENDATA_SHAPEFILE >> LOAD_SHAPEFILE >> INDEX_OPENDATA

    # centreline_conflation_target
    #
    # Normalize the Toronto Centreline into common _conflation target_ and _routing target_
    # views, for use by other pipelines.
    #
    # The conflation target consists of two views `centreline.midblocks`, `centreline.intersections`.
    # The midblocks and intersections in these views are shown on MOVE's map.  When we conflate
    # collisions and traffic studies to the centreline, we only conflate those to centreline features
    # that are in this conflation target.
    #
    # The routing target consists of two views `centreline.routing_vertices`, `centreline.routing_edges`
    # and is a superset of the conflation target.  This exists because the conflation target is not a
    # valid graph (in the graph theory sense); some midblock endpoints refer to intersection IDs that do
    # not correspond to actual intersections.  To fix this, the routing target fills in vertices for
    # those intersection IDs.  When routing corridors between centreline features, we use the routing
    # target, then filter the result down to only those features in the conflation target.
    #
    # This is intended to run after `copy_opendata_shapefiles`.
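    # A toy illustration (not the actual implementation, which lives in SQL scripts) of
    # the "route on the routing target, then filter to the conflation target" idea
    # described above.  Edges are assumed to be (vertex_a, vertex_b, edge_id) tuples:
    def _example_route_corridor(routing_edges, conflation_edge_ids, src, dst):
        from collections import deque

        adjacency = {}
        for a, b, edge_id in routing_edges:
            adjacency.setdefault(a, []).append((b, edge_id))
            adjacency.setdefault(b, []).append((a, edge_id))

        # breadth-first search over the routing target, remembering how we reached
        # each vertex so the corridor can be reconstructed afterwards
        parents, queue = {src: None}, deque([src])
        while queue:
            vertex = queue.popleft()
            if vertex == dst:
                break
            for neighbour, edge_id in adjacency.get(vertex, []):
                if neighbour not in parents:
                    parents[neighbour] = (vertex, edge_id)
                    queue.append(neighbour)

        # walk back from dst, keeping only edges that are in the conflation target
        corridor, vertex = [], dst
        while parents.get(vertex) is not None:
            vertex, edge_id = parents[vertex]
            if edge_id in conflation_edge_ids:
                corridor.append(edge_id)
        return list(reversed(corridor))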
    with TaskGroup(group_id="centreline_conflation_target"
                   ) as centreline_conflation_target:
        group_id = "centreline_conflation_target"

        A0_INTERSECTIONS_BASE = create_bash_task_nested(
            group_id, 'A0_intersections_base')
        A0_MIDBLOCKS_BASE = create_bash_task_nested(group_id,
                                                    'A0_midblocks_base')
        A1_INTERSECTION_IDS = create_bash_task_nested(group_id,
                                                      'A1_intersection_ids')
        A2_INTERSECTIONS = create_bash_task_nested(group_id,
                                                   "A2_intersections")
        A3_MIDBLOCK_NAMES = create_bash_task_nested(group_id,
                                                    'A3_midblock_names')
        A4_MIDBLOCKS = create_bash_task_nested(group_id, 'A4_midblocks')
        A5_ROUTING_VERTICES = create_bash_task_nested(group_id,
                                                      'A5_routing_vertices')
        A6_ROUTING_EDGES = create_bash_task_nested(group_id,
                                                   'A6_routing_edges')

        [A0_INTERSECTIONS_BASE, A0_MIDBLOCKS_BASE] >> A1_INTERSECTION_IDS
        A1_INTERSECTION_IDS >> A2_INTERSECTIONS
        A2_INTERSECTIONS >> A3_MIDBLOCK_NAMES
        A3_MIDBLOCK_NAMES >> A4_MIDBLOCKS
        A4_MIDBLOCKS >> A5_ROUTING_VERTICES
        A5_ROUTING_VERTICES >> A6_ROUTING_EDGES

    # """
    # gis_layers_vector_tiles
    #
    # Generates vector tiles from GIS layers provided by GCC, which are loaded into our database by
    # the `copy_gis_layers` DAG.  These are stored in `/data/tiles`, and are served from `/tiles` on
    # our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render information
    # about schools, hospitals, and other points of interest when zoomed in.
    #
    # This is intended to run after `copy_gis_layers`.
    # """
    with TaskGroup(
            group_id="gis_layers_vector_tiles") as gis_layers_vector_tiles:
        BUILD_GIS_LAYERS_TILES = create_bash_task('build_gis_layers_tiles')
        EXTRACT_GIS_LAYERS_TILES = create_bash_task('extract_gis_layers_tiles')

        BUILD_GIS_LAYERS_TILES >> EXTRACT_GIS_LAYERS_TILES
    """
    location_search_index

    Builds the views and indexes that support location search, and also builds an index of midblock
    names.

    This is intended to run after `centreline_conflation_target` and `copy_gis_layers`
    """
    with TaskGroup(group_id="location_search_index") as location_search_index:
        group_id = "location_search_index"

        TRANSFORM_CENTRELINE_INDEX = create_bash_task_nested(
            group_id, 'transform_centreline_index')
        TRANSFORM_INTERSECTIONS_INDEX = create_bash_task_nested(
            group_id, 'transform_intersections_index')
        TRANSFORM_TRAFFIC_SIGNAL = create_bash_task_nested(
            group_id, 'transform_traffic_signal')

        TRANSFORM_TRAFFIC_SIGNAL
        TRANSFORM_CENTRELINE_INDEX >> TRANSFORM_INTERSECTIONS_INDEX
    """
    centreline_vector_tiles

    Generates vector tiles from the MOVE conflation target, which is built by the
    `centreline_conflation_target` DAG.  These are stored in `/data/tiles`, and are served from
    `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render
    interactive centreline features.

    This is intended to run after `centreline_conflation_target`.
    """
    with TaskGroup(
            group_id='centreline_vector_tiles') as centreline_vector_tiles:
        group_id = 'centreline_vector_tiles'

        LOAD_VOLUME = create_bash_task_nested(group_id, 'load_volume')
        BUILD_VECTOR_TILES = create_bash_task_nested(group_id,
                                                     'build_vector_tiles')
        EXTRACT_VECTOR_TILES = create_bash_task_nested(group_id,
                                                       'extract_vector_tiles')

        LOAD_VOLUME >> BUILD_VECTOR_TILES
        BUILD_VECTOR_TILES >> EXTRACT_VECTOR_TILES
    """
    arteries_geocoding

    Uses arterycode matching information and processes as originally developed by Data + Analytics to
    link counts with the Toronto centreline.

    The legacy FLOW system was not based on the Toronto Centreline, but rather used a legacy map
    layer that is no longer supported.  In FLOW, arterycodes identified locations in that legacy
    map layer.  To use these with the Toronto Centreline, we apply a series of heuristics developed
    by Data + Analytics: ID matching on `LINKID`, spatial matches, etc.

    This is the first step in our FLOW geocoding cascade, which continues with the DAGs
    `group_multidirection_arteries` and `group_multiday_counts`.  All three DAGs must run before
    MOVE is considered to have updated its copy of FLOW data.

    This is intended to run after `replicator_transfer_flow` and `centreline_conflation_target`.
    """
    with TaskGroup(group_id="arteries_geocoding") as arteries_geocoding:
        group_id = "arteries_geocoding"

        A1_ARTERIES_MANUAL_CORR = create_bash_task_nested(
            group_id, 'A1_arteries_manual_corr')
        A1_NODES_CORRECTED = create_bash_task_nested(group_id,
                                                     'A1_nodes_corrected')
        A2_NODES_CENTRELINE = create_bash_task_nested(group_id,
                                                      'A2_nodes_centreline')
        B1_ARTERIES_PX_CENTRELINE = create_bash_task_nested(
            group_id, 'B1_arteries_px_centreline')
        B2_ARTERIES_MANUAL_CORR_NORMALIZED = create_bash_task_nested(
            group_id, 'B2_arteries_manual_corr_normalized')
        C1_ARTERIES_LINKS = create_bash_task_nested(group_id,
                                                    'C1_arteries_links')
        C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS = create_bash_task_nested(
            group_id, 'C2_arteries_double_link_midblocks')
        C2_ARTERIES_DOUBLE_NODE = create_bash_task_nested(
            group_id, 'C2_arteries_double_node')
        C2_ARTERIES_SINGLE_NODE = create_bash_task_nested(
            group_id, 'C2_arteries_single_node')
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS = create_bash_task_nested(
            group_id, 'C3_arteries_double_node_midblocks')
        C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST = create_bash_task_nested(
            group_id, 'C4_arteries_double_node_midblocks_multi_best')
        D1_ARTERIES_CENTRELINE_TABLE = create_bash_task_nested(
            group_id, 'D1_arteries_centreline_table')
        D2_ARTERY_GEOCODING = create_bash_task_nested(group_id,
                                                      'D2_artery_geocoding')
        D3_ARTERIES_CENTRELINE_VIEW = create_bash_task_nested(
            group_id, 'D3_arteries_centreline_view')

        A1_NODES_CORRECTED >> A2_NODES_CENTRELINE
        A1_ARTERIES_MANUAL_CORR >> B2_ARTERIES_MANUAL_CORR_NORMALIZED
        A2_NODES_CENTRELINE >> C2_ARTERIES_DOUBLE_NODE
        C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_NODE
        A2_NODES_CENTRELINE >> C2_ARTERIES_SINGLE_NODE
        C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS
        C1_ARTERIES_LINKS >> C2_ARTERIES_SINGLE_NODE
        C2_ARTERIES_DOUBLE_NODE >> C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST
        A2_NODES_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE
        B1_ARTERIES_PX_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE
        B2_ARTERIES_MANUAL_CORR_NORMALIZED >> D1_ARTERIES_CENTRELINE_TABLE
        C1_ARTERIES_LINKS >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_SINGLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_DOUBLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE
        C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST >> D1_ARTERIES_CENTRELINE_TABLE
        D1_ARTERIES_CENTRELINE_TABLE >> D2_ARTERY_GEOCODING
        D2_ARTERY_GEOCODING >> D3_ARTERIES_CENTRELINE_VIEW
    """
    crash_geocoding

    Normalizes CRASH data into collision _events_ and collision _involved persons_, then matches
    collision events to the centreline conflation target that was created by
    `centreline_conflation_target`.

    Our legacy schema in Oracle stores both event-related and involved-person-related information
    in a single table, `TRAFFIC.ACC`.  That table has one record per involved person, with event-level
    details copied across all persons involved in a collision.  To make this easier to work with in
    MOVE, we transform `TRAFFIC.ACC` into a normalized representation.

    To match collisions to the centreline, we use the following heuristic:

    - if there are any intersections within 20m, match to the closest such intersection;
    - otherwise, if there are any midblocks within 20m, match to the closest such midblock;
    - otherwise, do not match.

    This same heuristic was used by the legacy CRASH system to assign collisions to intersections
    and midblocks.  (However, CRASH did not use the Toronto Centreline, but instead used a legacy
    map layer that has been deprecated and is no longer maintained by the City.)

    This is intended to run after `replicator_transfer_crash` and `centreline_conflation_target`.
    """
    with TaskGroup(group_id="crash_geocoding") as crash_geocoding:
        group_id = 'crash_geocoding'

        A1_EVENTS_FIELDS_RAW = create_bash_task_nested(group_id,
                                                       'A1_events_fields_raw')
        A2_EVENTS_FIELDS_NORM = create_bash_task_nested(
            group_id, 'A2_events_fields_norm')
        A2_INVOLVED_FIELDS_RAW = create_bash_task_nested(
            group_id, 'A2_involved_fields_raw')
        A3_INVOLVED_FIELDS_NORM = create_bash_task_nested(
            group_id, 'A3_involved_fields_norm')
        A4_INVOLVED = create_bash_task_nested(group_id, 'A4_involved')
        A5_EVENTS = create_bash_task_nested(group_id, 'A5_events')
        A6_EVENTS_INTERSECTIONS = create_bash_task_nested(
            group_id, 'A6_events_intersections')
        A6_EVENTS_SEGMENTS = create_bash_task_nested(group_id,
                                                     'A6_events_segments')
        A7_EVENTS_CENTRELINE = create_bash_task_nested(group_id,
                                                       'A7_events_centreline')

        A1_EVENTS_FIELDS_RAW >> A2_EVENTS_FIELDS_NORM
        A1_EVENTS_FIELDS_RAW >> A2_INVOLVED_FIELDS_RAW
        A2_EVENTS_FIELDS_NORM >> A3_INVOLVED_FIELDS_NORM
        A2_INVOLVED_FIELDS_RAW >> A3_INVOLVED_FIELDS_NORM
        A3_INVOLVED_FIELDS_NORM >> A4_INVOLVED
        A4_INVOLVED >> A5_EVENTS
        A5_EVENTS >> A6_EVENTS_INTERSECTIONS
        A5_EVENTS >> A6_EVENTS_SEGMENTS
        A6_EVENTS_INTERSECTIONS >> A7_EVENTS_CENTRELINE
        A6_EVENTS_SEGMENTS >> A7_EVENTS_CENTRELINE
    """
    collisions_vector_tiles

    Generates vector tiles from collisions data, which is built by the `crash_geocoding` DAG.
    These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they
    are used by `FcPaneMap` in the web frontend to render collisions heatmaps when zoomed out.

    This is intended to run after `crash_geocoding`.
    """
    with TaskGroup(
            group_id="collisions_vector_tiles") as collisions_vector_tiles:
        group_id = "collisions_vector_tiles"

        BUILD_COLLISIONS_TILES = create_bash_task_nested(
            group_id, 'build_collisions_tiles')
        EXTRACT_COLLISIONS_TILES = create_bash_task_nested(
            group_id, 'extract_collisions_tiles')

        BUILD_COLLISIONS_TILES >> EXTRACT_COLLISIONS_TILES
    """
    group_multidirection_arteries

    Continues the FLOW geocoding process started by `arteries_geocoding`, by identifying arterycodes
    that refer to different directions of travel in the same location and grouping them together.

    When a traffic study is requested, it might ask for 3 days of data collection on a 2-way street;
    someone requesting this study would want to see all 3 days in both directions of travel.
    However, the legacy FLOW schema uses separate arterycodes for different directions of travel,
    and also uses separate `COUNT_INFO_ID`s for each day of a traffic study.

    As a first step towards delivering all data for this study at once, we need to identify the
    arterycodes that correspond to these two directions of travel, and group them together.  Once
    that's done, the DAG `group_multiday_counts` then takes care of grouping together the 3 days
    of the traffic study, so that we can get all six relevant counts in the database.

    Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no
    reliable way to visualize that much data at once.

    This is intended to run after `arteries_geocoding`.
    """
    with TaskGroup(group_id='group_multidirection_arteries'
                   ) as group_multidirection_arteries:
        group_id = 'group_multidirection_arteries'

        A1_ARTERIES_DOUBLE_LINK_PAIRS = create_bash_task_nested(
            group_id, 'A1_arteries_double_link_pairs')
        A1_ARTERIES_MIDBLOCK_SOLO = create_bash_task_nested(
            group_id, 'A1_arteries_midblock_solo')
        A2_ARTERIES_GROUPS_PRE = create_bash_task_nested(
            group_id, 'A2_arteries_groups_pre')
        A3_ARTERIES_GROUPS_RANKED = create_bash_task_nested(
            group_id, 'A3_arteries_groups_ranked')
        A4_ARTERIES_GROUPS_POST = create_bash_task_nested(
            group_id, 'A4_arteries_groups_post')

        A1_ARTERIES_DOUBLE_LINK_PAIRS >> A2_ARTERIES_GROUPS_PRE
        A1_ARTERIES_MIDBLOCK_SOLO >> A2_ARTERIES_GROUPS_PRE
        A2_ARTERIES_GROUPS_PRE >> A3_ARTERIES_GROUPS_RANKED
        A3_ARTERIES_GROUPS_RANKED >> A4_ARTERIES_GROUPS_POST
    """
    group_multiday_counts

    Finishes the FLOW geocoding process started by `arteries_geocoding` and continued by
    `group_multidirection_arteries`, by identifying consecutive days of data collection from
    the same arterycode group and grouping those together into a single study.

    When a traffic study is requested, it might ask for 3 days of data collection on a 2-way street;
    someone requesting this study would want to see all 3 days in both directions of travel.
    However, the legacy FLOW schema uses separate arterycodes for different directions of travel,
    and also uses separate `COUNT_INFO_ID`s for each day of a traffic study.

    Once `group_multidirection_arteries` has completed, we've identified the arterycodes that
    correspond to these two directions of travel.  To find all data for the study, we now need to
    group together the 3 days over which data was collected at these two arterycodes.

    However, not all studies are of the same duration.  To detect studies, we use runs of consecutive
    days at the same arterycode group.

    Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no
    reliable way to visualize that much data at once.

    This is intended to run after `group_multidirection_arteries`.
    """
    with TaskGroup(group_id='group_multiday_counts') as group_multiday_counts:
        group_id = 'group_multiday_counts'

        A1_COUNTS_MULTIDAY_RUNS = create_bash_task_nested(
            group_id, 'A1_counts_multiday_runs')
        A2_ARTERIES_COUNTS_GROUPS = create_bash_task_nested(
            group_id, 'A2_arteries_counts_groups')
        A3_STUDIES = create_bash_task_nested(group_id, 'A3_studies')
        A4_COUNTS2_STUDIES = create_bash_task_nested(group_id,
                                                     'A4_counts2_studies')

        A1_COUNTS_MULTIDAY_RUNS >> A2_ARTERIES_COUNTS_GROUPS
        A2_ARTERIES_COUNTS_GROUPS >> A3_STUDIES
        A3_STUDIES >> A4_COUNTS2_STUDIES
    """
    open_data_tmcs

    Builds the [Traffic Volumes at Intersections for All Modes](https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/)
    dataset for the City of Toronto Open Data Portal.

    The dataset is exposed in two ways: via database, and via HTTP.  We store the dataset as a series
    of views in the `open_data` schema.  We also dump those views to CSV files at `/data/open_data`,
    which is served from `/open_data` on our ETL EC2 instances.

    This is intended to run after `group_multiday_counts`.
    """
    with TaskGroup(group_id='open_data_tmcs') as open_data_tmcs:
        group_id = 'open_data_tmcs'

        A1_TMCS_COUNT_DATA = create_bash_task_nested(group_id,
                                                     'A1_tmcs_count_data')
        A1_TMCS_COUNT_METADATA = create_bash_task_nested(
            group_id, 'A1_tmcs_count_metadata')
        A2_TMCS_LOCATIONS = create_bash_task_nested(group_id,
                                                    'A2_tmcs_locations')
        A3_TMCS_JOINED = create_bash_task_nested(group_id, 'A3_tmcs_joined')
        A4_TMCS_DECADES = create_bash_task_nested(group_id, 'A4_tmcs_decades')
        A4_TMCS_PREVIEW = create_bash_task_nested(group_id, 'A4_tmcs_preview')

        A1_TMCS_COUNT_DATA >> A2_TMCS_LOCATIONS
        A1_TMCS_COUNT_METADATA >> A2_TMCS_LOCATIONS
        A2_TMCS_LOCATIONS >> A3_TMCS_JOINED
        A3_TMCS_JOINED >> A4_TMCS_DECADES
        A3_TMCS_JOINED >> A4_TMCS_PREVIEW

    replicator_update_schema >> copy_gis_layers
    replicator_update_schema >> copy_opendata_shapefiles
    [copy_gis_layers, copy_opendata_shapefiles] >> centreline_conflation_target
    [copy_gis_layers, copy_opendata_shapefiles] >> gis_layers_vector_tiles
    centreline_conflation_target >> location_search_index
    centreline_conflation_target >> centreline_vector_tiles
    centreline_conflation_target >> arteries_geocoding
    centreline_conflation_target >> crash_geocoding
    crash_geocoding >> collisions_vector_tiles
    arteries_geocoding >> group_multidirection_arteries
    group_multidirection_arteries >> group_multiday_counts
    group_multiday_counts >> open_data_tmcs
Example #5
        max_active_runs=3,
        schedule_interval="@daily",
        default_args={
            "email_on_failure": False,
            "email_on_retry": False,
            "retries": 1,
            "retry_delay": timedelta(minutes=1),
        },
        catchup=False,
        template_searchpath="/usr/local/airflow/include",
) as dag:

    t0 = DummyOperator(task_id="start")

    # Define Task Group with Postgres Queries
    with TaskGroup("covid_table_queries") as covid_table_queries:
        for state in states:
            generate_files = PostgresOperator(
                task_id="covid_query_{0}".format(state),
                postgres_conn_id="gpdb",
                sql="covid_state_query.sql",
                params={"state": "'" + state + "'"},
            )

    # Define task to send email
    send_email = EmailOperator(
        task_id="send_email",
        to=email_to,
        subject="Covid Greenplum Queries DAG",
        html_content=
        "<p>The Covid queries were run on Greenplum successfully.</p>",
    )
Example #6
    default_args=args,
    schedule_interval=None,
    tags=['trigger']
)

task_start = DummyOperator(task_id='start_task', dag=dag)

task_list = [DummyOperator(task_id='task_success_' + str(option), dag=dag) for option in range(1,5)]

def make_skip(**kwargs):
    raise AirflowSkipException("Skip this task and individual downstream tasks while respecting trigger rules.")

def make_fail(**kwargs):
    raise ValueError('Make Error Force')

with TaskGroup("case_group", dag=dag) as case_group:
    task_skipped = PythonOperator(
        task_id='task_skipped',
        provide_context=True,
        python_callable=make_skip,
        dag=dag
    )

    task_failed = PythonOperator(
        task_id='task_failed',
        provide_context=True,
        python_callable=make_fail,
        dag=dag
    )

    task_all_success = DummyOperator(
Example #7
        if accaracy > 2:
            return ['accurate', 'in_accurate']

    return 'in_accurate'


with DAG('xcom_dag',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    downloading_data = BashOperator(task_id='downloading_data',
                                    bash_command='sleep 3',
                                    do_xcom_push=False)

    with TaskGroup('processing_tasks') as processing_tasks:
        training_model_a = PythonOperator(task_id='training_model_a',
                                          python_callable=_training_model)

        training_model_b = PythonOperator(task_id='training_model_b',
                                          python_callable=_training_model)

        training_model_c = PythonOperator(task_id='training_model_c',
                                          python_callable=_training_model)

    choose_model = BranchPythonOperator(task_id='task_4',
                                        python_callable=_choose_best_model)

    accurate = DummyOperator(task_id='accurate')

    in_accurate = DummyOperator(task_id='in_accurate')
Example #8
def test_sub_dag_task_group():
    """
    Tests dag.sub_dag() updates task_group correctly.
    """
    execution_date = pendulum.parse("20200101")
    with DAG("test_test_task_group_sub_dag", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group234") as group234:
            _ = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                _ = DummyOperator(task_id="task3")
                _ = DummyOperator(task_id="task4")

        with TaskGroup("group6") as group6:
            _ = DummyOperator(task_id="task6")

        task7 = DummyOperator(task_id="task7")
        task5 = DummyOperator(task_id="task5")

        task1 >> group234
        group34 >> task5
        group234 >> group6
        group234 >> task7

    subdag = dag.sub_dag(task_ids_or_regex="task5",
                         include_upstream=True,
                         include_downstream=False)

    assert extract_node_id(task_group_to_dict(subdag.task_group)) == {
        'id':
        None,
        'children': [
            {
                'id':
                'group234',
                'children': [
                    {
                        'id':
                        'group234.group34',
                        'children': [
                            {
                                'id': 'group234.group34.task3'
                            },
                            {
                                'id': 'group234.group34.task4'
                            },
                            {
                                'id': 'group234.group34.downstream_join_id'
                            },
                        ],
                    },
                    {
                        'id': 'group234.upstream_join_id'
                    },
                ],
            },
            {
                'id': 'task1'
            },
            {
                'id': 'task5'
            },
        ],
    }

    edges = dag_edges(subdag)
    assert sorted((e["source_id"], e["target_id"]) for e in edges) == [
        ('group234.group34.downstream_join_id', 'task5'),
        ('group234.group34.task3', 'group234.group34.downstream_join_id'),
        ('group234.group34.task4', 'group234.group34.downstream_join_id'),
        ('group234.upstream_join_id', 'group234.group34.task3'),
        ('group234.upstream_join_id', 'group234.group34.task4'),
        ('task1', 'group234.upstream_join_id'),
    ]

    subdag_task_groups = subdag.task_group.get_task_group_dict()
    assert subdag_task_groups.keys() == {None, "group234", "group234.group34"}

    included_group_ids = {"group234", "group234.group34"}
    included_task_ids = {
        'group234.group34.task3', 'group234.group34.task4', 'task1', 'task5'
    }

    for task_group in subdag_task_groups.values():
        assert task_group.upstream_group_ids.issubset(included_group_ids)
        assert task_group.downstream_group_ids.issubset(included_group_ids)
        assert task_group.upstream_task_ids.issubset(included_task_ids)
        assert task_group.downstream_task_ids.issubset(included_task_ids)

    for task in subdag.task_group:
        assert task.upstream_task_ids.issubset(included_task_ids)
        assert task.downstream_task_ids.issubset(included_task_ids)

Example #9
def methodPrint(n):
    print('This is odd ::' + str(n))


def the_end():
    print('The End')


with DAG(dag_id='TaskGroup_BranchPythonOperator',
         schedule_interval=None,
         start_date=days_ago(2)) as dag:
    task_1 = PythonOperator(task_id='task_1', python_callable=method1)
    task_2 = BranchPythonOperator(task_id='task_2', python_callable=method2)
    with TaskGroup('group1') as group1:

        task_x = PythonOperator(task_id='task_x',
                                python_callable=printMethod,
                                op_kwargs={'n': 1})
        task_n = [
            PythonOperator(task_id=f'task_{i}',
                           python_callable=printMethod,
                           op_kwargs={'n': i}) for i in range(2, 6)
        ]
        task_x >> task_n

    with TaskGroup('group2') as group2:

        task_x = PythonOperator(task_id='task_x',
                                python_callable=methodPrint,
Example #10
        dih_stg
    2. Creates all tables in the filter database.
        linqdm_filter
    3. Creates all tables in the Fact database.
        linqdm_fdn
    4. Creates all tables in the Base database; for now this database will have only DIH tables.
        dih
       """
    
    # Set the batch id from Airflow dag run
    setbatch = getpythonoperator("BatchId", getBatchId)
    batchid = "{{ ti.xcom_pull(key='batchId', task_ids='Run_BatchId') }}"

    for db in database:

        with TaskGroup(group_id="{}_Tab".format(db)) as run_stage0:
            stagetaskgrp = []
            with TaskGroup(group_id="{}_S2HS".format(db)) as run_stage1:
                for tabname in database[db]["tabname"]:

                    taskname = "CRT_{}_{}".format(db, tabname)
                    taskid = 'TA_' + taskname
                    commands = "base64 -d <<< {} | kinit {}@{} && ssh -o StrictHostKeyChecking=no -o GSSAPIAuthentication=yes -oGSSAPIDelegateCredentials=yes {}@{} '{}'".format(password, kinitprincipal, kinitdomain, kinitprincipal, edgenodehost, "{} {}  {} {} {} {}".format(scriptpaths["hiveload"], tabname , batchid, 'ddl', db, database[db]["type"]))
                    ssh_create_stage = getbashoperator(taskname, False, commands)
                    ssh_create_stage
                    stagetaskgrp.append(run_stage1)
            run_stage1
        group.append(run_stage0) 

    dummyop = DummyOperator(task_id='NoOP')
Example #11
with DAG(
        'covid_data_to_s3',
        start_date=datetime(2019, 1, 1),
        max_active_runs=1,
        # https://airflow.apache.org/docs/stable/scheduler.html#dag-runs
        schedule_interval='@daily',
        default_args=default_args,
        catchup=False  # set to False if you don't want historical DAG runs to be backfilled
) as dag:

    t0 = DummyOperator(task_id='start')

    send_email = EmailOperator(
        task_id='send_email',
        to=email_to,
        subject='Covid to S3 DAG',
        html_content=
        '<p>The Covid to S3 DAG completed successfully. Files can now be found on S3.</p>'
    )

    with TaskGroup('covid_task_group') as covid_group:
        for endpoint in endpoints:
            generate_files = PythonOperator(
                task_id='generate_file_{0}'.format(endpoint),
                python_callable=upload_to_s3,
                op_kwargs={
                    'endpoint': endpoint,
                    'date': date
                })

    t0 >> covid_group >> send_email
Example #12
DAG_ID = os.path.basename(__file__).replace('.py', '')

DEFAULT_ARGS = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email': ['*****@*****.**'],
    'email_on_failure': False,
    'email_on_retry': False,
}

dag_args = {
    "dag_id": DAG_ID,
    "description": 'Run built-in Spark app on Amazon EMR',
    "default_args": DEFAULT_ARGS,
    "dagrun_timeout": timedelta(hours=2),
    "start_date": datetime(2020, 1, 1),
    "schedule_interval": '@once',
    "tags": ['emr'],
}

with DAG(**dag_args) as dag:
    task_1 = BashOperator(task_id='task_1', bash_command='sleep 3')
    with TaskGroup('processing_tasks') as parallel_task:
        task_2 = BashOperator(task_id='task_2', bash_command='sleep 3')
        task_3 = BashOperator(task_id='task_3', bash_command='sleep 3')

    task_4 = BashOperator(task_id='task_4', bash_command='sleep 3')

    task_1 >> parallel_task >> task_4
Example #13
}

with DAG(
        dag_id="example_from_home_estate_nybolig_boliga",
        description="Populate data from home.dk estate.dk and nybolig.dk",
        default_args=args,
        schedule_interval="@daily",
        start_date=datetime(2021, 5, 1, 22, 45),
        catchup=False,
        max_active_runs=4,
        tags=["estate_data"],
) as dag:

    start = DummyOperator(task_id="start")

    with TaskGroup("home", tooltip="Tasks for Home") as home:
        home_scraper_section()

    estate = ScrapEstateOperator(
        task_id="estate",
        url="https://www.estate.dk/Services/PropertySearch/Search",
        api_name="estate.dk",
        scraper_cls=Estate,
        params=params,
    )

    nybolig = ScrapEstateOperator(
        task_id="nybolig",
        url="https://www.nybolig.dk/Services/PropertySearch/Search",
        api_name="nybolig.dk",
        scraper_cls=Nybolig,
Example #14
from airflow.models.dag import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.operators.python import PythonOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup

from dags.dag01.model import load_dump, querys_no_return, querys_with_return

with DAG(dag_id="dag01", start_date=days_ago(1), tags=["test"]) as dag:

    start = PythonOperator(task_id="start",
                           trigger_rule='all_success',
                           python_callable=querys_with_return)

    with TaskGroup("section_1", tooltip="pipeline xpto") as section_1:
        task_load_dump = PythonOperator(task_id="task_load_dump",
                                        trigger_rule='all_success',
                                        python_callable=load_dump)

        task_querys_with_return = PythonOperator(
            task_id="task_querys_with_return",
            trigger_rule='all_success',
            python_callable=querys_with_return)

        task_querys_no_return = PythonOperator(
            task_id="task_querys_no_return",
            trigger_rule='all_success',
            python_callable=querys_no_return)

        task_load_dump >> [task_querys_with_return, task_querys_no_return]
Example #15
with DAG(
    dag_id="demo_xcom",
    description="This is a DAG to demo how x_com works",
    catchup=False,
    max_active_runs=1,
    schedule_interval=timedelta(days=1),
    default_args=default_args,
) as dag:
    task_1 = BashOperator(
        dag=dag,
        task_id="task_1",
        do_xcom_push=False,
        bash_command="sleep 2; echo This is Task 1",
    )

    with TaskGroup("processing_tasks") as processing:
        task_2 = PythonOperator(
            dag=dag, task_id="task_2", python_callable=_return_in_default
        )
        task_3 = PythonOperator(
            dag=dag, task_id="task_3", python_callable=_return_via_ti
        )
        task_4 = PythonOperator(
            dag=dag, task_id="task_4", python_callable=_return_via_ti
        )

    task_5 = PythonOperator(
        dag=dag, task_id="task_5", python_callable=_pick_out_smaller
    )

    task_1 >> processing >> task_5
Example #16
def test_build_task_group_with_prefix():
    """
    Tests that prefix_group_id turns on/off prefixing of task_id with group_id.
    """
    execution_date = pendulum.parse("20200101")
    with DAG("test_build_task_group_with_prefix",
             start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group234", prefix_group_id=False) as group234:
            task2 = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                task3 = DummyOperator(task_id="task3")

                with TaskGroup("group4", prefix_group_id=False) as group4:
                    task4 = DummyOperator(task_id="task4")

        task5 = DummyOperator(task_id="task5")
        task1 >> group234
        group34 >> task5

    assert task2.task_id == "task2"
    assert group34.group_id == "group34"
    assert task3.task_id == "group34.task3"
    assert group4.group_id == "group34.group4"
    assert task4.task_id == "task4"
    assert task5.task_id == "task5"
    assert group234.get_child_by_label("task2") == task2
    assert group234.get_child_by_label("group34") == group34
    assert group4.get_child_by_label("task4") == task4

    assert extract_node_id(
        task_group_to_dict(dag.task_group), include_label=True) == {
            'id':
            None,
            'label':
            None,
            'children': [
                {
                    'id':
                    'group234',
                    'label':
                    'group234',
                    'children': [
                        {
                            'id':
                            'group34',
                            'label':
                            'group34',
                            'children': [
                                {
                                    'id':
                                    'group34.group4',
                                    'label':
                                    'group4',
                                    'children': [{
                                        'id': 'task4',
                                        'label': 'task4'
                                    }],
                                },
                                {
                                    'id': 'group34.task3',
                                    'label': 'task3'
                                },
                                {
                                    'id': 'group34.downstream_join_id',
                                    'label': ''
                                },
                            ],
                        },
                        {
                            'id': 'task2',
                            'label': 'task2'
                        },
                        {
                            'id': 'group234.upstream_join_id',
                            'label': ''
                        },
                    ],
                },
                {
                    'id': 'task1',
                    'label': 'task1'
                },
                {
                    'id': 'task5',
                    'label': 'task5'
                },
            ],
        }
Example #17
def test_build_task_group_with_task_decorator():
    """
    Test that TaskGroup can be used with the @task decorator.
    """
    from airflow.operators.python import task

    @task
    def task_1():
        print("task_1")

    @task
    def task_2():
        return "task_2"

    @task
    def task_3():
        return "task_3"

    @task
    def task_4(task_2_output, task_3_output):
        print(task_2_output, task_3_output)

    @task
    def task_5():
        print("task_5")

    execution_date = pendulum.parse("20200101")
    with DAG("test_build_task_group_with_task_decorator",
             start_date=execution_date) as dag:
        tsk_1 = task_1()

        with TaskGroup("group234") as group234:
            tsk_2 = task_2()
            tsk_3 = task_3()
            tsk_4 = task_4(tsk_2, tsk_3)

        tsk_5 = task_5()

        tsk_1 >> group234 >> tsk_5

    # pylint: disable=no-member
    assert tsk_1.operator in tsk_2.operator.upstream_list
    assert tsk_1.operator in tsk_3.operator.upstream_list
    assert tsk_5.operator in tsk_4.operator.downstream_list
    # pylint: enable=no-member

    assert extract_node_id(task_group_to_dict(dag.task_group)) == {
        'id':
        None,
        'children': [
            {
                'id':
                'group234',
                'children': [
                    {
                        'id': 'group234.task_2'
                    },
                    {
                        'id': 'group234.task_3'
                    },
                    {
                        'id': 'group234.task_4'
                    },
                    {
                        'id': 'group234.upstream_join_id'
                    },
                    {
                        'id': 'group234.downstream_join_id'
                    },
                ],
            },
            {
                'id': 'task_1'
            },
            {
                'id': 'task_5'
            },
        ],
    }

    edges = dag_edges(dag)
    assert sorted((e["source_id"], e["target_id"]) for e in edges) == [
        ('group234.downstream_join_id', 'task_5'),
        ('group234.task_2', 'group234.task_4'),
        ('group234.task_3', 'group234.task_4'),
        ('group234.task_4', 'group234.downstream_join_id'),
        ('group234.upstream_join_id', 'group234.task_2'),
        ('group234.upstream_join_id', 'group234.task_3'),
        ('task_1', 'group234.upstream_join_id'),
    ]
Example #18
from airflow import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.task_group import TaskGroup
from airflow.utils.dates import days_ago

from subdag_factory import subdag_factory

default_args = {'start_date': days_ago(1)}

with DAG('taskgroup',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    extracting = DummyOperator(task_id='extracting')

    with TaskGroup('processing_task_group') as processing_group:
        for l in ['A', 'B', 'C']:
            BashOperator(task_id=f'processing_{l}', bash_command='ls')

    loading = DummyOperator(task_id='loading')

    extracting >> processing_group >> loading
Example #19
def test_dag_edges():
    execution_date = pendulum.parse("20200101")
    with DAG("test_dag_edges", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group_a") as group_a:
            with TaskGroup("group_b") as group_b:
                task2 = DummyOperator(task_id="task2")
                task3 = DummyOperator(task_id="task3")
                task4 = DummyOperator(task_id="task4")
                task2 >> [task3, task4]

            task5 = DummyOperator(task_id="task5")

            task5 << group_b

        task1 >> group_a

        with TaskGroup("group_c") as group_c:
            task6 = DummyOperator(task_id="task6")
            task7 = DummyOperator(task_id="task7")
            task8 = DummyOperator(task_id="task8")
            [task6, task7] >> task8
            group_a >> group_c

        task5 >> task8

        task9 = DummyOperator(task_id="task9")
        task10 = DummyOperator(task_id="task10")

        group_c >> [task9, task10]

        with TaskGroup("group_d") as group_d:
            task11 = DummyOperator(task_id="task11")
            task12 = DummyOperator(task_id="task12")
            task11 >> task12

        group_d << group_c

    nodes = task_group_to_dict(dag.task_group)
    edges = dag_edges(dag)

    assert extract_node_id(nodes) == {
        'id':
        None,
        'children': [
            {
                'id':
                'group_a',
                'children': [
                    {
                        'id':
                        'group_a.group_b',
                        'children': [
                            {
                                'id': 'group_a.group_b.task2'
                            },
                            {
                                'id': 'group_a.group_b.task3'
                            },
                            {
                                'id': 'group_a.group_b.task4'
                            },
                            {
                                'id': 'group_a.group_b.downstream_join_id'
                            },
                        ],
                    },
                    {
                        'id': 'group_a.task5'
                    },
                    {
                        'id': 'group_a.upstream_join_id'
                    },
                    {
                        'id': 'group_a.downstream_join_id'
                    },
                ],
            },
            {
                'id':
                'group_c',
                'children': [
                    {
                        'id': 'group_c.task6'
                    },
                    {
                        'id': 'group_c.task7'
                    },
                    {
                        'id': 'group_c.task8'
                    },
                    {
                        'id': 'group_c.upstream_join_id'
                    },
                    {
                        'id': 'group_c.downstream_join_id'
                    },
                ],
            },
            {
                'id':
                'group_d',
                'children': [
                    {
                        'id': 'group_d.task11'
                    },
                    {
                        'id': 'group_d.task12'
                    },
                    {
                        'id': 'group_d.upstream_join_id'
                    },
                ],
            },
            {
                'id': 'task1'
            },
            {
                'id': 'task10'
            },
            {
                'id': 'task9'
            },
        ],
    }

    assert sorted((e["source_id"], e["target_id"]) for e in edges) == [
        ('group_a.downstream_join_id', 'group_c.upstream_join_id'),
        ('group_a.group_b.downstream_join_id', 'group_a.task5'),
        ('group_a.group_b.task2', 'group_a.group_b.task3'),
        ('group_a.group_b.task2', 'group_a.group_b.task4'),
        ('group_a.group_b.task3', 'group_a.group_b.downstream_join_id'),
        ('group_a.group_b.task4', 'group_a.group_b.downstream_join_id'),
        ('group_a.task5', 'group_a.downstream_join_id'),
        ('group_a.task5', 'group_c.task8'),
        ('group_a.upstream_join_id', 'group_a.group_b.task2'),
        ('group_c.downstream_join_id', 'group_d.upstream_join_id'),
        ('group_c.downstream_join_id', 'task10'),
        ('group_c.downstream_join_id', 'task9'),
        ('group_c.task6', 'group_c.task8'),
        ('group_c.task7', 'group_c.task8'),
        ('group_c.task8', 'group_c.downstream_join_id'),
        ('group_c.upstream_join_id', 'group_c.task6'),
        ('group_c.upstream_join_id', 'group_c.task7'),
        ('group_d.task11', 'group_d.task12'),
        ('group_d.upstream_join_id', 'group_d.task11'),
        ('task1', 'group_a.upstream_join_id'),
    ]
Example #20
def create_dag(dag_id, schedule, window, default_args):
    with DAG(
            dag_id,
            default_args=default_args,
            description='creates sliding windows based on months',
            schedule_interval=schedule,
            start_date=datetime.datetime(2021, 4, 30),
            on_failure_callback=dag_fail_slack_alert,
            on_success_callback=dag_success_slack_alert,
            tags=['selection', 'sliding'],
    ) as dag:

        OUTPUT_DIR = WORKING_DIR + "/data/sliding-windows-bealign/" + '_'.join(
            window)
        default_args["params"]["output-dir"] = OUTPUT_DIR
        default_args["params"][
            "meta-output"] = OUTPUT_DIR + '/master-no-sequences.json'
        default_args["params"]["sequence-output"] = OUTPUT_DIR + '/sequences'

        with open(dag.params["region_cfg"], 'r') as stream:
            regions = yaml.safe_load(stream)

        mk_dir_task = BashOperator(
            task_id='make_directory',
            bash_command='mkdir -p {{params.output}}',
            params={'output': default_args['params']['output-dir']},
            dag=dag,
        )

        export_meta_task = PythonOperator(
            task_id='export_meta',
            python_callable=export_meta,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )

        export_meta_task.set_upstream(mk_dir_task)
        export_sequences_task = PythonOperator(
            task_id='export_sequences',
            python_callable=export_sequences,
            op_kwargs={"config": default_args['params']},
            pool='mongo',
            dag=dag,
        )

        export_sequences_task.set_upstream(mk_dir_task)

        # For each region
        export_by_gene = []

        for gene in regions.keys():

            filepath_prefix = OUTPUT_DIR + '/sequences.' + gene

            nuc_sequence_output = filepath_prefix + '_nuc.fas'
            uniques_fn = filepath_prefix + '_nuc.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'

            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            slac_output_fn = filepath_prefix + '.SLAC.json'
            fel_output_fn = filepath_prefix + '.FEL.json'
            meme_output_fn = filepath_prefix + '.MEME.json'

            summary_output_fn = filepath_prefix + '.json'

            default_args["params"]["nuc-sequence-output"] = nuc_sequence_output
            default_args["params"]["duplicate-output"] = duplicate_output

            with TaskGroup(f"alignment_{gene}") as alignment:

                export_bealign_task = PythonOperator(
                    task_id=f'export_bealign',
                    python_callable=export_bealign_sequences,
                    op_kwargs={
                        "config": default_args['params'],
                        'nuc_output_fn': nuc_sequence_output,
                        'gene': gene
                    },
                    dag=dag,
                )

                # Occasional errors when cleaning up tmp files, so or'ing true
                cleanup_task = BashOperator(
                    task_id='cleanup',
                    bash_command=
                    "sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
                    env={
                        'NUC_OUTPUT_FN': nuc_sequence_output,
                        **os.environ
                    },
                    dag=dag)

                export_bealign_task >> cleanup_task

            with TaskGroup(f"duplicates_{gene}") as duplicates_group:

                compute_duplicates_task = PythonOperator(
                    task_id='write_raw_duplicates',
                    python_callable=write_nuc_raw_duplicates,
                    op_kwargs={
                        "input": nuc_sequence_output,
                        "duplicate_output": duplicate_output,
                        'uniques_output': uniques_fn
                    },
                    dag=dag,
                )

                compute_duplicates_task

            # $HYPHY LIBPATH=$HYPHYLIBPATH $COMPRESSOR --msa ${FILE}.${GENE}.compressed.fas --regexp "epi_isl_([0-9]+)" --duplicates ${FILE}.${GENE}.duplicates.json --output ${FILE}.${GENE}.variants.csv --json ${FILE}.${GENE}.variants.json --duplicate-out ${FILE}.${GENE}.duplicates.variants.json

            with TaskGroup(f"filter_{gene}") as filter:
                COMPRESSOR = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor }} --msa $FASTA_FN --regexp "epi_isl_([0-9]+)" --duplicates $DUPLICATE_FN --output $VARIANTS_CSV_FN  --json $VARIANTS_JSON_FN --duplicate-out $COMPRESSOR_DUPLICATE_OUT
                """
                compressor_task = BashOperator(
                    task_id='compressor',
                    bash_command=COMPRESSOR,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': duplicate_output,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'COMPRESSOR_DUPLICATE_OUT': compressor_duplicate_out,
                        **os.environ
                    },
                    dag=dag)

                # --output-edits ${FILE}.${GENE}.filtered.edits.json
                COMPRESSOR2 = """
                {{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} {{ params.compressor2 }} --msa $FASTA_FN --duplicates $DUPLICATE_FN --csv $VARIANTS_CSV_FN  --byseq $VARIANTS_JSON_FN --p 0.95 --output $FILTERED_FASTA_FN --json $FILTERED_JSON_FN --output-edits ${OUTPUT_EDITS}
                """
                compressor_two_task = BashOperator(
                    task_id='compressor_two',
                    bash_command=COMPRESSOR2,
                    env={
                        'FASTA_FN': uniques_fn,
                        'DUPLICATE_FN': compressor_duplicate_out,
                        'VARIANTS_CSV_FN': variants_csv_output,
                        'VARIANTS_JSON_FN': variants_json_output,
                        'FILTERED_FASTA_FN': filtered_fasta_output,
                        'FILTERED_JSON_FN': filtered_json_output,
                        'OUTPUT_EDITS': output_edits_fn,
                        **os.environ
                    },
                    dag=dag)

                compressor_task >> compressor_two_task

            INFER_TREE = """
            seqmagick convert $FILTERED_FASTA_FN $STO_OUTPUT;
            rapidnj $STO_OUTPUT -i sth > $TREE_OUTPUT
            sed -i "s/'//g" $TREE_OUTPUT;
            """

            infer_tree_task = BashOperator(
                task_id=f'infer_tree_{gene}',
                bash_command=INFER_TREE,
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'STO_OUTPUT': sto_output,
                    'TREE_OUTPUT': tree_output,
                    **os.environ
                },
                dag=dag)

            slac_task = BashOperator(
                task_id=f'slac_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} slac --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches All --samples 0 --output $SLAC_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    **os.environ
                },
                dag=dag,
            )

            big_data_flags = '--full-model No'

            fel_task = BashOperator(
                task_id=f'fel_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} fel --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $FEL_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'FEL_OUTPUT': fel_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            meme_task = BashOperator(
                task_id=f'meme_{gene}',
                bash_command=
                "{{ params.hyphy }} LIBPATH={{params.hyphy_lib_path}} meme --kill-zero-lengths Constrain ENV='_DO_TREE_REBALANCE_=1' $BIG_DATA_FLAGS --alignment $FILTERED_FASTA_FN --tree $TREE_OUTPUT --branches Internal --output $MEME_OUTPUT",
                env={
                    'FILTERED_FASTA_FN': filtered_fasta_output,
                    'TREE_OUTPUT': tree_output,
                    'MEME_OUTPUT': meme_output_fn,
                    'BIG_DATA_FLAGS': big_data_flags,
                    **os.environ
                },
                dag=dag,
            )

            annotation_file = filepath_prefix + '.annotation.json'
            copy_annotation_task = BashOperator(
                task_id=f'copy_annotation_{gene}',
                bash_command=
                'cp {{params.working_dir}}/data/comparative-annotation.json {{params.annotation_file}}',
                params={
                    'annotation_file': annotation_file,
                    'working_dir': WORKING_DIR
                },
                dag=dag)

            summarize_gene_task = BashOperator(
                task_id=f'summarize_gene_{gene}',
                bash_command=
                '{{ params.python }} {{params.working_dir}}/python/summarize_gene.py -T {{params.working_dir}}/data/ctl/epitopes.json -B {{params.working_dir}}/data/single_mut_effects.csv -D $MASTERNOFASTA -d $DUPLICATES -s $SLAC_OUTPUT -f $FEL_OUTPUT -m $MEME_OUTPUT -P 0.1 --output  $SUMMARY_OUTPUT -c $COMPRESSED_OUTPUT_FN -E {{params.working_dir}}/data/evo_annotation.json -A {{params.working_dir}}/data/mafs.csv -V {{params.working_dir}}/data/evo_freqs.csv -F $FRAGMENT --frame_shift $ADDSHIFT --fragment_shift $SHIFT -S $OFFSET -O $ANNOTATION',
                params={
                    'python': default_args['params']['python'],
                    'working_dir': WORKING_DIR
                },
                env={
                    'MASTERNOFASTA': default_args["params"]["meta-output"],
                    'DUPLICATES': duplicate_output,
                    'SLAC_OUTPUT': slac_output_fn,
                    'FEL_OUTPUT': fel_output_fn,
                    'MEME_OUTPUT': meme_output_fn,
                    'SUMMARY_OUTPUT': summary_output_fn,
                    'COMPRESSED_OUTPUT_FN': filtered_fasta_output,
                    'FRAGMENT': str(regions[gene]['fragment']),
                    'ADDSHIFT': str(regions[gene]['add_one']),
                    'SHIFT': str(regions[gene]['shift']),
                    'OFFSET': str(regions[gene]['offset']),
                    'ANNOTATION': annotation_file,
                    **os.environ
                },
                dag=dag,
            )

            summarize_gene_task.set_upstream(export_meta_task)
            alignment.set_upstream(export_sequences_task)
            export_by_gene.append(
                alignment >> duplicates_group >> filter >> infer_tree_task >> [
                    slac_task, fel_task, meme_task
                ] >> copy_annotation_task >> summarize_gene_task)

        dag.doc_md = __doc__

        # Add export meta and export sequence tasks to be executed in parallel
        cross_downstream([export_meta_task, export_sequences_task],
                         export_by_gene)

        return dag
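
The snippet above ends with `return dag` inside a builder function that is parameterised by a sliding window (note the `window`, `schedule` and `WORKING_DIR` names bound outside the excerpt). A minimal, self-contained sketch of that DAG-factory pattern follows; the factory name, window list and placeholder task are assumptions for illustration, not taken from the source.

# Sketch only: one DAG object is built and registered per sliding window.
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy import DummyOperator


def make_window_dag(window, schedule):
    # 'window' is a tuple of date strings; the real factory builds the full
    # pipeline shown above instead of a single placeholder task.
    with DAG(dag_id='sliding_windows_' + '_'.join(window),
             schedule_interval=schedule,
             start_date=datetime(2021, 4, 30),
             catchup=False) as dag:
        DummyOperator(task_id='placeholder')
    return dag


for window in [('2021-02-01', '2021-04-30'), ('2021-03-01', '2021-05-31')]:
    # Assigning into globals() makes each generated DAG visible to the scheduler.
    globals()['sliding_windows_' + '_'.join(window)] = make_window_dag(window, '@weekly')
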
from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.subdag import SubDagOperator
from datetime import datetime
from airflow.utils.task_group import TaskGroup

import json
from subdags.subdag_parallel_dag import subdag_parallel_dag

default_args = {'start_date': datetime(2020, 1, 1)}

with DAG('parallel_task_group',
         schedule_interval='@daily',
         default_args=default_args,
         catchup=False) as dag:

    task_1 = BashOperator(task_id='task_1', bash_command="sleep 3")

    with TaskGroup("processing_task") as processing_task:
        task_2 = BashOperator(task_id='task_2', bash_command="sleep 3")

        with TaskGroup("Spark_task") as Spark_task:
            task_2 = BashOperator(task_id='task_2', bash_command="sleep 3")

        with TaskGroup("flink_task") as flink_task:

            task_3 = BashOperator(task_id='task_3', bash_command="sleep 3")

    task_4 = BashOperator(task_id='task_4', bash_command="sleep 3")

    task_1 >> processing_task >> task_4
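
In the example above both the outer group and the nested Spark_task group declare an operator with task_id='task_2'. With the default prefix_group_id=True this does not raise DuplicateTaskIdFound, because each task id is prefixed with its group path. A short sketch (assuming Airflow 2.x defaults) that makes the resulting ids explicit:

# Sketch only: shows how nested TaskGroups prefix task ids, so the two
# 'task_2' declarations above end up with distinct ids.
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.utils.task_group import TaskGroup

with DAG('prefix_demo', start_date=datetime(2020, 1, 1), schedule_interval=None):
    with TaskGroup('processing_task'):
        outer = BashOperator(task_id='task_2', bash_command='sleep 3')
        with TaskGroup('Spark_task'):
            inner = BashOperator(task_id='task_2', bash_command='sleep 3')

print(outer.task_id)  # processing_task.task_2
print(inner.task_id)  # processing_task.Spark_task.task_2
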
Example #22
0
            uniques_fn = filepath_prefix + '.uniques.fas'
            duplicate_output = filepath_prefix + '.duplicates.json'

            variants_csv_output = filepath_prefix + '.variants.csv'
            variants_json_output = filepath_prefix + '.variants.json'
            filtered_fasta_output = filepath_prefix + '.compressed.filtered.fas'
            filtered_json_output = filepath_prefix + '.filtered.json'
            output_edits_fn = filepath_prefix + '.filtered.edits.json'
            tn93_output = filepath_prefix + '.tn93.csv'

            compressor_duplicate_out = filepath_prefix + '.duplicates.variants.json'

            tree_output = filepath_prefix + '.compressed.filtered.fas.rapidnj.bestTree'
            sto_output = filepath_prefix + '.compressed.filtered.sto'

            with TaskGroup(f"alignment_{month_str}_{gene}") as alignment:

                # Occasional errors occur when cleaning up tmp files, so the command is OR'd with "|| true"
                cleanup_task = BashOperator(
                    task_id='cleanup',
                    bash_command=
                    "sed -i '/^>/! s/[^ACTG-]/N/g' $NUC_OUTPUT_FN || true",
                    env={
                        'NUC_OUTPUT_FN': nuc_sequence_output,
                        **os.environ
                    },
                    dag=dag)

                cleanup_task

            with TaskGroup(
Example #23
0
}

# --------------------------------------------------------------------------------
# Main DAG
# --------------------------------------------------------------------------------

with models.DAG('delete_tables_dag',
                default_args=default_args,
                schedule_interval=None) as dag:
    start = dummy.DummyOperator(task_id='start', trigger_rule='all_success')

    end = dummy.DummyOperator(task_id='end', trigger_rule='all_success')

    # BigQuery tables are deleted here for demo purposes.
    # Consider a dedicated pipeline or tool for a real-life scenario.
    with TaskGroup('delete_table') as delete_table:
        delete_table_customers = BigQueryDeleteTableOperator(
            task_id="delete_table_customers",
            deletion_dataset_table=DWH_LAND_PRJ + "." + DWH_LAND_BQ_DATASET +
            ".customers",
            impersonation_chain=[TRF_SA_DF])

        delete_table_purchases = BigQueryDeleteTableOperator(
            task_id="delete_table_purchases",
            deletion_dataset_table=DWH_LAND_PRJ + "." + DWH_LAND_BQ_DATASET +
            ".purchases",
            impersonation_chain=[TRF_SA_DF])

        delete_table_customer_purchase_curated = BigQueryDeleteTableOperator(
            task_id="delete_table_customer_purchase_curated",
            deletion_dataset_table=DWH_CURATED_PRJ + "." +
Example #24
0
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from subdags.subdag_parallel_dag import subdag_parallel_dag
from airflow.utils.task_group import TaskGroup
default_args = {
    'start_date': datetime(2020, 1, 1)
}

with DAG('parallel_dag', schedule_interval='@daily',
         default_args=default_args, catchup=False) as dag:

    task1 = BashOperator(
        task_id='task1',
        bash_command='sleep 3'
    )

    with TaskGroup('processing_tasks') as processing_tasks:
        task2 = BashOperator(
            task_id='task2',
            bash_command='sleep 3'    
        )
        with TaskGroup('spark_tasks') as spark_tasks:
            task3 = BashOperator(
                task_id='task3',
                bash_command='sleep 3'        
            )

#    processing = SubDagOperator(
#         task_id = 'processing_tasks',
#         subdag=subdag_parallel_dag('parallel_dag','processing_tasks',default_args)
#     )   
 
Example #25
0
    def deserialize_dag(cls, encoded_dag: Dict[str, Any]) -> 'SerializedDAG':
        """Deserializes a DAG from a JSON object."""
        dag = SerializedDAG(dag_id=encoded_dag['_dag_id'])

        for k, v in encoded_dag.items():
            if k == "_downstream_task_ids":
                v = set(v)
            elif k == "tasks":

                SerializedBaseOperator._load_operator_extra_links = cls._load_operator_extra_links

                v = {
                    task["task_id"]:
                    SerializedBaseOperator.deserialize_operator(task)
                    for task in v
                }
                k = "task_dict"
            elif k == "timezone":
                v = cls._deserialize_timezone(v)
            elif k == "dagrun_timeout":
                v = cls._deserialize_timedelta(v)
            elif k.endswith("_date"):
                v = cls._deserialize_datetime(v)
            elif k == "edge_info":
                # Value structure matches exactly
                pass
            elif k == "timetable":
                v = _decode_timetable(v)
            elif k in cls._decorated_fields:
                v = cls._deserialize(v)
            elif k == "params":
                v = cls._deserialize_params_dict(v)
            # else use v as it is

            setattr(dag, k, v)

        # A DAG is always serialized with only one of schedule_interval and
        # timetable. This back-populates the other to ensure the two attributes
        # line up correctly on the DAG instance.
        if "timetable" in encoded_dag:
            dag.schedule_interval = dag.timetable.summary
        else:
            dag.timetable = create_timetable(dag.schedule_interval,
                                             dag.timezone)

        # Set _task_group

        if "_task_group" in encoded_dag:
            dag._task_group = SerializedTaskGroup.deserialize_task_group(  # type: ignore
                encoded_dag["_task_group"], None, dag.task_dict)
        else:
            # This must be old data that had no task_group. Create a root TaskGroup and add
            # all tasks to it.
            dag._task_group = TaskGroup.create_root(dag)
            for task in dag.tasks:
                dag.task_group.add(task)

        # Set has_on_*_callbacks to True if they exist in Serialized blob as False is the default
        if "has_on_success_callback" in encoded_dag:
            dag.has_on_success_callback = True
        if "has_on_failure_callback" in encoded_dag:
            dag.has_on_failure_callback = True

        keys_to_set_none = (dag.get_serialized_fields() - encoded_dag.keys() -
                            cls._CONSTRUCTOR_PARAMS.keys())
        for k in keys_to_set_none:
            setattr(dag, k, None)

        for task in dag.task_dict.values():
            task.dag = dag
            serializable_task: BaseOperator = task

            for date_attr in ["start_date", "end_date"]:
                if getattr(serializable_task, date_attr) is None:
                    setattr(serializable_task, date_attr,
                            getattr(dag, date_attr))

            if serializable_task.subdag is not None:
                setattr(serializable_task.subdag, 'parent_dag', dag)
                serializable_task.subdag.is_subdag = True

            for task_id in serializable_task.downstream_task_ids:
                # Bypass set_upstream etc here - it does more than we want

                dag.task_dict[task_id]._upstream_task_ids.add(
                    serializable_task.task_id)

        return dag
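
deserialize_dag above is normally reached through SerializedDAG.from_dict, the counterpart of SerializedDAG.to_dict. A minimal round-trip sketch, assuming the Airflow 2.x serialization module layout:

# Sketch of a serialize/deserialize round trip; from_dict() ends up calling
# deserialize_dag() shown above, restoring the TaskGroup hierarchy as well.
from datetime import datetime

from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.serialization.serialized_objects import SerializedDAG
from airflow.utils.task_group import TaskGroup

with DAG('roundtrip_demo', start_date=datetime(2021, 1, 1)) as dag:
    with TaskGroup('group1'):
        DummyOperator(task_id='task1')

payload = SerializedDAG.to_dict(dag)         # JSON-serializable representation
restored = SerializedDAG.from_dict(payload)  # calls deserialize_dag internally
print(list(restored.task_group.children))    # ['group1']
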
Example #26
0
 def _add_benchmarks(self, task_group):
     with TaskGroup(task_group, prefix_group_id=True, dag=self.dag) as benchmarks:
         benchmark_tasks = self._get_e2e_benchmarks(task_group).get_benchmarks()
         chain(*benchmark_tasks)
     return benchmarks
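
chain(*benchmark_tasks) above wires the collected tasks into a linear sequence inside the group: chain(t1, t2, t3) is equivalent to t1 >> t2 >> t3. A small sketch of the same idea, with made-up task ids:

# Sketch: chaining tasks generated in a loop inside a TaskGroup, mirroring the
# chain(*benchmark_tasks) call above.
from datetime import datetime

from airflow import DAG
from airflow.models.baseoperator import chain
from airflow.operators.dummy import DummyOperator
from airflow.utils.task_group import TaskGroup

with DAG('chain_demo', start_date=datetime(2021, 1, 1), schedule_interval=None):
    with TaskGroup('benchmarks', prefix_group_id=True):
        benchmark_tasks = [DummyOperator(task_id=f'benchmark_{i}') for i in range(3)]
        chain(*benchmark_tasks)  # benchmark_0 >> benchmark_1 >> benchmark_2
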
Example #27
0
    def deserialize_dag(cls, encoded_dag: Dict[str, Any]) -> 'SerializedDAG':
        """Deserializes a DAG from a JSON object."""
        dag = SerializedDAG(dag_id=encoded_dag['_dag_id'])

        for k, v in encoded_dag.items():
            if k == "_downstream_task_ids":
                v = set(v)
            elif k == "tasks":
                # pylint: disable=protected-access
                SerializedBaseOperator._load_operator_extra_links = cls._load_operator_extra_links
                # pylint: enable=protected-access
                v = {
                    task["task_id"]:
                    SerializedBaseOperator.deserialize_operator(task)
                    for task in v
                }
                k = "task_dict"
            elif k == "timezone":
                v = cls._deserialize_timezone(v)
            elif k in {"dagrun_timeout"}:
                v = cls._deserialize_timedelta(v)
            elif k.endswith("_date"):
                v = cls._deserialize_datetime(v)
            elif k == "edge_info":
                # Value structure matches exactly
                pass
            elif k in cls._decorated_fields:
                v = cls._deserialize(v)
            # else use v as it is

            setattr(dag, k, v)

        # Set _task_group
        # pylint: disable=protected-access
        if "_task_group" in encoded_dag:
            dag._task_group = SerializedTaskGroup.deserialize_task_group(  # type: ignore
                encoded_dag["_task_group"], None, dag.task_dict)
        else:
            # This must be old data that had no task_group. Create a root TaskGroup and add
            # all tasks to it.
            dag._task_group = TaskGroup.create_root(dag)
            for task in dag.tasks:
                dag.task_group.add(task)
        # pylint: enable=protected-access

        # Set has_on_*_callbacks to True if they exist in Serialized blob as False is the default
        if "has_on_success_callback" in encoded_dag:
            dag.has_on_success_callback = True
        if "has_on_failure_callback" in encoded_dag:
            dag.has_on_failure_callback = True

        keys_to_set_none = (dag.get_serialized_fields() - encoded_dag.keys() -
                            cls._CONSTRUCTOR_PARAMS.keys())
        for k in keys_to_set_none:
            setattr(dag, k, None)

        setattr(dag, 'full_filepath', dag.fileloc)
        for task in dag.task_dict.values():
            task.dag = dag
            serializable_task: BaseOperator = task

            for date_attr in ["start_date", "end_date"]:
                if getattr(serializable_task, date_attr) is None:
                    setattr(serializable_task, date_attr,
                            getattr(dag, date_attr))

            if serializable_task.subdag is not None:
                setattr(serializable_task.subdag, 'parent_dag', dag)
                serializable_task.subdag.is_subdag = True

            for task_id in serializable_task.downstream_task_ids:
                # Bypass set_upstream etc here - it does more than we want
                # noqa: E501 # pylint: disable=protected-access
                dag.task_dict[task_id]._upstream_task_ids.add(
                    serializable_task.task_id)

        return dag
Example #28
0
 def _make_task_group(self, **kwargs) -> TaskGroup:
     return TaskGroup(**kwargs)
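
The one-line _make_task_group hook above simply forwards its keyword arguments to TaskGroup. A hypothetical sketch (class and argument names are assumptions, not from the source) of why such a seam is useful, for example to inject defaults in a subclass:

# Hypothetical illustration of overriding a TaskGroup factory hook to add a
# default tooltip; nothing here is taken from the source class.
from airflow.utils.task_group import TaskGroup


class GroupBuilder:
    def _make_task_group(self, **kwargs) -> TaskGroup:
        return TaskGroup(**kwargs)


class TooltippedGroupBuilder(GroupBuilder):
    def _make_task_group(self, **kwargs) -> TaskGroup:
        kwargs.setdefault('tooltip', 'auto-generated group')
        return super()._make_task_group(**kwargs)
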
Example #29
0
default_args = {
    'owner': 'teste',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'retries': 0,
}

with DAG('dag-pipeline-iris-aula-v1',
         schedule_interval=timedelta(minutes=10),
         catchup=False,
         default_args=default_args) as dag:

    start = DummyOperator(task_id="start")

    with TaskGroup("etl", tooltip="etl") as etl:

        t1 = BashOperator(dag=dag,
                          task_id='download_dataset',
                          bash_command="""
            cd {0}/featurestore
            curl -o iris.txt  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
            """.format(pathScript))

        [t1]

    with TaskGroup("preProcessing", tooltip="preProcessing") as preProcessing:
        t2 = BashOperator(dag=dag,
                          task_id='encoder_dataset',
                          bash_command="""
            cd {0}
Example #30
0
        with open(tempfile, 'r') as f:
            cursor = conn.cursor()
            cursor.copy_expert(query, f)
            conn.commit()
    finally:
        conn.close()
        os.remove(tempfile)


with DAG(dag_id=dag_id, schedule_interval=None, catchup=False, start_date=days_ago(1)) as dag:

    pause_dags_t = PythonOperator(
        task_id="pause_dags",
        python_callable=pause_dags
    )
    with TaskGroup(group_id='import') as import_t:
        for x in OBJECTS_TO_IMPORT:
            load_task = PythonOperator(
                task_id=x[1],
                python_callable=load_data,
                op_kwargs={'query': x[0], 'file': x[1]},
                provide_context=True
            )
        load_variable_t = PythonOperator(
            task_id="variable",
            python_callable=importVariable
        )

    load_task_instance_t = PythonOperator(
        task_id="load_ti",
        op_kwargs={'query': TASK_INSTANCE_IMPORT, 'file': 'task_instance.csv'},