Example #1
"""
autovacuum_cleanup

A maintenance workflow that runs `VACUUM ANALYZE` nightly on all materialized views in
the MOVE database, to prevent the service degradation and downtime that large autovacuum
operations can otherwise cause.

This was put in place after [issue #482](https://github.com/CityofToronto/bdit_flashcrow/issues/482).
The default autovacuum settings in AWS RDS allowed a large number of dead tuples to build up
in our `collisions` schema, resulting in unplanned downtime once autovacuum ran.
"""
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2020, 8, 19)
SCHEDULE_INTERVAL = '0 23 * * *'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

AUTOVACUUM_CLEANUP = create_bash_task_nested(DAG, 'autovacuum_cleanup')
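
The script behind this task isn't shown in the listing. As a rough sketch of what an equivalent inline task could look like, assuming a `psql`-based approach and a hypothetical `$MOVE_DATABASE_URL` connection string (neither is confirmed by the source; `pg_matviews` and the two-stage pipe are standard PostgreSQL/psql):

from airflow.operators.bash_operator import BashOperator

# Hypothetical inline equivalent: generate one VACUUM ANALYZE statement per
# materialized view, then execute them all via a second psql invocation.
# $MOVE_DATABASE_URL is an assumed connection string, not from the source.
AUTOVACUUM_CLEANUP_INLINE = BashOperator(
    task_id='autovacuum_cleanup_inline',
    bash_command=(
        'psql "$MOVE_DATABASE_URL" -Atc '
        '"SELECT format(\'VACUUM ANALYZE %I.%I;\', schemaname, matviewname) '
        'FROM pg_matviews" | psql "$MOVE_DATABASE_URL"'),
    dag=DAG)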
Example #2
# - in there, look for the URL under `result.resources[].url`.
#
TASKS = {
    'centreline': {
        'resource_url':
        'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/1d079757-377b-4564-82df-eb5638583bfb/resource/6b5415cc-864f-4702-9296-46520f479b0f/download/centreline_wgs84_v2.zip',
        'source_srid': 3857
    },
    'centreline_intersection': {
        'resource_url':
        'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2c83f641-7808-49ba-b80f-7011851d4e27/resource/1d84f2f9-f551-477e-a7fa-f92caf2ae28d/download/intersection-file-wgs84.zip',
        'source_srid': 4326
    }
}
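
# For reference: each `resource_url` above can be rediscovered through CKAN's
# standard action API, as the truncated comment above describes. A hypothetical
# lookup (not part of this DAG; the dataset UUID is the one embedded in the
# centreline URL above):
#
#   import requests
#   resp = requests.get(
#       'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show',
#       params={'id': '1d079757-377b-4564-82df-eb5638583bfb'})
#   urls = [r['url'] for r in resp.json()['result']['resources']]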

INDEX_OPENDATA = create_bash_task_nested(DAG, 'index_opendata')

for task_id, params in TASKS.items():
    task_id_extract = '{0}_extract'.format(task_id)
    EXTRACT_OPENDATA_SHAPEFILE = BashOperator(
        task_id=task_id_extract,
        bash_command='/copy_opendata_shapefiles/extract_opendata_shapefile.sh',
        params={
            'name': task_id,
            'resource_url': params['resource_url']
        },
        dag=DAG)

    task_id_load = '{0}_load'.format(task_id)
    LOAD_SHAPEFILE = BashOperator(
        task_id=task_id_load,
        # The rest of this call is cut off in the source; the arguments below
        # follow the extract task's pattern and are assumed, not original.
        bash_command='/copy_opendata_shapefiles/load_shapefile.sh',
        params={
            'name': task_id,
            'source_srid': params['source_srid']
        },
        dag=DAG)

    EXTRACT_OPENDATA_SHAPEFILE >> LOAD_SHAPEFILE
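
Because these `bash_command` values end in `.sh`, Airflow's `BashOperator` renders the script file as a Jinja template before executing it, so the `params` entries are available inside the script as `{{ params.name }}`, `{{ params.resource_url }}`, and so on. A minimal inline illustration of the same mechanism (the `curl` command is illustrative, not the actual script):

from airflow.operators.bash_operator import BashOperator

# Jinja substitutes {{ params.* }} into the command before it runs.
EXAMPLE_EXTRACT = BashOperator(
    task_id='example_extract',
    bash_command='curl -L -o "/tmp/{{ params.name }}.zip" "{{ params.resource_url }}"',
    params={
        'name': 'centreline',
        'resource_url': TASKS['centreline']['resource_url']
    },
    dag=DAG)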
Example #3
"""
centreline_vector_tiles

Generates vector tiles from the MOVE conflation target, which is built by the
`centreline_conflation_target` DAG.  These are stored in `/data/tiles`, and are served from
`/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render
interactive centreline features.

This is intended to run after `centreline_conflation_target`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2019, 5, 5)
SCHEDULE_INTERVAL = '10 5 * * 6'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

LOAD_VOLUME = create_bash_task_nested(DAG, 'load_volume')
BUILD_VECTOR_TILES = create_bash_task_nested(DAG, 'build_vector_tiles')
EXTRACT_VECTOR_TILES = create_bash_task_nested(DAG, 'extract_vector_tiles')

LOAD_VOLUME >> BUILD_VECTOR_TILES
BUILD_VECTOR_TILES >> EXTRACT_VECTOR_TILES
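
The trailing `>>` lines are why these files carry `# pylint: disable=pointless-statement`: each one is an expression statement evaluated only for its side effect of registering a dependency. They are shorthand for Airflow's explicit API:

# Equivalent to the two `>>` statements above.
LOAD_VOLUME.set_downstream(BUILD_VECTOR_TILES)
BUILD_VECTOR_TILES.set_downstream(EXTRACT_VECTOR_TILES)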
Example #4
"""
collisions_vector_tiles

Generates vector tiles from collisions data, which is built by the `crash_geocoding` DAG.
These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they
are used by `FcPaneMap` in the web frontend to render collisions heatmaps when zoomed out.

This is intended to run after `crash_geocoding`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2019, 12, 8)
SCHEDULE_INTERVAL = '0 3 * * 6'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

BUILD_COLLISIONS_TILES = create_bash_task_nested(DAG, 'build_collisions_tiles')
EXTRACT_COLLISIONS_TILES = create_bash_task_nested(DAG,
                                                   'extract_collisions_tiles')

BUILD_COLLISIONS_TILES >> EXTRACT_COLLISIONS_TILES
Example #5
"""
font_glyphs

Generates signed-distance field font glyphs using the `openmaptiles/fonts` repo on GitHub.
This allows us to serve our own fonts for use with MapLibre GL.

The repo is cloned to `/data/fonts`, and the resulting font glyphs are stored in `/data/glyphs`.
These are served at `/glyphs` on our web EC2 instances.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2020, 10, 29)
SCHEDULE_INTERVAL = '0 0 1 * *'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

FONT_GLYPHS = create_bash_task_nested(DAG, 'font_glyphs')
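
Every example imports `create_dag` and `create_bash_task_nested` from the project's `airflow_utils` module, which the listing never shows. A plausible sketch, assuming the helpers are thin wrappers over `DAG` and `BashOperator` (the DAG-id and script-path conventions below are guesses, not the actual implementation):

import os

from airflow import DAG
from airflow.operators.bash_operator import BashOperator


def create_dag(filename, doc, start_date, schedule_interval):
    # Assumed: derive the DAG id from the calling file, e.g. 'font_glyphs.py'
    # becomes 'font_glyphs', and surface the module docstring in the UI.
    dag_id = os.path.splitext(os.path.basename(filename))[0]
    dag = DAG(dag_id, start_date=start_date,
              schedule_interval=schedule_interval)
    dag.doc_md = doc
    return dag


def create_bash_task_nested(dag, task_id):
    # Assumed: each task wraps a shell script named after the DAG and task,
    # consistent with the '/copy_opendata_shapefiles/...' path in Example #2.
    return BashOperator(
        task_id=task_id,
        bash_command='/{0}/{1}.sh'.format(dag.dag_id, task_id),
        dag=dag)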
Example #6
Builds the [Traffic Volumes at Intersections for All Modes](https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/)
dataset for the City of Toronto Open Data Portal.

The dataset is exposed in two ways: via the database and via HTTP.  We store the dataset as a series
of views in the `open_data` schema.  We also dump those views to CSV files at `/data/open_data`,
which is served from `/open_data` on our ETL EC2 instances.

This is intended to run after `group_multiday_counts`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2021, 3, 2)
SCHEDULE_INTERVAL = '15 7 * * 6'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

A1_TMCS_COUNT_DATA = create_bash_task_nested(DAG, 'A1_tmcs_count_data')
A1_TMCS_COUNT_METADATA = create_bash_task_nested(DAG, 'A1_tmcs_count_metadata')
A2_TMCS_LOCATIONS = create_bash_task_nested(DAG, 'A2_tmcs_locations')
A3_TMCS_JOINED = create_bash_task_nested(DAG, 'A3_tmcs_joined')
A4_TMCS_DECADES = create_bash_task_nested(DAG, 'A4_tmcs_decades')
A4_TMCS_PREVIEW = create_bash_task_nested(DAG, 'A4_tmcs_preview')

A1_TMCS_COUNT_DATA >> A2_TMCS_LOCATIONS
A1_TMCS_COUNT_METADATA >> A2_TMCS_LOCATIONS
A2_TMCS_LOCATIONS >> A3_TMCS_JOINED
A3_TMCS_JOINED >> A4_TMCS_DECADES
A3_TMCS_JOINED >> A4_TMCS_PREVIEW
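
Airflow also accepts lists on either side of `>>`, so the fan-in to `A2_TMCS_LOCATIONS` and the fan-out from `A3_TMCS_JOINED` can be written more compactly; this is an equivalent form, not a change in behaviour:

# Same dependency graph as the five `>>` statements above.
[A1_TMCS_COUNT_DATA, A1_TMCS_COUNT_METADATA] >> A2_TMCS_LOCATIONS
A2_TMCS_LOCATIONS >> A3_TMCS_JOINED
A3_TMCS_JOINED >> [A4_TMCS_DECADES, A4_TMCS_PREVIEW]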
Example #7
not correspond to actual intersections.  To fix this, the routing target fills in vertices for
those intersection IDs.  When routing corridors between centreline features, we use the routing
target, then filter the result down to only those features in the conflation target.

This is intended to run after `copy_opendata_shapefiles`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2020, 12, 21)
SCHEDULE_INTERVAL = '30 5 * * 6'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

A0_INTERSECTIONS_BASE = create_bash_task_nested(DAG, 'A0_intersections_base')
A0_MIDBLOCKS_BASE = create_bash_task_nested(DAG, 'A0_midblocks_base')
A1_INTERSECTION_IDS = create_bash_task_nested(DAG, 'A1_intersection_ids')
A2_INTERSECTIONS = create_bash_task_nested(DAG, 'A2_intersections')
A3_MIDBLOCK_NAMES = create_bash_task_nested(DAG, 'A3_midblock_names')
A4_MIDBLOCKS = create_bash_task_nested(DAG, 'A4_midblocks')
A5_ROUTING_VERTICES = create_bash_task_nested(DAG, 'A5_routing_vertices')
A6_ROUTING_EDGES = create_bash_task_nested(DAG, 'A6_routing_edges')

A0_INTERSECTIONS_BASE >> A1_INTERSECTION_IDS
A0_MIDBLOCKS_BASE >> A1_INTERSECTION_IDS
A1_INTERSECTION_IDS >> A2_INTERSECTIONS
A2_INTERSECTIONS >> A3_MIDBLOCK_NAMES
A3_MIDBLOCK_NAMES >> A4_MIDBLOCKS
A4_MIDBLOCKS >> A5_ROUTING_VERTICES
A5_ROUTING_VERTICES >> A6_ROUTING_EDGES
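
The tail of this graph, `A1_INTERSECTION_IDS` through `A6_ROUTING_EDGES`, is purely linear, so it can also be expressed with Airflow's `chain` helper (`airflow.utils.helpers.chain` in Airflow 1.x); the two `A0_*` fan-ins would still use `>>`:

from airflow.utils.helpers import chain

# Same linear ordering as the five `>>` statements from A1 onward.
chain(A1_INTERSECTION_IDS, A2_INTERSECTIONS, A3_MIDBLOCK_NAMES,
      A4_MIDBLOCKS, A5_ROUTING_VERTICES, A6_ROUTING_EDGES)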
Example #8
This same heuristic was used by the legacy CRASH system to assign collisions to intersections
and midblocks.  (However, CRASH did not use the Toronto Centreline, but instead used a legacy
map layer that has been deprecated and is no longer maintained by the City.)

This is intended to run after `replicator_transfer_crash` and `centreline_conflation_target`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2019, 7, 17)
SCHEDULE_INTERVAL = '20 19 * * 1-5'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

A1_EVENTS_FIELDS_RAW = create_bash_task_nested(DAG, 'A1_events_fields_raw')
A2_EVENTS_FIELDS_NORM = create_bash_task_nested(DAG, 'A2_events_fields_norm')
A2_INVOLVED_FIELDS_RAW = create_bash_task_nested(DAG, 'A2_involved_fields_raw')
A3_INVOLVED_FIELDS_NORM = create_bash_task_nested(DAG,
                                                  'A3_involved_fields_norm')
A4_INVOLVED = create_bash_task_nested(DAG, 'A4_involved')
A5_EVENTS = create_bash_task_nested(DAG, 'A5_events')
A6_EVENTS_INTERSECTIONS = create_bash_task_nested(DAG,
                                                  'A6_events_intersections')
A6_EVENTS_SEGMENTS = create_bash_task_nested(DAG, 'A6_events_segments')
A7_EVENTS_CENTRELINE = create_bash_task_nested(DAG, 'A7_events_centreline')

A1_EVENTS_FIELDS_RAW >> A2_EVENTS_FIELDS_NORM
A1_EVENTS_FIELDS_RAW >> A2_INVOLVED_FIELDS_RAW
A2_EVENTS_FIELDS_NORM >> A3_INVOLVED_FIELDS_NORM
A2_INVOLVED_FIELDS_RAW >> A3_INVOLVED_FIELDS_NORM
Example #9
"""
docs_pg_autodoc

A documentation workflow that uses `postgresql_autodoc` to generate an HTML page with info
on tables and views in the `flashcrow` database.

The resulting page is stored at `/data/docs`, and is served at `/docs` from our ETL EC2 instances.
"""
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2021, 8, 20)
SCHEDULE_INTERVAL = '0 0 * * *'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

DOCS_PG_AUTODOC = create_bash_task_nested(DAG, 'docs_pg_autodoc')
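
`postgresql_autodoc` is a standalone command-line tool, and the wrapped script isn't shown. A minimal sketch of an inline equivalent, assuming the database name from the docstring and an assumed output directory and connection setup:

from airflow.operators.bash_operator import BashOperator

# Hypothetical inline equivalent: generate HTML documentation for the
# flashcrow database into /data/docs ('-t html' selects the HTML template).
DOCS_PG_AUTODOC_INLINE = BashOperator(
    task_id='docs_pg_autodoc_inline',
    bash_command=(
        'mkdir -p /data/docs && cd /data/docs && '
        'postgresql_autodoc -d flashcrow -t html'),
    dag=DAG)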
Example #10
of the traffic study, so that we can get all six relevant counts into the database.

Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no
reliable way to visualize that much data at once.

This is intended to run after `arteries_geocoding`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2020, 5, 6)
SCHEDULE_INTERVAL = '30 7 * * 6'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

A1_ARTERIES_DOUBLE_LINK_PAIRS = create_bash_task_nested(
    DAG, 'A1_arteries_double_link_pairs')
A1_ARTERIES_MIDBLOCK_SOLO = create_bash_task_nested(
    DAG, 'A1_arteries_midblock_solo')
A2_ARTERIES_GROUPS_PRE = create_bash_task_nested(DAG, 'A2_arteries_groups_pre')
A3_ARTERIES_GROUPS_RANKED = create_bash_task_nested(
    DAG, 'A3_arteries_groups_ranked')
A4_ARTERIES_GROUPS_POST = create_bash_task_nested(DAG,
                                                  'A4_arteries_groups_post')

A1_ARTERIES_DOUBLE_LINK_PAIRS >> A2_ARTERIES_GROUPS_PRE
A1_ARTERIES_MIDBLOCK_SOLO >> A2_ARTERIES_GROUPS_PRE
A2_ARTERIES_GROUPS_PRE >> A3_ARTERIES_GROUPS_RANKED
A3_ARTERIES_GROUPS_RANKED >> A4_ARTERIES_GROUPS_POST
Example #11
"""
location_search_index

Builds the views and indexes that support location search, and also builds an index of midblock
names.

This is intended to run after `centreline_conflation_target` and `copy_gis_layers`.
"""
# pylint: disable=pointless-statement
from datetime import datetime

from airflow_utils import create_dag, create_bash_task_nested

START_DATE = datetime(2020, 4, 5)
SCHEDULE_INTERVAL = '30 6 * * 6'
DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL)

TRANSFORM_CENTRELINE_INDEX = create_bash_task_nested(DAG, 'transform_centreline_index')
TRANSFORM_INTERSECTIONS_INDEX = create_bash_task_nested(DAG, 'transform_intersections_index')
TRANSFORM_TRAFFIC_SIGNAL = create_bash_task_nested(DAG, 'transform_traffic_signal')

TRANSFORM_CENTRELINE_INDEX >> TRANSFORM_INTERSECTIONS_INDEX