Example #1
        dest_s3_key='budget/'+f,
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)


    pos = [i for i, e in enumerate(categories) if e == cat]

    if cat == "budget_reference":
        ds_task = get_refs
        md = 'budget-reference-'+f.split('_')[2]

        #: Update portal modified date
        update_budget_md = get_seaboard_update_dag(md+'.md', dag)

        #: update md task must run after the upload task
        update_budget_md.set_upstream(upload_task)

    else:
        if cat == "budget_capital":
            ds_task = make_capital
            if 'ptd' in f:
                md = 'capital-budget-ptd'
            else:
                md = 'capital-budget-fy'
        elif cat == "budget_operating":
            ds_task = make_operating
            md = 'operating-budget'
        elif cat == "actuals_capital":
Example #2
def create_sde_tasks(dag,
                     folder,
                     layer,
                     datasd_name,
                     md,
                     path_to_file,
                     sde_to_shp):
    """Dynamically create SDE Airflow tasks.

    dag: DAG defined in _dags file.
    folder: subfolder in the sde folder on S3.
    layer: layer name.
    datasd_name: layer name + _datasd.
    md: name of md file on Seaboard.
    path_to_file: poseidon path + datasd_name.
    sde_to_shp: _jobs-specific sde_to_shp function.
    """
    #: Latest Only Operator for sde layer
    sde_latest_only = LatestOnlyOperator(task_id='{layer}_latest_only'
                                         .format(layer=layer),
                                         dag=dag)

    #: Convert sde table to shapefile format
    to_shp = PythonOperator(
        task_id='{layer}_to_shp'.format(layer=layer),
        python_callable=sde_to_shp,
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Convert shapefile to GeoJSON format
    to_geojson = BashOperator(
        task_id='{layer}_to_geojson'.format(layer=layer),
        bash_command=shp_to_geojson(path_to_file),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Convert shapefile to TopoJSON format
    to_topojson = BashOperator(
        task_id='{layer}_to_topojson'.format(layer=layer),
        bash_command=shp_to_topojson(path_to_file),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Compress shapefile components
    to_zip = PythonOperator(
        task_id='{layer}_shp_to_zip'.format(layer=layer),
        python_callable=shp_to_zip,
        op_kwargs={'datasd_name': datasd_name},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Upload shapefile to S3
    shp_to_S3 = S3FileTransferOperator(
        task_id='{layer}_shp_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.zip'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.zip'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload geojson to S3
    geojson_to_S3 = S3FileTransferOperator(
        task_id='{layer}_geojson_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.geojson'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.geojson'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload topojson to S3
    topojson_to_S3 = S3FileTransferOperator(
        task_id='{layer}_topojson_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.topojson'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.topojson'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Update portal modified date
    update_md = get_seaboard_update_dag('{md}.md'.format(md=md), dag)

    if layer not in no_pbf:
        #: Convert GeoJSON to Geobuf format
        to_geobuf = PythonOperator(
            task_id='{layer}_to_geobuf'.format(layer=layer),
            python_callable=geojson_to_geobuf,
            op_kwargs={'path_to_file': path_to_file},
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            dag=dag)

        #: Convert geobuf to gzipped geobuf
        to_gzip = PythonOperator(
            task_id='{layer}_geobuf_to_gzip'.format(layer=layer),
            python_callable=geobuf_to_gzip,
            op_kwargs={'datasd_name': datasd_name},
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            dag=dag)

        #: Upload geobuf to S3
        geobuf_to_S3 = S3FileTransferOperator(
            task_id='{layer}_geobuf_to_S3'.format(layer=layer),
            source_base_path=conf['prod_data_dir'],
            source_key='{datasd_name}.pbf'.format(datasd_name=datasd_name),
            dest_s3_conn_id=conf['default_s3_conn_id'],
            dest_s3_bucket=conf['dest_s3_bucket'],
            dest_s3_key='sde/{folder}/{datasd_name}.pbf'
                        .format(folder=folder, datasd_name=datasd_name),
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            replace=True,
            use_gzip=True,
            dag=dag)

        #: Conversion to geobuf is triggered after conversion to geojson.
        to_geobuf.set_upstream(to_geojson)

        #: Compression to gzip is triggered after conversion to geobuf.
        to_gzip.set_upstream(to_geobuf)

        #: geobuf upload to S3 is triggered after compression to gzipped geobuf.
        geobuf_to_S3.set_upstream(to_gzip)

        #: Github update depends on geobuf S3 upload success.
        update_md.set_upstream(geobuf_to_S3)

    #: Execution rules:
    #: sde_latest_only must run before shp conversion.
    to_shp.set_upstream(sde_latest_only)

    #: Conversion to geojson is triggered after conversion to shp.
    to_geojson.set_upstream(to_shp)

    #: Conversion to topojson is triggered after conversion to shapefile.
    to_topojson.set_upstream(to_shp)

    #: Compression to zip is triggered after conversion to geojson and topojson.
    to_zip.set_upstream(to_geojson)
    to_zip.set_upstream(to_topojson)

    #: shapefile upload to S3 is triggered after conversion to zip.
    shp_to_S3.set_upstream(to_zip)

    #: geojson upload to S3 is triggered after conversion to geojson.
    geojson_to_S3.set_upstream(to_geojson)

    #: topojson upload to S3 is triggered after conversion to topojson.
    topojson_to_S3.set_upstream(to_topojson)

    #: Github update depends on shapefile S3 upload success.
    update_md.set_upstream(shp_to_S3)
    update_md.set_upstream(geojson_to_S3)
    update_md.set_upstream(topojson_to_S3)
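For orientation, here is a minimal sketch of how create_sde_tasks might be invoked from a _dags file. The layer name, md file, and the sde_to_shp callable below are hypothetical placeholders, not values taken from this code:

# Hypothetical invocation; the names below are illustrative only.
layer = 'parks'
datasd_name = layer + '_datasd'

create_sde_tasks(
    dag=dag,
    folder='parks',
    layer=layer,
    datasd_name=datasd_name,
    md='parks-locations',
    path_to_file=conf['prod_data_dir'] + '/' + datasd_name,
    sde_to_shp=parks_sde_to_shp)  # a _jobs-specific conversion callable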
Example #3
#: Upload prod SE file to S3
upload_special_events = S3FileTransferOperator(
    task_id='upload_special_events',
    source_base_path=conf['prod_data_dir'],
    source_key='special_events_list_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='special_events/special_events_list_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_special_events_md = get_seaboard_update_dag('special-events.md', dag)

#: Execution rules

#: se_latest_only must run before get_special_events
get_special_events.set_upstream(se_latest_only)

#: process_special_events dependent on get_special_events
process_special_events.set_upstream(get_special_events)

#: upload_special_events dependent on process_special_events
upload_special_events.set_upstream(process_special_events)

#: update github modified date after S3 upload
update_special_events_md.set_upstream(upload_special_events)
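Every operator in these snippets registers the same notify function for its failure, retry, and success callbacks. Airflow invokes a callback with the task context dict; the following is a minimal sketch of such a helper, with an assumed logging body rather than this repo's actual implementation:

def notify(context):
    # Airflow passes the task context to each callback; the task
    # instance identifies which task and which run fired the event.
    ti = context['task_instance']
    print('Task {} ({}) changed state'.format(ti.task_id, ti.execution_date))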
Example #4
#: Upload prod file to S3
cfs_to_S3 = S3FileTransferOperator(
    task_id='cfs_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_calls_for_service_'+curr_year+'_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_calls_for_service_'+curr_year+'_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cfs_md = get_seaboard_update_dag('police-calls-for-service.md', dag)

#: Execution rules:

#: pd_cfs_latest_only must run before pd_cfs_data
get_cfs_data.set_upstream(pd_cfs_latest_only)

#: Data processing is triggered after data retrieval.
process_cfs_data.set_upstream(get_cfs_data)

#: Data upload to S3 is triggered after data processing completion.
cfs_to_S3.set_upstream(process_cfs_data)

#: Github update depends on S3 upload success.
update_pd_cfs_md.set_upstream(cfs_to_S3)
Example #5
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Upload Inventory CSV to S3
upload_inventory = S3FileTransferOperator(
    task_id='upload_inventory',
    source_base_path=conf['prod_data_dir'],
    source_key='inventory_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='inventory/inventory_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_inventory_md = get_seaboard_update_dag('data-inventory.md', dag)

#: Execution Rules
#: Latest only for inventory to csv
inventory_to_csv.set_upstream(inv_latest_only)
#: Inventory csv gets created before it's uploaded
upload_inventory.set_upstream(inventory_to_csv)

#: upload_inventory must succeed before updating github
update_inventory_md.set_upstream(upload_inventory)
Example #6
#: Upload pbf (geobuf) GIS file to S3
upload_pbf_file = S3FileTransferOperator(
    task_id='sidewalks_pbf_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalks.pbf',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalks.pbf',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_gis_md = get_seaboard_update_dag('sidewalk-gis.md', dag)

#: Execution order

#: Latest only operator must run before getting sidewalk data
get_sidewalk_data.set_upstream(sidewalk_latest_only)

#: Getting sidewalk data must run before uploading
upload_oci_file.set_upstream(get_sidewalk_data)

#: sidewalk_latest_only must run before getting shapefiles
get_sw_shapefiles.set_upstream(sidewalk_latest_only)

#: get_sw_shapefiles must run before converting to geojson
sidewalks_to_geojson.set_upstream(get_sw_shapefiles)
Example #7
get_approvals_files = PythonOperator(
    task_id='get_approvals_files',
    python_callable=dfg.get_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    op_kwargs={'fname_list': fnames,
               'target_dir': dsd_temp_dir},
    dag=dag)


#: dsd_approvals_latest_only must run before get_approvals_files
get_approvals_files.set_upstream(dsd_approvals_latest_only)

#: update github modified date (solar permits)
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)

for key in app.approval_dict:

    #: Consolidate weekly permitting data by scraping OpenDSD API
    scrape_dsd = PythonOperator(
        task_id='scrape_dsd_' + key,
        python_callable=app.scrape_dsd,
        op_kwargs={'key': key},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Add consolidated weekly data to current prod data
    update_dsd = PythonOperator(
Example #8
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'civic_art_collection'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_public_art_md = get_seaboard_update_dag('public-art.md', dag)

#: Execution rules
#: public_art_latest_only must run before get_public_art
get_public_art.set_upstream(public_art_latest_only)
#: get_public_art must run before process_public_art
process_public_art.set_upstream(get_public_art)
#: process_public_art must run before file upload
upload_public_art.set_upstream(process_public_art)
#: upload_public_art must succeed before updating github
update_public_art_md.set_upstream(upload_public_art)
#: upload data must succeed before updating json
update_json_date.set_upstream(upload_public_art)
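update_json_date is used as a python_callable with provide_context=True, so Airflow merges the template context (execution_date, ds, task_instance, and so on) into the keyword arguments next to the op_kwargs entry. The snippets then rebind the name update_json_date to the PythonOperator itself, which works because the callable reference is evaluated before the assignment. A minimal sketch of a compatible callable (only the signature is implied by these operators; the body is an assumption):

def update_json_date(ds_fname, **context):
    # ds_fname arrives via op_kwargs; the remaining keyword arguments
    # come from the Airflow context because provide_context=True.
    exec_date = context['execution_date']
    print('Updating inventory json entry {} as of {}'.format(ds_fname, exec_date))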
Example #9
    on_success_callback=notify,
    replace=True,
    dag=dag)

update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'special_events'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_special_events_md = get_seaboard_update_dag('special-events.md', dag)

#: Execution rules

#: se_latest_only must run before get_special_events
get_special_events.set_upstream(se_latest_only)

#: process_special_events dependent on get_special_events
process_special_events.set_upstream(get_special_events)

#: addresses_to_S3 dependent on process_special_events
addresses_to_S3.set_upstream(process_special_events)

#: upload_special_events dependent on process_special_events
upload_special_events.set_upstream(process_special_events)
Example #10
#: Upload pbf (geobuf) GIS file to S3
upload_pbf_file = S3FileTransferOperator(
    task_id='tree_canopy_pbf_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='tree_canopy_datasd.pbf',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='sde/tree_canopy_datasd.pbf',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_gis_md = get_seaboard_update_dag('tree-canopy-2014.md', dag)

#: Execution order

#: Latest only operator must run before getting tree canopy data
get_shapefiles.set_upstream(treecan_latest_only)

#: get_shapefiles must run before converting to geojson
shp_to_geojson.set_upstream(get_shapefiles)

#: to_geojson must run before converting to geobuf
geojson_to_geobuf.set_upstream(shp_to_geojson)

#: to_geobuf must run before zipping geobuf
geobuf_zip.set_upstream(geojson_to_geobuf)
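The geojson-to-geobuf and gzip steps appear here and in Example #2 only as task or callable names. Below is a rough sketch of what the underlying helpers could look like using the geobuf package; the signatures mirror the op_kwargs in Example #2, but the bodies are assumptions:

import gzip
import json

import geobuf  # pygeobuf: encodes GeoJSON into a compact protobuf


def geojson_to_geobuf(path_to_file):
    # Encode the GeoJSON document as Geobuf (.pbf).
    with open(path_to_file + '.geojson') as f:
        pbf = geobuf.encode(json.load(f))
    with open(path_to_file + '.pbf', 'wb') as f:
        f.write(pbf)


def geobuf_to_gzip(datasd_name):
    # Gzip the .pbf in place so the S3 upload can send it with
    # use_gzip=True while keeping the .pbf key name.
    path = conf['prod_data_dir'] + '/' + datasd_name + '.pbf'
    with open(path, 'rb') as src:
        data = src.read()
    with gzip.open(path, 'wb') as dst:
        dst.write(data)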
Example #11
for index, dataset in enumerate(datasets):
    update_date_mod_json = PythonOperator(
        task_id=f"update_json_date_{dataset}",
        python_callable=update_json_date,
        provide_context=True,
        op_kwargs={'ds_fname': dataset},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: upload data must succeed before updating json
    update_date_mod_json.set_upstream(s3_uploaders[index])

#: Update leases portal modified date
update_leases_md = get_seaboard_update_dag('city-owned-properties-leases.md',
                                           dag)

#: Update details portal modified date
update_details_md = get_seaboard_update_dag('city-owned-properties-details.md',
                                            dag)

#: Update parcels portal modified date
update_parcels_md = get_seaboard_update_dag('city-owned-properties-parcels.md',
                                            dag)

#: Execution Rules

#: read_latest_only must run before get_billing
get_billing.set_upstream(read_latest_only)

#: read_latest_only must run before get_leases
get_leases.set_upstream(read_latest_only)
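A note on the indexed loop at the top of this example: pairing each update task with its uploader through enumerate silently depends on datasets and s3_uploaders staying index-aligned. A sketch of equivalent wiring with zip, which makes that coupling explicit:

for dataset, uploader in zip(datasets, s3_uploaders):
    update_date_mod_json = PythonOperator(
        task_id=f"update_json_date_{dataset}",
        python_callable=update_json_date,
        provide_context=True,
        op_kwargs={'ds_fname': dataset},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: upload data must succeed before updating json
    update_date_mod_json.set_upstream(uploader)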
Example #12
#: Uploads the generated agg file
upload_by_day_agg = S3FileTransferOperator(
    task_id='upload_by_day_agg',
    source_base_path=conf['prod_data_dir'],
    source_key=flist['by_day'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='parking_meters/' + flist['by_day'],
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking-meters-transactions.md', dag)

#: Execution Rules

#: parking_meters_latest_only must run before get_parking_files
get_parking_files.set_upstream(parking_meters_latest_only)

#: Download Files, build prod file.

#: build_prod_file depends on get_parking_files:
build_prod_file.set_upstream(get_parking_files)

#: Upload Prod File

#: upload_prod_file depends on build_prod_file
upload_prod_file.set_upstream(build_prod_file)
Example #13
        task_id='create_sdif_{}_miles_paved_sonar'.format(i),
        range_id='days_30',
        value_key='sdif_{}_miles'.format(i),
        value_desc='Miles Paved {}'.format(i),
        python_callable=build_sonar_miles_aggs,
        op_kwargs={'mode': 'sdif',
                   'pav_type': i},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Depends on successful run of get_streets_data
    sonar_task.set_upstream(get_streets_data)



#: Update portal modified date
update_streets_md = get_seaboard_update_dag('streets-repair-projects.md', dag)

#: Execution order

#: streets_latest_only must run before get_streets_data
get_streets_data.set_upstream(streets_latest_only)

#: upload_streets_data is dependent on successful run of get_streets_data
upload_streets_data.set_upstream(get_streets_data)

#: Update md file after S3 upload
update_streets_md.set_upstream(upload_streets_data)
Example #14
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Create subsets
create_subsets = PythonOperator(
    task_id='create_subsets',
    python_callable=make_prod_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_ttcs_md = get_seaboard_update_dag('business-listings.md', dag)

#: Execution Rules

#: ttcs_latest_only must run before get_active
get_active_businesses.set_upstream(ttcs_latest_only)
#: Data cleaning occurs after active businesses retrieval.
clean_data.set_upstream(get_active_businesses)
#: Geocoding occurs after data cleaning.
geocode_data.set_upstream(clean_data)
#: spatial join occurs after geocoding.
join_bids.set_upstream(geocode_data)
#: last 3mo subsetting occurs after spatial join
create_subsets.set_upstream(join_bids)

subset_names = [os.path.basename(x) for x in glob.glob(conf['prod_data_dir']+'/sd_businesses_*.csv')]
Example #15
upload_latest_indicator_bac_tests = S3FileTransferOperator(
    task_id='upload_latest_indicator_bac_tests',
    source_base_path=conf['prod_data_dir'],
    source_key='latest_indicator_bac_tests_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='water_testing/latest_indicator_bac_tests_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_water_md = get_seaboard_update_dag(
    'monitoring-of-indicator-bacteria-in-drinking-water.md',
    dag)

#: Execution Rules

#: water_latest_only must run before get_indicator_bac_tests
get_indicator_bac_tests.set_upstream(wtr_latest_only)
#: Upload indicator bac tests after it has successfully run
upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests)

#: get_last_bac_tests is dependent on get_indicator_bac_tests
get_latest_bac_tests.set_upstream(get_indicator_bac_tests)

#: Upload latest indicator bac tests after the file has been generated
upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests)
Example #16
upload_prod_file = S3FileTransferOperator(
    task_id='upload_meter_locs',
    source_base_path=conf['prod_data_dir'],
    source_key='treas_parking_meters_loc_datasd_v1.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='parking_meters/treas_parking_meters_loc_datasd_v1.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'parking_meters_locations'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking_meters_locations.md', dag)

#: Execution Rules

get_parking_files >> build_prod_file >> clean_daily_files >> upload_prod_file >> [update_parking_trans_md, update_json_date]
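This example declares its dependencies with the bitshift operators instead of set_upstream; the notations are interchangeable, and a list on the right-hand side fans out to several downstream tasks. All of the following express the same edge:

# Three equivalent ways to declare get_parking_files -> build_prod_file:
build_prod_file.set_upstream(get_parking_files)
get_parking_files.set_downstream(build_prod_file)
get_parking_files >> build_prod_file

# A list fans out: both updates run only after the upload succeeds.
upload_prod_file >> [update_parking_trans_md, update_json_date]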
Example #17
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod art file to S3
upload_public_art = S3FileTransferOperator(
    task_id='upload_public_art',
    source_base_path=conf['prod_data_dir'],
    source_key='public_art_locations_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='public_art/public_art_locations_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_public_art_md = get_seaboard_update_dag('public-art.md', dag)


#: Execution rules
#: public_art_latest_only must run before get_public_art
get_public_art.set_upstream(public_art_latest_only)
#: get_public_art must run before file upload
upload_public_art.set_upstream(get_public_art)
#: upload_public_art must succeed before updating github
update_public_art_md.set_upstream(upload_public_art)
Example #18
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

update_json_date = PythonOperator(task_id='update_json_date',
                                  python_callable=update_json_date,
                                  provide_context=True,
                                  op_kwargs={'ds_fname': 'traffic_volumes'},
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Update portal modified date
update_traffic_md = get_seaboard_update_dag('traffic-volumes.md', dag)

#: Execution Rules

#: traffic_counts_latest_only must run before get_traffic_counts
get_traffic_counts.set_upstream(tc_latest_only)
#: Cleaning task triggered after data retrieval.
clean_traffic_counts.set_upstream(get_traffic_counts)
#: Production build task triggered after cleaning task.
build_traffic_counts.set_upstream(clean_traffic_counts)
#: Data upload to S3 triggered after production build task.
upload_traffic_counts.set_upstream(build_traffic_counts)
#: Update .md file after S3 upload
update_traffic_md.set_upstream(upload_traffic_counts)
#: upload data must succeed before updating json
update_json_date.set_upstream(upload_traffic_counts)
Example #19
    on_success_callback=notify,
    replace=True,
    dag=dag)

update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'indicator_bacteria_monitoring'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_water_md = get_seaboard_update_dag(
    'monitoring-of-indicator-bacteria-in-drinking-water.md', dag)

#: Execution Rules

#: water_latest_only must run before get_indicator_bac_tests
get_indicator_bac_tests.set_upstream(wtr_latest_only)
#: Upload indicator bac tests after it has successfully run
upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests)

#: get_last_bac_tests is dependent on get_indicator_bac_tests
get_latest_bac_tests.set_upstream(get_indicator_bac_tests)

#: Upload latest indicator bac tests after the file has been generated
upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests)

#: update .md file after S3 upload
Example #20
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(task_id='update_json_date',
                                  python_callable=update_json_date,
                                  provide_context=True,
                                  op_kwargs={'ds_fname': 'traffic_collisions'},
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Update portal modified date
update_pd_cls_md = get_seaboard_update_dag('police-collisions.md', dag)

#: Execution rules:

#: pd_col_latest_only must run before get_collisions_data
get_collisions_data.set_upstream(pd_col_latest_only)

#: Data processing is triggered after data retrieval.
process_collisions_data.set_upstream(get_collisions_data)

#: Data upload to S3 is triggered after data processing completion.
collisions_to_S3.set_upstream(process_collisions_data)

#: Github update depends on S3 upload success.
update_pd_cls_md.set_upstream(collisions_to_S3)
Example #21
upload_dsd_permits = S3FileTransferOperator(
    task_id='upload_dsd_permits',
    source_base_path=conf['prod_data_dir'],
    source_key='dsd_permits_{}_datasd_v1.csv'.format(year),
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='dsd/dsd_permits_{}_datasd_v1.csv'.format(year),
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)


#: update permits.md file
update_permits_md = get_seaboard_update_dag('permits-dsd.md', dag)


#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

#: clean_data tasks are executed after get_permits_files task
clean_data.set_upstream(get_permits_files)

#: join_bids tasks are executed after clean_data tasks
join_bids.set_upstream(clean_data)

#: upload_dsd tasks are executed after join bids tasks
upload_dsd_permits.set_upstream(join_bids)
Example #22
upload_solar_permits = S3FileTransferOperator(
    task_id='upload_solar_permits',
    source_base_path=conf['prod_data_dir'],
    source_key='solar_permits_{}_datasd.csv'.format(year),
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='dsd/solar_permits_{}_datasd.csv'.format(year),
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)



#: update permits.md file
update_permits_md = get_seaboard_update_dag('permits.md', dag)

#: update permits.md file
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)


#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

#: clean_data tasks are executed after get_permits_files task
clean_data.set_upstream(get_permits_files)

#: join_bids tasks are executed after clean_data tasks
join_bids.set_upstream(clean_data)
Example #23
#: Upload prod file to S3
collisions_to_S3 = S3FileTransferOperator(
    task_id='collisions_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_collisions_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_collisions_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cls_md = get_seaboard_update_dag('police-collisions.md', dag)

#: Execution rules:

#: pd_col_latest_only must run before get_collisions_data
get_collisions_data.set_upstream(pd_col_latest_only)

#: Data processing is triggered after data retrieval.
process_collisions_data.set_upstream(get_collisions_data)

#: Data upload to S3 is triggered after data processing completion.
collisions_to_S3.set_upstream(process_collisions_data)

#: Github update depends on S3 upload success.
update_pd_cls_md.set_upstream(collisions_to_S3)
Example #24
dsd_ce_latest_only = LatestOnlyOperator(
    task_id='dsd_code_enf_latest_only', dag=dag)


#: Download code enforcement files and unzip them.
get_code_enf_files = PythonOperator(
    task_id='get_code_enf_files',
    python_callable=dfg.get_files,
    op_kwargs={'fname_list': fname_list,
               'target_dir': dsd_temp_dir},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_code_enf_md = get_seaboard_update_dag('code-enforcement-violations.md', dag)

#: Execution rules
#: dsd_code_enf_latest_only must run before get_code_enf_files
get_code_enf_files.set_upstream(dsd_ce_latest_only)


for i in fname_list:
    #: Create fme shell command
    build_csv_task = BashOperator(
        task_id='get_' + i,
        bash_command=get_bash_command(i),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)
Example #25
                           python_callable=join_bids,
                           on_failure_callback=notify,
                           on_retry_callback=notify,
                           on_success_callback=notify,
                           dag=dag)

#: Create subsets
create_subsets = PythonOperator(task_id='create_subsets',
                                python_callable=make_prod_files,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

#: Update portal modified date
update_ttcs_md = get_seaboard_update_dag('business-listings.md', dag)

#: Execution Rules

#: ttcs_latest_only must run before get_active
get_active_businesses.set_upstream(ttcs_latest_only)
#: Data cleaning occurs after active businesses retrieval.
clean_data.set_upstream(get_active_businesses)
#: Geocoding occurs after data cleaning.
geocode_data.set_upstream(clean_data)
#: Address book is uploaded after geocoding occurs
addresses_to_S3.set_upstream(geocode_data)
#: Spatial join occurs after geocoding.
join_bids.set_upstream(geocode_data)
#: Subsetting occurs after spatial join
create_subsets.set_upstream(join_bids)
Example #26
#: Upload prod file to S3
hc_to_S3 = S3FileTransferOperator(task_id='prod_file_to_S3',
                                  source_base_path=conf['prod_data_dir'],
                                  source_key='hate_crimes_datasd.csv',
                                  dest_s3_bucket=conf['dest_s3_bucket'],
                                  dest_s3_conn_id=conf['default_s3_conn_id'],
                                  dest_s3_key='pd/hate_crimes_datasd.csv',
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Update data inventory json
update_hc_date = PythonOperator(task_id='update_json_date',
                                python_callable=update_json_date,
                                provide_context=True,
                                op_kwargs={'ds_fname': 'hate_crimes'},
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

#: Update portal modified date
update_pd_hc_md = get_seaboard_update_dag('police-hate-crimes.md', dag)

#: Execution rules:

pd_hc_latest_only >> get_hc_data >> process_hc_data >> hc_to_S3 >> [
    update_hc_date, update_pd_hc_md
]
Example #27
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'parking_meters_transactions'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking-meters-transactions.md', dag)

#: Execution Rules

#: parking_meters_latest_only must run before get_parking_files
get_parking_files.set_upstream(parking_meters_latest_only)

#: Download Files, build prod file.

#: build_prod_file depends on get_parking_files:
build_prod_file.set_upstream(get_parking_files)

#: Upload Prod File

#: upload_prod_file depends on build_prod_file
upload_prod_file.set_upstream(build_prod_file)
Example #28
#: Uploads the generated production file
upload_traffic_counts = S3FileTransferOperator(
    task_id='upload_traffic_counts',
    source_base_path=conf['prod_data_dir'],
    source_key='traffic_counts_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='traffic_counts/traffic_counts_datasd.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_traffic_md = get_seaboard_update_dag('traffic-volumes.md', dag)

#: Execution Rules

#: traffic_counts_latest_only must run before get_traffic_counts
get_traffic_counts.set_upstream(tc_latest_only)
#: Cleaning task triggered after data retrieval.
clean_traffic_counts.set_upstream(get_traffic_counts)
#: Production build task triggered after cleaning task.
build_traffic_counts.set_upstream(clean_traffic_counts)
#: Data upload to S3 triggered after production build task.
upload_traffic_counts.set_upstream(build_traffic_counts)
#: Update .md file after S3 upload
update_traffic_md.set_upstream(upload_traffic_counts)
Example #29
#: Upload OCI file to S3
upload_oci_file = S3FileTransferOperator(
    task_id='upload_oci',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalk_cond_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalk_cond_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_oci_md = get_seaboard_update_dag('sidewalk-oci.md', dag)

#: Upload shp GIS file to S3
upload_shp_file = S3FileTransferOperator(
    task_id='sidewalks_shp_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalks.zip',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalks.zip',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)