#: Clean and geocode claims data
clean_geocode = PythonOperator(task_id='clean_geocode_claims',
                               python_callable=clean_geocode_claims,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
                               dag=dag)
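Every task in these examples wires the same notify function into its failure, retry, and success callbacks. The function itself is not shown in any of the snippets; the sketch below only illustrates the interface Airflow expects (a single callable receiving the task context dict), and the message handling is an assumption, not the project's actual implementation.

# Minimal sketch of a shared callback; Airflow passes a context dict to
# on_failure_callback / on_retry_callback / on_success_callback.
# Where the message goes is purely illustrative.
def notify(context):
    ti = context['task_instance']
    message = 'Task {} finished with state {}'.format(ti.task_id, ti.state)
    # Forward the message to the team's alerting channel (Slack, email, etc.).
    print(message)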

#: Upload prod claims file to S3
upload_claimstat_clean = S3FileTransferOperator(
    task_id='upload_claimstat_clean',
    source_base_path=conf['prod_data_dir'],
    source_key='claim_stat_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='risk/claims_clean_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)
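All of these operators read paths, connection ids, and bucket names from a shared conf mapping that the snippets never define. The dict below is a hypothetical illustration of the keys the examples rely on; the values are placeholders, not real configuration.

# Hypothetical configuration; keys mirror the lookups used above, values are made up.
conf = {
    'prod_data_dir': '/data/prod',         # production output files
    'temp_data_dir': '/data/temp',         # intermediate/scratch files
    'default_s3_conn_id': 's3_default',    # Airflow connection id for S3
    'dest_s3_bucket': 'datasd-prod',       # public destination bucket
    'ref_s3_bucket': 'datasd-reference',   # internal reference-data bucket
}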

upload_addresses_to_S3 = S3FileTransferOperator(
    task_id='upload_claims_address_book',
    source_base_path=conf['temp_data_dir'],
    source_key='claims_address_book.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['ref_s3_bucket'],
    dest_s3_key='claims_address_book.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

Example #2
#: Inventory Doc To CSV
inventory_to_csv = PythonOperator(task_id='inventory_to_csv',
                                  python_callable=inventory_to_csv,
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Upload Inventory CSV to S3
upload_inventory = S3FileTransferOperator(
    task_id='upload_inventory',
    source_base_path=conf['prod_data_dir'],
    source_key='inventory_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='inventory/inventory_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_inventory_md = get_seaboard_update_dag('data-inventory.md', dag)

#: Execution Rules
#: Latest only for inventory to csv
inventory_to_csv.set_upstream(inv_latest_only)
#: Inventory csv gets created before it's uploaded
upload_inventory.set_upstream(inventory_to_csv)
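The same execution rules can be written with Airflow's bitshift operators, which chain tasks left to right and are equivalent to the set_upstream calls above:

#: Equivalent dependency chain using the >> operator
inv_latest_only >> inventory_to_csv >> upload_inventory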

Example #3
#: Get fire_department data from DB
get_fd_data = PythonOperator(task_id='get_fd_data',
                             python_callable=get_fd_data,
                             provide_context=True,
                             on_failure_callback=notify,
                             on_retry_callback=notify,
                             on_success_callback=notify,
                             dag=dag)

#: Upload prod fire_department_SD.csv file to S3
upload_fd_data = S3FileTransferOperator(
    task_id='upload_fd_data',
    source_base_path=conf['prod_data_dir'],
    source_key='fd_incidents_{0}_datasd_v1.csv'.format(cur_yr),
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='fd_cad/fd_incidents_{0}_datasd_v1.csv'.format(cur_yr),
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(task_id='update_json_date',
                                  python_callable=update_json_date,
                                  provide_context=True,
                                  op_kwargs={'ds_fname': 'fire_ems_incidents'},
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)
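The update_json_date callable is referenced throughout these examples but never defined in the snippets. With provide_context=True, Airflow passes the task context as keyword arguments alongside op_kwargs, so the callable only needs to accept ds_fname plus **kwargs. A minimal sketch follows, assuming the inventory is a JSON file keyed by dataset name; the path and schema are assumptions.

import json
from datetime import datetime

def update_json_date(ds_fname, **kwargs):
    # Stamp today's date on one dataset entry; file location and layout are assumed.
    inventory_path = conf['prod_data_dir'] + '/data_inventory.json'
    with open(inventory_path) as f:
        inventory = json.load(f)
    inventory[ds_fname]['date_modified'] = datetime.now().strftime('%Y-%m-%d')
    with open(inventory_path, 'w') as f:
        json.dump(inventory, f, indent=2)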
#: Combine all transactions
combine_schedules = PythonOperator(task_id='combine_all_schedules',
                                   python_callable=combine_all_schedules,
                                   on_failure_callback=notify,
                                   on_retry_callback=notify,
                                   on_success_callback=notify,
                                   dag=dag)

#: Upload prod transactions file to S3
upload_fin_support = S3FileTransferOperator(
    task_id='upload_financial_support',
    source_base_path=conf['prod_data_dir'],
    source_key='financial_support_' + str(cur_yr) + '_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='campaign_fin/financial_support_' + str(cur_yr) +
    '_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'financial_trans_election_comms'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

Example #5
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

event_files = ["pkin", "pkout"]

for file in event_files:
    file_time = datetime.now().strftime('%Y_%m_%d_')
    file_name = f'{file_time}{file}.json'
    # Create a separate upload task per event file (see the worked example after this loop)
    s3_upload = S3FileTransferOperator(
        task_id=f'upload_{file}',
        source_base_path=conf['prod_data_dir'],
        source_key=file_name,
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key=f'cityiq/{file_name}',
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload after getting events
    s3_upload.set_upstream(get_parking_bbox)
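For a run on 2019-08-05 (a date chosen purely for illustration), the loop above produces two upload tasks:

# upload_pkin:  source_key '2019_08_05_pkin.json'  -> dest_s3_key 'cityiq/2019_08_05_pkin.json'
# upload_pkout: source_key '2019_08_05_pkout.json' -> dest_s3_key 'cityiq/2019_08_05_pkout.json'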
    

#: Execution Rules

#: Must get token after latest only operator
get_token_response.set_upstream(cityiq_latest_only)
#: Get events after getting token
    task_id='process_imcat',
    python_callable=process_paving_data,
    op_kwargs={'mode': 'imcat'},
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload imcat streets file to S3
upload_imcat_data = S3FileTransferOperator(
    task_id='upload_streets_data_imcat',
    source_base_path=conf['prod_data_dir'],
    source_key='sd_paving_imcat_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sd_paving_imcat_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Upload sdif streets file to S3
upload_sdif_data = S3FileTransferOperator(
    task_id='upload_streets_data_sdif',
    source_base_path=conf['prod_data_dir'],
    source_key='sd_paving_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sd_paving_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

Example #7
prod_files = budget_files + actuals_files
categories = ['_'.join(x.split('_')[0:2]) for x in prod_files]
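As an illustration (the file names here are hypothetical), the comprehension keeps the first two underscore-separated tokens of each production file name, so files sharing a prefix fall into the same category:

# 'budget_reference_depts_datasd.csv' -> category 'budget_reference'
# 'actuals_operating_2019_datasd.csv' -> category 'actuals_operating'
# The task_id in the loop below becomes 'upload_' plus the file name minus its
# 11-character '_datasd.csv' suffix, e.g. 'upload_budget_reference_depts'.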

for index, f in enumerate(prod_files):

    cat = categories[index]

    #: Upload budget files to S3
    upload_task = S3FileTransferOperator(
        task_id='upload_'+f[0:-11],
        source_base_path=conf['prod_data_dir'],
        source_key=f,
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='budget/'+f,
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)


    pos = [i for i, e in enumerate(categories) if e == cat]

    if cat == "budget_reference":
        ds_task = get_refs
        md = 'budget-reference-'+f.split('_')[2]

        #: Update portal modified date
        update_budget_md = get_seaboard_update_dag(md+'.md', dag)
#: Join BIDs to permits
join_bids = PythonOperator(
    task_id='join_bids',
    python_callable=join_bids,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload data to S3
upload_dsd_permits = S3FileTransferOperator(
    task_id='upload_dsd_permits',
    source_base_path=conf['prod_data_dir'],
    source_key='dsd_permits_{}_datasd_v1.csv'.format(year),
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='dsd/dsd_permits_{}_datasd_v1.csv'.format(year),
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)


#: update permits.md file
update_permits_md = get_seaboard_update_dag('permits-dsd.md', dag)


#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

Example #9
                            dag=dag)

#: Geocode new entries and update production file
geocode_data = PythonOperator(task_id='geocode_data',
                              python_callable=geocode_data,
                              on_failure_callback=notify,
                              on_retry_callback=notify,
                              on_success_callback=notify,
                              dag=dag)

addresses_to_S3 = S3FileTransferOperator(
    task_id='upload_address_book',
    source_base_path=conf['prod_data_dir'],
    source_key='ttcs_address_book.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['ref_s3_bucket'],
    dest_s3_key='ttcs_address_book.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Spatially join BIDs data
join_bids = PythonOperator(task_id='join_bids',
                           python_callable=join_bids,
                           on_failure_callback=notify,
                           on_retry_callback=notify,
                           on_success_callback=notify,
                           dag=dag)

#: Create subsets

Example #10
#: Compress shapefile components into a zip archive
shape_zip = PythonOperator(task_id='shape_to_zip',
                           python_callable=shp_to_zip,
                           on_failure_callback=notify,
                           on_retry_callback=notify,
                           on_success_callback=notify,
                           dag=dag)

#: Upload shp GIS file to S3
upload_shp_file = S3FileTransferOperator(
    task_id='tree_canopy_shp_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='tree_canopy_datasd.zip',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='sde/tree_canopy_datasd.zip',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Upload geojson GIS file to S3
upload_geojson_file = S3FileTransferOperator(
    task_id='tree_canopy_geojson_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='tree_canopy_datasd.geojson',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='sde/tree_canopy_datasd.geojson',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

Example #11
# Get the latest bacteria tests for any given point.
get_latest_bac_tests = PythonOperator(task_id='get_latest_bac_tests',
                                      python_callable=get_latest_bac_tests,
                                      on_failure_callback=notify,
                                      on_retry_callback=notify,
                                      on_success_callback=notify,
                                      dag=dag)

# Uploads the full indicator bacteria test results.
upload_indicator_bac_tests = S3FileTransferOperator(
    task_id='upload_indicator_bac_tests',
    source_base_path=conf['prod_data_dir'],
    source_key='indicator_bacteria_tests_datasd_v1.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='water_testing/indicator_bacteria_tests_datasd_v1.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

# Uploads the latest indicator bacteria tests.
upload_latest_indicator_bac_tests = S3FileTransferOperator(
    task_id='upload_latest_indicator_bac_tests',
    source_base_path=conf['prod_data_dir'],
    source_key='latest_indicator_bac_tests_datasd_v1.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='water_testing/latest_indicator_bac_tests_datasd_v1.csv',
    on_failure_callback=notify,

Example #12
#: Process and geocode raw special events file
process_special_events = PythonOperator(
    task_id='process_special_events',
    python_callable=process_special_events,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload events address book to S3
addresses_to_S3 = S3FileTransferOperator(
    task_id='upload_address_book',
    source_base_path=conf['prod_data_dir'],
    source_key='events_address_book.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['ref_s3_bucket'],
    dest_s3_key='events_address_book.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Upload prod SE file to S3
upload_special_events_web = S3FileTransferOperator(
    task_id='upload_special_events_web',
    source_base_path=conf['prod_data_dir'],
    source_key='special_events_list_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='special_events/special_events_list_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

Example #13
    name_parts = file_name.split('_')

    if 'v1' in name_parts:
        name_parts.remove('datasd')
        name_parts.remove('v1')
        task_name = '_'.join(name_parts[3:-1])
        md_name = '-'.join(name_parts[3:-1])

        #: Upload prod gid file to S3
        upload_task = S3FileTransferOperator(
            task_id='upload_' + task_name,
            source_base_path=conf['prod_data_dir'],
            source_key='get_it_done_{}_requests_datasd_v1.csv'.format(
                task_name),
            dest_s3_conn_id=conf['default_s3_conn_id'],
            dest_s3_bucket=conf['dest_s3_bucket'],
            dest_s3_key='get_it_done_311/get_it_done_{}_requests_datasd_v1.csv'
            .format(task_name),
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            replace=True,
            dag=dag)

        if task_name in services:
            for service_index, service in enumerate(services):
                if task_name == service:
                    #: Github .md update
                    service_update_task = get_seaboard_update_dag(
                        'gid-' + md_name + '.md', dag)
                    #: upload must run after the service get task
                    upload_task.set_upstream(service_tasks[service_index])
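To make the slicing concrete: for a hypothetical production file named get_it_done_graffiti_requests_datasd_v1.csv, the code above works out as follows.

# name_parts after removing 'datasd' and 'v1': ['get', 'it', 'done', 'graffiti', 'requests']
# task_name = 'graffiti', md_name = 'graffiti'
# -> task_id 'upload_graffiti', S3 key
#    'get_it_done_311/get_it_done_graffiti_requests_datasd_v1.csv',
#    and Seaboard file 'gid-graffiti.md'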

Example #14
#: Process billing data
process_billing = PythonOperator(task_id='process_billing',
                                 python_callable=process_billing,
                                 on_failure_callback=notify,
                                 on_retry_callback=notify,
                                 on_success_callback=notify,
                                 dag=dag)

#: Upload billing data to S3
billing_to_S3 = S3FileTransferOperator(
    task_id='billing_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key=datasd[0],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='read/' + datasd[0],
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Retrieve READ leases data from FTP
get_leases = BashOperator(task_id='get_leases',
                          bash_command=get_leases(),
                          on_failure_callback=notify,
                          on_retry_callback=notify,
                          on_success_callback=notify,
                          dag=dag)

#: Process leases data

Example #15
#: Compress shapefile components into a zip archive
shape_zip = PythonOperator(task_id='sidewalks_shape_to_zip',
                           python_callable=shp_to_zip,
                           on_failure_callback=notify,
                           on_retry_callback=notify,
                           on_success_callback=notify,
                           dag=dag)

#: Upload OCI file to S3
upload_oci_file = S3FileTransferOperator(
    task_id='upload_oci',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalk_cond_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalk_cond_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Upload shp GIS file to S3
upload_shp_file = S3FileTransferOperator(
    task_id='sidewalks_shp_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalks.zip',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalks.zip',
    on_failure_callback=notify,
    task_id='build_prod_file',
    python_callable=build_prod_file,
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)


#: Uploads the generated production file
upload_prod_file = S3FileTransferOperator(
    task_id='upload_parking_full',
    source_base_path=conf['prod_data_dir'],
    source_key=flist['full'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='parking_meters/' + flist['full'],
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Builds by month aggregation
build_by_month_aggregation = PythonOperator(
    task_id='build_by_month_agg',
    python_callable=build_aggregation,
    op_kwargs={'agg_type': 'pole_by_month'},
    provide_context=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

div_doc_table = PythonOperator(task_id='divide_doc_table',
                               python_callable=latest_res_ords,
                               on_failure_callback=notify,
                               on_retry_callback=notify,
                               on_success_callback=notify,
                               dag=dag)

upload_reso_ord = S3FileTransferOperator(
    task_id='upload_documentum_reso_ordinance_latest',
    source_base_path=conf['prod_data_dir'],
    source_key='documentum_scs_council_reso_ordinance_v_2016_current.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='city_docs/documentum_scs_council_reso_ordinance_v_2016_current.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Execution rules
#: documentum_docs_latest_only must run before get_doc_tables
get_doc_tables.set_upstream(documentum_docs_latest_only)
#: get_doc_tables must run before div_doc_table
div_doc_table.set_upstream(get_doc_tables)
#: div_doc_table must run before upload_reso_ord
upload_reso_ord.set_upstream(div_doc_table)
    task_id='clean_files',
    provide_context=True,
    python_callable=clean_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag
    )

#: Uploads the generated production file
upload_prod_file = S3FileTransferOperator(
    task_id='upload_meter_locs',
    source_base_path=conf['prod_data_dir'],
    source_key='treas_parking_meters_loc_datasd_v1.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='parking_meters/treas_parking_meters_loc_datasd_v1.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'parking_meters_locations'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

Example #19
                           dag=dag)

#: Process collisions data and save result to prod folder
process_hc_data = PythonOperator(task_id='process_data',
                                 python_callable=process_data,
                                 on_failure_callback=notify,
                                 on_retry_callback=notify,
                                 on_success_callback=notify,
                                 dag=dag)

#: Upload prod file to S3
hc_to_S3 = S3FileTransferOperator(task_id='prod_file_to_S3',
                                  source_base_path=conf['prod_data_dir'],
                                  source_key='hate_crimes_datasd.csv',
                                  dest_s3_bucket=conf['dest_s3_bucket'],
                                  dest_s3_conn_id=conf['default_s3_conn_id'],
                                  dest_s3_key='pd/hate_crimes_datasd.csv',
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

#: Update data inventory json
update_hc_date = PythonOperator(task_id='update_json_date',
                                python_callable=update_json_date,
                                provide_context=True,
                                op_kwargs={'ds_fname': 'hate_crimes'},
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

process_public_art = PythonOperator(task_id='process_public_art',
                                    python_callable=process_public_art,
                                    on_failure_callback=notify,
                                    on_retry_callback=notify,
                                    on_success_callback=notify,
                                    dag=dag)

#: Upload prod art file to S3
upload_public_art = S3FileTransferOperator(
    task_id='upload_public_art',
    source_base_path=conf['prod_data_dir'],
    source_key='public_art_locations_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='public_art/public_art_locations_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'civic_art_collection'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Combine violations, process, and output file
combine_sw_violations = PythonOperator(task_id='combine_sw_violations',
                                       python_callable=combine_violations,
                                       on_failure_callback=notify,
                                       on_retry_callback=notify,
                                       on_success_callback=notify,
                                       dag=dag)

#: Upload prod csv file to S3
violations_csv_to_s3 = S3FileTransferOperator(
    task_id='violations_csv_to_s3',
    source_base_path=conf['prod_data_dir'],
    source_key='stormwater_violations_merged.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw_int/stormwater_violations_merged.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Upload prod csv with null geos file to S3
violations_csv_null_geos_to_s3 = S3FileTransferOperator(
    task_id='violations_csv_w_null_geos_to_s3',
    source_base_path=conf['prod_data_dir'],
    source_key='stormwater_violations_merged_null_geos.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw_int/stormwater_violations_merged_null_geos.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Builds the prod file
build_traffic_counts = PythonOperator(task_id='build_traffic_counts',
                                      python_callable=build_traffic_counts,
                                      on_failure_callback=notify,
                                      on_retry_callback=notify,
                                      on_success_callback=notify,
                                      dag=dag)

#: Uploads the generated production file
upload_traffic_counts = S3FileTransferOperator(
    task_id='upload_traffic_counts',
    source_base_path=conf['prod_data_dir'],
    source_key='traffic_counts_datasd_v1.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='traffic_counts/traffic_counts_datasd_v1.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

update_json_date = PythonOperator(task_id='update_json_date',
                                  python_callable=update_json_date,
                                  provide_context=True,
                                  op_kwargs={'ds_fname': 'traffic_volumes'},
                                  on_failure_callback=notify,
                                  on_retry_callback=notify,
                                  on_success_callback=notify,
                                  dag=dag)

Example #23
#: Get onbase tables
get_doc_tables = PythonOperator(task_id='get_onbase_tables',
                                python_callable=get_onbase,
                                on_failure_callback=notify,
                                on_retry_callback=notify,
                                on_success_callback=notify,
                                dag=dag)

files = os.listdir(conf['prod_data_dir'])
for f in files:
    file_name = f.split('.')[0]
    name_parts = file_name.split('_')
    if name_parts[0] == "onbase":
        #: Upload onbase prod files to S3
        upload_doc_tables = S3FileTransferOperator(
            task_id='upload_' + file_name,
            source_base_path=conf['prod_data_dir'],
            source_key='{}.csv'.format(file_name),
            dest_s3_conn_id=conf['default_s3_conn_id'],
            dest_s3_bucket=conf['dest_s3_bucket'],
            dest_s3_key='city_docs/{}.csv'.format(file_name),
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            replace=True,
            dag=dag)

        #: get_doc_tables must run before upload_doc_tables
        upload_doc_tables.set_upstream(get_doc_tables)
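For example, a production file named onbase_test_scs_2018.csv (a hypothetical name) would pass the prefix check and get its own upload task:

# file_name 'onbase_test_scs_2018' -> task_id 'upload_onbase_test_scs_2018',
# uploaded to 'city_docs/onbase_test_scs_2018.csv'; files whose first token
# is not 'onbase' are skipped.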
def create_sde_tasks(dag,
                     folder,
                     layer,
                     datasd_name,
                     md,
                     path_to_file,
                     sde_to_shp):
    """Dynamically create SDE Airflow tasks.

    dag: DAG defined in _dags file.
    folder: subfolder in the sde folder on S3.
    layer: layer name.
    datasd_name: layer name + _datasd.
    md: name of md file on Seaboard.
    path_to_file: poseidon path + datasd_name.
    sde_to_shp: _jobs specific sde_to_shp function
    """
    #: Latest Only Operator for sde layer
    sde_latest_only = LatestOnlyOperator(task_id='{layer}_latest_only'
                                         .format(layer=layer),
                                         dag=dag)

    #: Convert sde table to shapefile format
    to_shp = PythonOperator(
        task_id='{layer}_to_shp'.format(layer=layer),
        python_callable=sde_to_shp,
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Convert shapefile to GeoJSON format
    to_geojson = BashOperator(
        task_id='{layer}_to_geojson'.format(layer=layer),
        bash_command=shp_to_geojson(path_to_file),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Convert shapefile to TopoJSON format
    to_topojson = BashOperator(
        task_id='{layer}_to_topojson'.format(layer=layer),
        bash_command=shp_to_topojson(path_to_file),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Compress shapefile components
    to_zip = PythonOperator(
        task_id='{layer}_shp_to_zip'.format(layer=layer),
        python_callable=shp_to_zip,
        op_kwargs={'datasd_name': datasd_name},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Upload shapefile to S3
    shp_to_S3 = S3FileTransferOperator(
        task_id='{layer}_shp_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.zip'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.zip'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload geojson to S3
    geojson_to_S3 = S3FileTransferOperator(
        task_id='{layer}_geojson_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.geojson'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.geojson'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload topojson to S3
    topojson_to_S3 = S3FileTransferOperator(
        task_id='{layer}_topojson_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.topojson'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.topojson'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Update portal modified date
    update_md = get_seaboard_update_dag('{md}.md'.format(md=md), dag)

    if layer not in no_pbf:
        #: Convert GeoJSON to Geobuf format
        to_geobuf = PythonOperator(
            task_id='{layer}_to_geobuf'.format(layer=layer),
            python_callable=geojson_to_geobuf,
            op_kwargs={'path_to_file': path_to_file},
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            dag=dag)

        #: Convert geobuf to gzipped geobuf
        to_gzip = PythonOperator(
            task_id='{layer}_geobuf_to_gzip'.format(layer=layer),
            python_callable=geobuf_to_gzip,
            op_kwargs={'datasd_name': datasd_name},
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            dag=dag)

        #: Upload geobuf to S3
        geobuf_to_S3 = S3FileTransferOperator(
            task_id='{layer}_geobuf_to_S3'.format(layer=layer),
            source_base_path=conf['prod_data_dir'],
            source_key='{datasd_name}.pbf'.format(datasd_name=datasd_name),
            dest_s3_conn_id=conf['default_s3_conn_id'],
            dest_s3_bucket=conf['dest_s3_bucket'],
            dest_s3_key='sde/{folder}/{datasd_name}.pbf'
                        .format(folder=folder, datasd_name=datasd_name),
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            replace=True,
            use_gzip=True,
            dag=dag)

        #: Conversion to geobuf is triggered after conversion to geojson.
        to_geobuf.set_upstream(to_geojson)

        #: Compression to gzip is triggered after conversion to geobuf.
        to_gzip.set_upstream(to_geobuf)

        #: geobuf upload to S3 is triggered after compression to gzipped geobuf.
        geobuf_to_S3.set_upstream(to_gzip)

        #: Github update depends on geobuf S3 upload success.
        update_md.set_upstream(geobuf_to_S3)

    #: Execution rules:
    #: sde_latest_only must run before shp conversion.
    to_shp.set_upstream(sde_latest_only)

    #: Conversion to geojson is triggered after conversion to shp.
    to_geojson.set_upstream(to_shp)

    #: Conversion to topojson is triggered after conversion to shapefile.
    to_topojson.set_upstream(to_shp)

    #: Compression to zip is triggered after conversion to geojson and topojson.
    to_zip.set_upstream(to_geojson)
    to_zip.set_upstream(to_topojson)

    #: shapefile upload to S3 is triggered after conversion to zip.
    shp_to_S3.set_upstream(to_zip)

    #: geojson upload to S3 is triggered after conversion to geojson.
    geojson_to_S3.set_upstream(to_geojson)

    #: topojson upload to S3 is triggered after conversion to topojson.
    topojson_to_S3.set_upstream(to_topojson)

    #: Github update depends on shapefile, geojson, and topojson S3 upload success.
    update_md.set_upstream(shp_to_S3)
    update_md.set_upstream(geojson_to_S3)
    update_md.set_upstream(topojson_to_S3)
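
A caller in a _dags module would invoke this factory roughly as below; every argument value is a placeholder chosen only to show the expected shapes, not a real layer taken from the snippets above.

# Hypothetical invocation; argument values and parks_sde_to_shp are placeholders.
create_sde_tasks(
    dag=dag,
    folder='parks',                                     # S3 subfolder under sde/
    layer='parks',                                      # prefix used to build task_ids
    datasd_name='parks_datasd',                         # layer + '_datasd', used in file names
    md='parks-locations',                               # Seaboard md file, without '.md'
    path_to_file=conf['prod_data_dir'] + '/parks_datasd',
    sde_to_shp=parks_sde_to_shp)                        # layer-specific _jobs function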