FROM tmp.moving_violations a INNER JOIN source_data.mv_geocodes ON source_data.mv_geocodes.location = a.location WHERE a.latitude IS NULL and a.longitude IS NULL """ engine.execute(step_2_query) print("lat and long values updated") # create geography field with populated lat and long values, and create index step_3_query = """ UPDATE tmp.moving_violations_add_geocode SET geography = ST_SetSRID(ST_MakePoint(longitude_2, latitude_2), 4326)::geography WHERE geography IS NULL AND longitude_2 is not null AND latitude_2 is not null ; CREATE INDEX IF NOT EXISTS mv_geom_idx ON tmp.moving_violations_add_geocode USING GIST (geography); """ engine.execute(step_3_query) print("geography field created") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='moving_violations_nbh_ward', from_schema='tmp', from_table='moving_violations_add_geocode', partition_by_field='objectid') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='moving_violations_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") next_tables = add_roadway_info(engine=engine, target_schema='tmp', target_table='moving_violations_roadway_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='objectid', within_distance= 10) print("roadway info query complete") next_tables = add_intersection_info(engine=engine, target_schema='tmp', target_table='moving_violations_intersection_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='objectid', within_distance= 20) print("intersection info query complete") row_count = create_final_table(engine=engine, target_schema = 'analysis_data', target_table='moving_violations', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ",row_count)
env = "DEV" engine = create_postgres_engine(destination="AWS_PostGIS", target_db=dbname, env=env) db_credentials = get_connection_strings("AWS_PostGIS") engine.execute( 'update source_data.census_blocks set geography = ST_MakeValid(geography::geometry)::geography' ) print('geography updated') # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='census_blocks_nbh_ward', from_schema='source_data', from_table='census_blocks', partition_by_field='objectid') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='census_blocks_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") row_count = create_final_table(engine=engine, target_schema='analysis_data', target_table='census_blocks', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ", row_count)
def generate_pulsepoint_analysis_table(AWS_Credentials: dict, **kwargs): # if no environment is specified default to dev env = kwargs.get('env', None) if env == None: env = 'DEV' env = env.upper() # set up RDS and S3 connections, engines, cursors region = AWS_Credentials['region'] engine = create_postgres_engine(destination="AWS_PostGIS", env=env) # flag that some records might be duplicate calls for the same incident dupe_check_query = """ DROP TABLE IF EXISTS tmp.pulsepoint_dupe_check; CREATE TABLE tmp.pulsepoint_dupe_check AS ( SELECT DISTINCT a.* , case when b.incident_id is null then 1 when a.num_units_responding = 0 and b.num_units_responding >0 then 0 when b.unit_status_transport > a.unit_status_transport then 0 when b.num_units_responding > a.num_units_responding then 0 when b.call_received_datetime < a.call_received_datetime then 0 else 1 end as KEEP_RECORD_FLAG FROM source_data.pulsepoint a LEFT JOIN source_data.pulsepoint b on a.incident_id <> b.incident_id and date_part('day', a.call_received_datetime - b.call_received_datetime) = 0 and date_part('hour', a.call_received_datetime - b.call_received_datetime) = 0 and date_part('month', a.call_received_datetime - b.call_received_datetime) = 0 and abs(date_part('minute', a.call_received_datetime - b.call_received_datetime)) <=20 and ST_DWithin(a.geography, b.geography, 100) and a.Agency_ID = b.Agency_ID and (a.num_units_responding = 0 or a.unit_ids && b.unit_ids) ) ; CREATE INDEX IF NOT EXISTS pulsepoint_dupe_check_geom_idx ON tmp.pulsepoint_dupe_check USING GIST (geography); """ # then join to the crashes table crashes_join_query = """ DROP TABLE IF EXISTS tmp.pulsepoint_crash_join; CREATE TABLE tmp.pulsepoint_crash_join AS (SELECT * FROM ( SELECT DISTINCT a.* ,concat(a.agency_id, a.incident_id) as Agency_Incident_ID ,b.objectid as Crash_Objectid ,b.geography as Crash_Geo ,b.total_injuries as Crash_Total_Injuries ,b.total_major_injuries as Crash_Total_Major_Injuries ,b.total_minor_injuries as Crash_Total_Minor_Injuries ,(b.bicycle_fatalities + b.pedestrian_fatalities + b.vehicle_fatalities) as Crash_Total_Fatalities ,b.bicycle_injuries as Crash_Bike_Injuries ,b.vehicle_injuries as Crash_Car_Injuries ,b.pedestrian_injuries as Crash_Ped_Injuries ,case when b.total_injuries is null or b.total_injuries < a.unit_status_transport then 1 else 0 end as injuries_mismatch ,ST_Distance(a.geography, b.geography) as Distance_To_Crash ,(b.reportdate at time zone 'America/New_York') - (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York') as Time_Between_Crash_And_Report ,b.intersectionid as Crash_Intersection_ID ,b.block_objectid as Crash_Block_Objectid ,Row_Number() over (partition by a.incident_id, a.agency_id order by ST_Distance(a.geography, b.geography)) as Crash_Distance_Rank ,Row_Number() over (partition by a.incident_id, a.agency_id order by (b.reportdate at time zone 'America/New_York') - (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York')) as Crash_Time_Rank FROM tmp.pulsepoint_dupe_check a LEFT JOIN analysis_data.dc_crashes_w_details b on ST_DWITHIN(a.geography, b.geography, 200) AND cast(b.fromdate as date) =cast((call_received_datetime at time zone 'America/New_York') as date) AND (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York') < (b.reportdate at time zone 'America/New_York') WHERE a.KEEP_RECORD_FLAG = 1 ) tmp WHERE Crash_Distance_Rank = 1 and (incident_type in ('TC', 'TCE', 'TCS') or (agency_id = '16000' and incident_type in ('TC', 'TCS', 'TCE', 'RES'))) ) ; CREATE INDEX IF NOT EXISTS pulsepoint_crash_join_geom_idx ON tmp.pulsepoint_crash_join USING GIST (geography); alter table tmp.pulsepoint_crash_join drop column KEEP_RECORD_FLAG; alter table tmp.pulsepoint_crash_join drop column Crash_Distance_Rank; """ # First execute the table-specific queries engine.execute(dupe_check_query) print("dupe check query complete") engine.execute(crashes_join_query) print("join to crashes query complete") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='pulsepoint_nbh_ward', from_schema='tmp', from_table='pulsepoint_crash_join', partition_by_field='Agency_Incident_ID') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='pulsepoint_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") next_tables = add_walkscore_info(engine=engine, target_schema='tmp', target_table='pulsepoint_walkscore', from_schema=next_tables[0], from_table=next_tables[1]) print("walkscore query complete") next_tables = add_roadway_info(engine=engine, target_schema='tmp', target_table='pulsepoint_roadway_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='Agency_Incident_ID', within_distance=100) print("roadway info query complete") next_tables = add_intersection_info( engine=engine, target_schema='tmp', target_table='pulsepoint_intersection_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='Agency_Incident_ID', within_distance=60) print("intersection info query complete") next_tables = is_national_park(engine=engine, target_schema='tmp', target_table='pulsepoint_national_park', from_schema=next_tables[0], from_table=next_tables[1]) print("national parks info query complete") row_count = create_final_table(engine=engine, target_schema='analysis_data', target_table='pulsepoint', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ", row_count)
def generate_crashes_table(AWS_Credentials: dict, **kwargs): # if no environment is specified default to dev env = kwargs.get('env', None) if env == None: env = 'DEV' env = env.upper() # set up RDS and S3 connections, engines, cursors region = AWS_Credentials['region'] engine = create_postgres_engine(destination="AWS_PostGIS", env=env) # The queries that are specific to the crash data and are not run anywhere else add_columns_query = """ DROP TABLE IF EXISTS tmp.crash_details; CREATE TABLE tmp.crash_details AS ( SELECT * ,CASE WHEN PERSONTYPE = 'Driver' AND AGE >=65 THEN 1 ELSE 0 END AS DRIVERS_OVER_65 ,CASE WHEN PERSONTYPE = 'Driver' AND AGE <=25 THEN 1 ELSE 0 END AS DRIVERS_UNDER_25 ,CASE WHEN PERSONTYPE = 'Pedestrian' AND AGE >=65 THEN 1 ELSE 0 END AS PEDS_OVER_65 ,CASE WHEN PERSONTYPE = 'Pedestrian' AND AGE <=12 THEN 1 ELSE 0 END AS PEDS_UNDER_12 ,CASE WHEN PERSONTYPE = 'Bicyclist' AND AGE >=65 THEN 1 ELSE 0 END AS BIKERS_OVER_65 ,CASE WHEN PERSONTYPE = 'Bicyclist' AND AGE <=18 THEN 1 ELSE 0 END AS BIKERS_UNDER_18 ,CASE WHEN PERSONTYPE = 'Driver' AND LICENSEPLATESTATE <> 'DC' AND LICENSEPLATESTATE <> ' None' THEN 1 ELSE 0 END AS OOS_VEHICLES ,CASE WHEN PERSONTYPE = 'Driver' AND INVEHICLETYPE = 'Passenger Car/automobile' THEN 1 ELSE 0 END AS NUM_CARS ,CASE WHEN PERSONTYPE = 'Driver' AND INVEHICLETYPE in ('Suv (sport Utility Vehicle)', 'Pickup Truck') THEN 1 ELSE 0 END AS NUM_SUVS_OR_TRUCKS ,CASE WHEN PERSONTYPE = 'Pedestrian' AND FATAL='Y' THEN 1 ELSE 0 END AS PED_FATALITIES ,CASE WHEN PERSONTYPE = 'Bicyclist' AND FATAL='Y'THEN 1 ELSE 0 END AS BICYCLE_FATALITIES ,CASE WHEN PERSONTYPE in ('Driver','Passenger') AND FATAL='Y' THEN 1 ELSE 0 END AS VEHICLE_FATALITIES ,CASE WHEN PERSONTYPE = 'Pedestrian' AND (MAJORINJURY='Y' OR MINORINJURY ='Y')THEN 1 ELSE 0 END AS PED_INJURIES ,CASE WHEN PERSONTYPE = 'Bicyclist' AND (MAJORINJURY='Y' OR MINORINJURY ='Y') THEN 1 ELSE 0 END AS BICYCLE_INJURIES ,CASE WHEN PERSONTYPE in ('Driver','Passenger') AND (MAJORINJURY='Y' OR MINORINJURY ='Y') THEN 1 ELSE 0 END AS VEHICLE_INJURIES ,CASE WHEN PERSONTYPE = 'Driver' AND TICKETISSUED ='Y' THEN 1 ELSE 0 END AS DRIVER_TICKETS ,CASE WHEN PERSONTYPE = 'Driver' AND SPEEDING ='Y' THEN 1 ELSE 0 END AS DRIVERS_SPEEDING ,CASE WHEN PERSONTYPE = 'Driver' AND IMPAIRED ='Y' THEN 1 ELSE 0 END AS DRIVERS_IMPAIRED ,CASE WHEN PERSONTYPE = 'Bicyclist' AND TICKETISSUED ='Y' THEN 1 ELSE 0 END AS BICYCLE_TICKETS ,CASE WHEN PERSONTYPE = 'Pedestrian' AND TICKETISSUED ='Y' THEN 1 ELSE 0 END AS PED_TICKETS ,CASE WHEN (MAJORINJURY='Y' OR MINORINJURY ='Y') THEN 1 ELSE 0 END AS TOTAL_INJURIES ,CASE WHEN MAJORINJURY='Y' THEN 1 ELSE 0 END AS TOTAL_MAJOR_INJURIES ,CASE WHEN MINORINJURY ='Y' THEN 1 ELSE 0 END AS TOTAL_MINOR_INJURIES ,CASE WHEN PERSONTYPE = 'Driver' THEN 1 ELSE 0 END AS TOTAL_VEHICLES ,CASE WHEN PERSONTYPE = 'Pedestrian' THEN 1 ELSE 0 END AS TOTAL_PEDESTRIANS ,CASE WHEN PERSONTYPE = 'Bicyclist' THEN 1 ELSE 0 END AS TOTAL_BICYCLISTS FROM source_data.crash_details ) """ group_by_query = """ DROP TABLE IF EXISTS tmp.crash_details_agg; CREATE TABLE tmp.crash_details_agg AS ( SELECT CRIMEID ,SUM(DRIVERS_OVER_65) AS DRIVERS_OVER_65 ,SUM(DRIVERS_UNDER_25) AS DRIVERS_UNDER_25 ,SUM(PEDS_OVER_65) AS PEDS_OVER_65 ,SUM(PEDS_UNDER_12) AS PEDS_UNDER_12 ,SUM(BIKERS_OVER_65) AS BIKERS_OVER_65 ,SUM(BIKERS_UNDER_18) AS BIKERS_UNDER_18 ,SUM(OOS_VEHICLES) AS OOS_VEHICLES ,SUM(NUM_CARS) AS NUM_CARS ,SUM(NUM_SUVS_OR_TRUCKS) AS NUM_SUVS_OR_TRUCKS ,SUM(PED_INJURIES) AS PEDESTRIAN_INJURIES ,SUM(BICYCLE_INJURIES) AS BICYCLE_INJURIES ,SUM(VEHICLE_INJURIES) AS VEHICLE_INJURIES ,SUM(PED_FATALITIES) AS PEDESTRIAN_FATALITIES ,SUM(BICYCLE_FATALITIES) AS BICYCLE_FATALITIES ,SUM(VEHICLE_FATALITIES) AS VEHICLE_FATALITIES ,SUM(DRIVER_TICKETS) AS DRIVER_TICKETS ,SUM(DRIVERS_SPEEDING) AS DRIVERS_SPEEDING ,SUM(DRIVERS_IMPAIRED) AS DRIVERS_IMPAIRED ,SUM(BICYCLE_TICKETS) AS BICYCLE_TICKETS ,SUM(PED_TICKETS) AS PED_TICKETS ,SUM(TOTAL_INJURIES) AS TOTAL_INJURIES ,SUM(TOTAL_MAJOR_INJURIES) AS TOTAL_MAJOR_INJURIES ,SUM(TOTAL_MINOR_INJURIES) AS TOTAL_MINOR_INJURIES ,SUM(TOTAL_VEHICLES) AS TOTAL_VEHICLES ,SUM(TOTAL_PEDESTRIANS) AS TOTAL_PEDESTRIANS ,SUM(TOTAL_BICYCLISTS) AS TOTAL_BICYCLISTS ,ARRAY_AGG(PERSONTYPE) AS PERSONTYPE_ARRAY ,ARRAY_AGG(INVEHICLETYPE) AS INVEHICLETYPE_ARRAY ,ARRAY_AGG(LICENSEPLATESTATE) AS LICENSEPLATESTATE_ARRAY FROM tmp.crash_details GROUP BY CRIMEID ) ; create index crime_id on tmp.crash_details_agg (crimeid); """ join_query = """ DROP TABLE IF EXISTS tmp.crashes_join; CREATE TABLE tmp.crashes_join AS ( SELECT a.OBJECTID ,a.CRIMEID ,a.REPORTDATE ,a.FROMDATE ,a.TODATE ,a.ADDRESS ,a.mpdlatitude ,a.mpdlongitude ,CASE WHEN b.CRIMEID IS NULL OR b.BICYCLE_INJURIES < (a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST) THEN (a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST) ELSE b.BICYCLE_INJURIES END AS BICYCLE_INJURIES ,CASE WHEN b.CRIMEID IS NULL OR b.VEHICLE_INJURIES < (a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER) THEN (a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER) ELSE b.VEHICLE_INJURIES END AS VEHICLE_INJURIES ,CASE WHEN b.CRIMEID IS NULL OR b.PEDESTRIAN_INJURIES < (a.MAJORINJURIES_PEDESTRIAN+ a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN) THEN (a.MAJORINJURIES_PEDESTRIAN + a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN) ELSE b.PEDESTRIAN_INJURIES END AS PEDESTRIAN_INJURIES ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_INJURIES < (a.MAJORINJURIES_PEDESTRIAN+ a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN +a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER +a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST) THEN (a.MAJORINJURIES_PEDESTRIAN+ a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN +a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER +a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST) ELSE b.TOTAL_INJURIES end as TOTAL_INJURIES ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_MAJOR_INJURIES < (a.MAJORINJURIES_PEDESTRIAN+ +a.MAJORINJURIES_DRIVER+a.MAJORINJURIESPASSENGER +a.MAJORINJURIES_BICYCLIST) THEN (a.MAJORINJURIES_PEDESTRIAN+a.MAJORINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MAJORINJURIES_BICYCLIST) ELSE b.TOTAL_MAJOR_INJURIES end as TOTAL_MAJOR_INJURIES ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_MINOR_INJURIES < (a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN +a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER +a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST) THEN (a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN +a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER +a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST) ELSE b.TOTAL_MINOR_INJURIES end as TOTAL_MINOR_INJURIES ,CASE WHEN b.CRIMEID IS NULL OR b.BICYCLE_FATALITIES < a.FATAL_BICYCLIST THEN a.FATAL_BICYCLIST ELSE b.BICYCLE_FATALITIES END AS BICYCLE_FATALITIES ,CASE WHEN b.CRIMEID IS NULL OR b.PEDESTRIAN_FATALITIES < a.FATAL_PEDESTRIAN THEN a.FATAL_PEDESTRIAN ELSE b.PEDESTRIAN_FATALITIES END AS PEDESTRIAN_FATALITIES ,CASE WHEN b.CRIMEID IS NULL OR b.VEHICLE_FATALITIES < (a.FATAL_DRIVER+a.FATALPASSENGER) THEN (a.FATAL_DRIVER+a.FATALPASSENGER) ELSE b.VEHICLE_FATALITIES END AS VEHICLE_FATALITIES ,CASE WHEN b.CRIMEID IS NULL or b.DRIVERS_IMPAIRED < a.DRIVERSIMPAIRED THEN a.DRIVERSIMPAIRED ELSE b.DRIVERS_IMPAIRED END AS DRIVERS_IMPAIRED ,CASE WHEN b.CRIMEID IS NULL or b.DRIVERS_SPEEDING < a.SPEEDING_INVOLVED THEN a.SPEEDING_INVOLVED ELSE b.DRIVERS_SPEEDING END AS DRIVERS_SPEEDING ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_VEHICLES < a.TOTAL_VEHICLES THEN a.TOTAL_VEHICLES ELSE b.TOTAL_VEHICLES END AS TOTAL_VEHICLES ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_BICYCLISTS < a.TOTAL_BICYCLES THEN a.TOTAL_BICYCLES ELSE b.TOTAL_BICYCLISTS END AS TOTAL_BICYCLISTS ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_PEDESTRIANS < a.TOTAL_PEDESTRIANS THEN a.TOTAL_PEDESTRIANS ELSE b.TOTAL_PEDESTRIANS END AS TOTAL_PEDESTRIANS ,b.DRIVERS_OVER_65 ,b.DRIVERS_UNDER_25 ,b.PEDS_OVER_65 ,b.PEDS_UNDER_12 ,b.BIKERS_OVER_65 ,b.BIKERS_UNDER_18 ,b.OOS_VEHICLES ,b.NUM_CARS ,b.NUM_SUVS_OR_TRUCKS ,b.DRIVER_TICKETS ,b.BICYCLE_TICKETS ,b.PED_TICKETS ,b.PERSONTYPE_ARRAY ,b.INVEHICLETYPE_ARRAY ,b.LICENSEPLATESTATE_ARRAY ,a.INTAPPROACHDIRECTION ,a.LOCATIONERROR ,a.LASTUPDATEDATE ,a.BLOCKKEY ,a.SUBBLOCKKEY ,ST_Force2D(a.geography::geometry) as geography FROM source_data.crashes_raw a LEFT JOIN tmp.crash_details_agg b on a.CRIMEID = b.CRIMEID WHERE date_part('year', a.fromdate) >=2015 ) ; CREATE INDEX crashes_geom_idx ON tmp.crashes_join USING GIST (geography); """ # join in the pulsepoint info pulsepoint_join_query = """ DROP TABLE IF EXISTS tmp.crash_pulsepoint_join; CREATE TABLE tmp.crash_pulsepoint_join AS (SELECT * FROM ( SELECT DISTINCT a.* ,b.Agency_Incident_ID as pp_agency_incident_id ,b.unit_status_transport as pp_total_injuries ,b.transport_unit_is_amr as pp_total_minor_injuries ,b.transport_unit_is_non_amr as pp_total_major_injuries ,Row_Number() over (partition by a.objectid order by ST_Distance(a.geography, b.geography)) as PP_Call_Distance_Rank ,Row_Number() over (partition by a.objectid order by (a.reportdate at time zone 'America/New_York') - (b.CALL_RECEIVED_DATETIME at time zone 'America/New_York')) as PP_Call_Time_Rank FROM tmp.crashes_join a LEFT JOIN analysis_data.pulsepoint b on ST_DWITHIN(a.geography, b.geography, 200) AND cast(fromdate as date) =cast((call_received_datetime at time zone 'America/New_York') as date) AND (b.CALL_RECEIVED_DATETIME at time zone 'America/New_York') < (a.reportdate at time zone 'America/New_York') ) tmp WHERE PP_Call_Distance_Rank = 1 ) ; CREATE INDEX IF NOT EXISTS crash_pulsepoint_join_geom_idx ON tmp.crash_pulsepoint_join USING GIST (geography); alter table tmp.crash_pulsepoint_join drop column PP_Call_Distance_Rank; """ # First execute the table-specific queries engine.execute(add_columns_query) print("add columns query complete") engine.execute(group_by_query) print("group by query complete") engine.execute(join_query) print("join query complete") engine.execute(pulsepoint_join_query) print("pulsepoint join query complete") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='crashes_nbh_ward', from_schema='tmp', from_table='crash_pulsepoint_join', partition_by_field='objectid') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='crashes_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") next_tables = add_walkscore_info(engine=engine, target_schema='tmp', target_table='crashes_walkscore', from_schema=next_tables[0], from_table=next_tables[1]) print("walkscore query complete") next_tables = add_roadway_info(engine=engine, target_schema='tmp', target_table='crashes_roadway_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='objectid', within_distance=0.001) print("roadway info query complete") next_tables = add_intersection_info( engine=engine, target_schema='tmp', target_table='crashes_intersection_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='objectid', within_distance=10) print("intersection info query complete") row_count = create_final_table(engine=engine, target_schema='analysis_data', target_table='dc_crashes_w_details', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ", row_count)
print("dupe check query complete") engine.execute(crashes_join_query) print("join to crashes query complete") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='pulsepoint_nbh_ward', from_schema='tmp', from_table='pulsepoint_crash_join', partition_by_field='Agency_Incident_ID') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='pulsepoint_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") next_tables = add_walkscore_info(engine=engine, target_schema='tmp', target_table='pulsepoint_walkscore', from_schema=next_tables[0], from_table=next_tables[1]) print("walkscore query complete") next_tables = add_roadway_info(engine=engine, target_schema='tmp', target_table='pulsepoint_roadway_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='Agency_Incident_ID',
LEFT JOIN analysis_data.roadway_blocks b on ST_DWithin(a.geography, b.geography, 0.001) GROUP BY a.intersectionid, a.intersection_type, a.geography, a.street_names ) ; CREATE INDEX tmp_intersection_points_road_types_index ON tmp.intersection_points_road_types USING GIST (geography); """ # First execute the table-specific queries engine.execute(step1_query) print("step1 query complete") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='intersection_points_nbh_ward', from_schema='tmp', from_table='intersection_points_road_types', partition_by_field='intersectionid') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='intersection_points_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") row_count = create_final_table(engine=engine, target_schema='analysis_data', target_table='intersection_points', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ", row_count)
,ST_SetSRID(ST_MakePoint(cast(xcoord as numeric), cast(ycoord as numeric)), 4326)::geography as geography FROM source_data.metro_stations_daily_ridership WHERE station is not null ) """ # First execute the table-specific queries engine.execute(step1_query) print("step1 query complete") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info( engine=engine, target_schema='tmp', target_table='metro_stations_ridership_nbh_ward', from_schema='tmp', from_table='metro_stations_ridership', partition_by_field='objectid') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='metro_stations_ridership_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") row_count = create_final_table(engine=engine, target_schema='analysis_data', target_table='metro_stations_ridership', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ", row_count)
def generate_moving_violations_table(AWS_Credentials: dict, **kwargs): # if no environment is specified default to dev env = kwargs.get('env', None) if env == None: env = 'DEV' env = env.upper() # set up RDS and S3 connections, engines, cursors region = AWS_Credentials['region'] engine = create_postgres_engine(destination="AWS_PostGIS", env=env) # First move all source data records to a temp table step_1_query = """ CREATE TABLE IF NOT EXISTS tmp.moving_violations_need_geo as SELECT * FROM source_data.moving_violations WHERE geography IS NULL; CREATE TABLE IF NOT EXISTS tmp.moving_violations_has_geo as SELECT * FROM source_data.moving_violations WHERE geography IS NOT NULL; CREATE INDEX IF NOT EXISTS mv_location_index ON tmp.moving_violations_need_geo (location); """ engine.execute(step_1_query) print("temp table created") # geocode the locations records = [ loc for (loc, ) in engine.execute( "select distinct location from tmp.moving_violations_need_geo where geography is null limit 2000" ).fetchall() ] print(len(records), " records passed to geocode function") geocode_text(engine=engine, records_to_geocode=records, administrative_area='District of Columbia', text_type='Moving Violations location') # update lat and long values from new data step_2_query = """ UPDATE tmp.moving_violations_need_geo SET geography = source_data.geocoded_text.point_geography FROM source_data.geocoded_text WHERE source_data.geocoded_text.text = location ; INSERT INTO tmp.moving_violations_has_geo SELECT * FROM tmp.moving_violations_need_geo; CREATE INDEX IF NOT EXISTS mv_geom_idx ON tmp.moving_violations_has_geo USING GIST (geography); """ engine.execute(step_2_query) print("geo values updated") # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table next_tables = add_location_info(engine=engine, target_schema='tmp', target_table='moving_violations_nbh_ward', from_schema='tmp', from_table='moving_violations_has_geo', partition_by_field='objectid') print("neighborhood-ward query complete") next_tables = add_school_info(engine=engine, target_schema='tmp', target_table='moving_violations_schools', from_schema=next_tables[0], from_table=next_tables[1]) print("schools query complete") next_tables = add_roadway_info( engine=engine, target_schema='tmp', target_table='moving_violations_roadway_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='objectid', within_distance=50) print("roadway info query complete") next_tables = add_intersection_info( engine=engine, target_schema='tmp', target_table='moving_violations_intersection_info', from_schema=next_tables[0], from_table=next_tables[1], partition_by_field='objectid', within_distance=20) print("intersection info query complete") row_count = create_final_table(engine=engine, target_schema='analysis_data', target_table='moving_violations', from_schema=next_tables[0], from_table=next_tables[1]) print("final query complete with row count ", row_count)