def extract_scanner_audio_json(target_schema: str, source_table: str,
                               target_table: str, AWS_Credentials: dict,
                               **kwargs):
    # assign optional arguments
    source_schema = kwargs.get('source_schema', None)
    if source_schema is None:
        source_schema = 'stg'
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # extract the json info
    step1_query = """
    DROP TABLE IF EXISTS tmp.citizen;
    CREATE TABLE tmp.citizen AS (
        WITH results AS (
            SELECT jsonb_array_elements(test.results) AS data, source_file, load_datetime
            FROM (
                SELECT data->'results' AS results, source_file, load_datetime
                FROM {0}."{1}"
            ) AS test
        )
        SELECT to_timestamp(TRUNC((data->'cs')::bigint)/1000) AS cs
            ,(data->'ll'->0)::numeric AS lat
            ,(data->'ll'->1)::numeric AS long
            ,to_timestamp(TRUNC((data->'ts')::bigint)/1000) AS ts
            ,(data->'key')::varchar AS incident_key
            ,(data->'raw')::varchar AS incident_desc_raw
            ,(data->'source')::varchar AS incident_source
            ,(data->'categories') AS incident_categories
            ,source_file
            ,load_datetime
            ,data
        FROM results
    );
    """.format(source_schema, source_table)

    step_2_query = """
    DROP TABLE IF EXISTS tmp.citizen_geometry;
    CREATE TABLE tmp.citizen_geometry AS (
        SELECT *, ST_SetSRID(ST_MakePoint(long, lat), 4326)::geography AS geography
        FROM tmp.citizen
    );
    """

    final_query = """
    CREATE TABLE IF NOT EXISTS {0}.{1} (LIKE tmp.citizen_geometry);
    INSERT INTO {0}.{1}
        SELECT * FROM tmp.citizen_geometry;
    GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC;
    """.format(target_schema, target_table)

    engine.execute(step1_query)
    engine.execute(step_2_query)
    engine.execute(final_query)

    count_query = 'SELECT COUNT(*) FROM {}.{} WHERE source_file like \'%%{}%%\''.format(
        target_schema, target_table, source_table)
    row_count = engine.execute(count_query).fetchone()[0]
    print("{} rows inserted into final table from file {}".format(
        str(row_count), source_table))

    drop_table_query = 'DROP TABLE IF EXISTS {}."{}"'.format(
        source_schema, source_table)
    engine.execute(drop_table_query)
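# For reference, a sketch of the payload shape the query above unpacks: each
# staged row's data->'results' is a JSON array, and jsonb_array_elements()
# fans it out one element per row. The field names come from the query; the
# values below are made up for illustration only.
example_result_element = {
    "cs": 1609459200000,          # created timestamp, epoch milliseconds
    "ll": [38.9072, -77.0369],    # [lat, long]
    "ts": 1609459260000,          # updated timestamp, epoch milliseconds
    "key": "abc123",              # incident key
    "raw": "Reported crash at intersection",  # raw incident description
    "source": "scanner",
    "categories": ["traffic"],
}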
def s3_to_postGIS(folder_to_load: str, AWS_Credentials: dict, format: str,
                  header: str, mode: str, move_after_loading: str,
                  move_to_folder: str):
    # set up S3 and RDS connections
    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    bucket_name = AWS_Credentials['s3_bucket']
    bucket = s3_resource.Bucket(bucket_name)
    region = AWS_Credentials['region']
    dbname = 'postgres'
    env = "DEV"
    engine = create_postgres_engine(destination="AWS_PostGIS",
                                    target_db=dbname,
                                    env=env)
    db_credentials = get_connection_strings("AWS_PostGIS")
    db_uid = db_credentials[env]['UID']
    db_pwd = db_credentials[env]['PWD']
    db_host = db_credentials[env]['HOST']
    db_port = db_credentials[env]['PORT']

    # add psql install location to default path
    psql_path = subprocess.check_output(['which', 'psql']).strip().decode('utf-8')
    sys.path.append(psql_path)

    # grab list of all files in target folder that have a target table
    # url encode the file key so the ones with semicolons don't throw an error
    # update january 2021: the script now throws an error if fed URL-encoded
    # object keys, so the plain-text key is used for the actual S3 calls now
    files_to_load = [(urllib.parse.quote(obj.key), obj.key,
                      obj.Object().metadata['target_schema'],
                      obj.Object().metadata['target_table'])
                     for obj in bucket.objects.filter(Prefix=folder_to_load)
                     if 'target_table' in obj.Object().metadata.keys()
                     if format in obj.key]

    # generate distinct list of target tables so each one is only dropped and
    # recreated/truncated one time
    target_tables = [(target_schema, target_table)
                     for (file_name, file_name_native, target_schema,
                          target_table) in files_to_load]
    target_tables_distinct = set(target_tables)
    target_tables = list(target_tables_distinct)

    # drop and recreate and/or truncate each target table
    for (target_schema, target_table) in target_tables:
        generate_table(engine=engine, target_schema=target_schema,
                       target_table=target_table, mode=mode)

    # set table import parameters that are the same for every file
    copy_parameters = '\'(FORMAT {}, HEADER {})\''.format(format, header)
    columns_to_copy = '\'\''
    aws_credentials_param = '\'{}\', \'{}\',\'\''.format(
        AWS_Credentials['aws_access_key_id'],
        AWS_Credentials['aws_secret_access_key'])

    # create file-specific table import parameters
    for (file_name, file_name_native, target_schema,
         target_table) in files_to_load:
        destination_table = '\'{}.{}\''.format(target_schema, target_table)
        create_s3_uri_param = '\'{}\', \'{}\',\'{}\''.format(
            AWS_Credentials['s3_bucket'], file_name_native, region)
        base_file_name = os.path.basename(file_name_native)
        # create import statement
        import_table_query = 'SELECT aws_s3.table_import_from_s3({}, {},{}, aws_commons.create_s3_uri({}) ,aws_commons.create_aws_credentials({}));'.format(
            destination_table, columns_to_copy, copy_parameters,
            create_s3_uri_param, aws_credentials_param)
        # create arg to pass to os.system
        os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
            db_pwd, db_host, db_port, db_uid, dbname, import_table_query)
        # execute
        if move_after_loading != 'yes':
            os.system(os_system_arg)
        elif move_after_loading == 'yes' and move_to_folder != '':
            os.system(os_system_arg)
            try:
                s3_resource.Object(bucket_name,
                                   move_to_folder + base_file_name).copy_from(
                                       CopySource={'Bucket': bucket_name,
                                                   'Key': file_name_native})
                s3_resource.Object(bucket_name, file_name_native).delete()
            except Exception:
                print(file_name_native, " could not be copied and/or deleted")
                continue
        else:
            print("please provide move-to folder")
            continue

    # after data is loaded, update the geographies
    for (target_schema, target_table) in target_tables:
        correct_geo(engine=engine, target_schema=target_schema,
                    target_table=target_table, mode=mode)
import sqlalchemy
from connect_to_rds import get_connection_strings, create_postgres_engine
from add_location_info import add_location_info, add_school_info, create_final_table

dbname = 'postgres'
env = "DEV"
engine = create_postgres_engine(destination="AWS_PostGIS",
                                target_db=dbname,
                                env=env)
db_credentials = get_connection_strings("AWS_PostGIS")

geography_levels = {
    'comp_plan_area': {
        'geo_boundaries_source_table': 'source_data.comp_plan_areas',
        'orig_field_name': 'name'
    },
    'census_tract': {
        'geo_boundaries_source_table': 'source_data.census_tracts',
        'orig_field_name': 'tract'
    },
    'nbh_cluster_names': {
        'geo_boundaries_source_table': 'source_data.neighborhood_clusters',
        'orig_field_name': 'nbh_names'
    },
    'ward_name': {
        'geo_boundaries_source_table': 'source_data.ward_boundaries',
        'orig_field_name': 'name'
    },
    'anc_id': {
        'geo_boundaries_source_table': 'source_data.anc_boundaries',
        'orig_field_name': 'anc_id'
    },
}
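# A sketch of how the geography_levels mapping above might drive a spatial
# join: for each level, copy the boundary attribute onto every record whose
# point falls inside the boundary polygon. The analysis_data.target_table
# name and the pre-existing level columns are assumptions for illustration.
for level, info in geography_levels.items():
    update_query = """
    UPDATE analysis_data.target_table a
    SET {level} = b.{orig_field}
    FROM {boundary_table} b
    WHERE ST_Intersects(a.geography, b.geography);
    """.format(level=level,
               orig_field=info['orig_field_name'],
               boundary_table=info['geo_boundaries_source_table'])
    engine.execute(update_query)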
import argparse

CLI = argparse.ArgumentParser()
CLI.add_argument("--env", type=str)
CLI.add_argument("--source_schema", type=str)

# parse the command line
args = CLI.parse_args()
env = args.env
source_schema = args.source_schema

if __name__ == "__main__":
    if env is None:
        env = 'DEV'
    env = env.upper()
    # tables_to_extract = json_to_postGIS(folder_to_load='source-data/citizen/unparsed/', move_to_folder='source-data/citizen/loaded_to_postgis/', AWS_Credentials=get_connection_strings("AWS_DEV"))
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)
    tables_to_extract = [
        r for (r, ) in engine.execute(
            "select distinct table_name from information_schema.tables where table_schema = 'stg' and table_name like '%%transcribed_audio%%'"
        )
    ]
    for table in tables_to_extract:
        extract_scanner_audio_json(
            source_table=table,
            target_table='citizen_stream',
            target_schema='source_data',
            AWS_Credentials=get_connection_strings("AWS_DEV"),
            env=env)
def generate_crashes_table(AWS_Credentials: dict, **kwargs):
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # The queries that are specific to the crash data and are not run anywhere else
    add_columns_query = """
    DROP TABLE IF EXISTS tmp.crash_details;
    CREATE TABLE tmp.crash_details AS (
        SELECT *
        ,CASE WHEN PERSONTYPE = 'Driver' AND AGE >= 65 THEN 1 ELSE 0 END AS DRIVERS_OVER_65
        ,CASE WHEN PERSONTYPE = 'Driver' AND AGE <= 25 THEN 1 ELSE 0 END AS DRIVERS_UNDER_25
        ,CASE WHEN PERSONTYPE = 'Pedestrian' AND AGE >= 65 THEN 1 ELSE 0 END AS PEDS_OVER_65
        ,CASE WHEN PERSONTYPE = 'Pedestrian' AND AGE <= 12 THEN 1 ELSE 0 END AS PEDS_UNDER_12
        ,CASE WHEN PERSONTYPE = 'Bicyclist' AND AGE >= 65 THEN 1 ELSE 0 END AS BIKERS_OVER_65
        ,CASE WHEN PERSONTYPE = 'Bicyclist' AND AGE <= 18 THEN 1 ELSE 0 END AS BIKERS_UNDER_18
        ,CASE WHEN PERSONTYPE = 'Driver' AND LICENSEPLATESTATE <> 'DC' AND LICENSEPLATESTATE <> ' None' THEN 1 ELSE 0 END AS OOS_VEHICLES
        ,CASE WHEN PERSONTYPE = 'Driver' AND INVEHICLETYPE = 'Passenger Car/automobile' THEN 1 ELSE 0 END AS NUM_CARS
        ,CASE WHEN PERSONTYPE = 'Driver' AND INVEHICLETYPE in ('Suv (sport Utility Vehicle)', 'Pickup Truck') THEN 1 ELSE 0 END AS NUM_SUVS_OR_TRUCKS
        ,CASE WHEN PERSONTYPE = 'Pedestrian' AND FATAL = 'Y' THEN 1 ELSE 0 END AS PED_FATALITIES
        ,CASE WHEN PERSONTYPE = 'Bicyclist' AND FATAL = 'Y' THEN 1 ELSE 0 END AS BICYCLE_FATALITIES
        ,CASE WHEN PERSONTYPE in ('Driver','Passenger') AND FATAL = 'Y' THEN 1 ELSE 0 END AS VEHICLE_FATALITIES
        ,CASE WHEN PERSONTYPE = 'Pedestrian' AND (MAJORINJURY = 'Y' OR MINORINJURY = 'Y') THEN 1 ELSE 0 END AS PED_INJURIES
        ,CASE WHEN PERSONTYPE = 'Bicyclist' AND (MAJORINJURY = 'Y' OR MINORINJURY = 'Y') THEN 1 ELSE 0 END AS BICYCLE_INJURIES
        ,CASE WHEN PERSONTYPE in ('Driver','Passenger') AND (MAJORINJURY = 'Y' OR MINORINJURY = 'Y') THEN 1 ELSE 0 END AS VEHICLE_INJURIES
        ,CASE WHEN PERSONTYPE = 'Driver' AND TICKETISSUED = 'Y' THEN 1 ELSE 0 END AS DRIVER_TICKETS
        ,CASE WHEN PERSONTYPE = 'Driver' AND SPEEDING = 'Y' THEN 1 ELSE 0 END AS DRIVERS_SPEEDING
        ,CASE WHEN PERSONTYPE = 'Driver' AND IMPAIRED = 'Y' THEN 1 ELSE 0 END AS DRIVERS_IMPAIRED
        ,CASE WHEN PERSONTYPE = 'Bicyclist' AND TICKETISSUED = 'Y' THEN 1 ELSE 0 END AS BICYCLE_TICKETS
        ,CASE WHEN PERSONTYPE = 'Pedestrian' AND TICKETISSUED = 'Y' THEN 1 ELSE 0 END AS PED_TICKETS
        ,CASE WHEN (MAJORINJURY = 'Y' OR MINORINJURY = 'Y') THEN 1 ELSE 0 END AS TOTAL_INJURIES
        ,CASE WHEN MAJORINJURY = 'Y' THEN 1 ELSE 0 END AS TOTAL_MAJOR_INJURIES
        ,CASE WHEN MINORINJURY = 'Y' THEN 1 ELSE 0 END AS TOTAL_MINOR_INJURIES
        ,CASE WHEN PERSONTYPE = 'Driver' THEN 1 ELSE 0 END AS TOTAL_VEHICLES
        ,CASE WHEN PERSONTYPE = 'Pedestrian' THEN 1 ELSE 0 END AS TOTAL_PEDESTRIANS
        ,CASE WHEN PERSONTYPE = 'Bicyclist' THEN 1 ELSE 0 END AS TOTAL_BICYCLISTS
        FROM source_data.crash_details
    )
    """

    group_by_query = """
    DROP TABLE IF EXISTS tmp.crash_details_agg;
    CREATE TABLE tmp.crash_details_agg AS (
        SELECT CRIMEID
        ,SUM(DRIVERS_OVER_65) AS DRIVERS_OVER_65
        ,SUM(DRIVERS_UNDER_25) AS DRIVERS_UNDER_25
        ,SUM(PEDS_OVER_65) AS PEDS_OVER_65
        ,SUM(PEDS_UNDER_12) AS PEDS_UNDER_12
        ,SUM(BIKERS_OVER_65) AS BIKERS_OVER_65
        ,SUM(BIKERS_UNDER_18) AS BIKERS_UNDER_18
        ,SUM(OOS_VEHICLES) AS OOS_VEHICLES
        ,SUM(NUM_CARS) AS NUM_CARS
        ,SUM(NUM_SUVS_OR_TRUCKS) AS NUM_SUVS_OR_TRUCKS
        ,SUM(PED_INJURIES) AS PEDESTRIAN_INJURIES
        ,SUM(BICYCLE_INJURIES) AS BICYCLE_INJURIES
        ,SUM(VEHICLE_INJURIES) AS VEHICLE_INJURIES
        ,SUM(PED_FATALITIES) AS PEDESTRIAN_FATALITIES
        ,SUM(BICYCLE_FATALITIES) AS BICYCLE_FATALITIES
        ,SUM(VEHICLE_FATALITIES) AS VEHICLE_FATALITIES
        ,SUM(DRIVER_TICKETS) AS DRIVER_TICKETS
        ,SUM(DRIVERS_SPEEDING) AS DRIVERS_SPEEDING
        ,SUM(DRIVERS_IMPAIRED) AS DRIVERS_IMPAIRED
        ,SUM(BICYCLE_TICKETS) AS BICYCLE_TICKETS
        ,SUM(PED_TICKETS) AS PED_TICKETS
        ,SUM(TOTAL_INJURIES) AS TOTAL_INJURIES
        ,SUM(TOTAL_MAJOR_INJURIES) AS TOTAL_MAJOR_INJURIES
        ,SUM(TOTAL_MINOR_INJURIES) AS TOTAL_MINOR_INJURIES
        ,SUM(TOTAL_VEHICLES) AS TOTAL_VEHICLES
        ,SUM(TOTAL_PEDESTRIANS) AS TOTAL_PEDESTRIANS
        ,SUM(TOTAL_BICYCLISTS) AS TOTAL_BICYCLISTS
        ,ARRAY_AGG(PERSONTYPE) AS PERSONTYPE_ARRAY
        ,ARRAY_AGG(INVEHICLETYPE) AS INVEHICLETYPE_ARRAY
        ,ARRAY_AGG(LICENSEPLATESTATE) AS LICENSEPLATESTATE_ARRAY
        FROM tmp.crash_details
        GROUP BY CRIMEID
    );
    create index crime_id on tmp.crash_details_agg (crimeid);
    """

    # Take the greater of the raw-table injury counts and the aggregated
    # crash-details counts for each crash record
    join_query = """
    DROP TABLE IF EXISTS tmp.crashes_join;
    CREATE TABLE tmp.crashes_join AS (
        SELECT a.OBJECTID
        ,a.CRIMEID
        ,a.REPORTDATE
        ,a.FROMDATE
        ,a.TODATE
        ,a.ADDRESS
        ,a.mpdlatitude
        ,a.mpdlongitude
        ,CASE WHEN b.CRIMEID IS NULL OR b.BICYCLE_INJURIES < (a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
            THEN (a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
            ELSE b.BICYCLE_INJURIES END AS BICYCLE_INJURIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.VEHICLE_INJURIES < (a.MAJORINJURIES_DRIVER + a.MINORINJURIES_DRIVER + a.UNKNOWNINJURIES_DRIVER + a.MAJORINJURIESPASSENGER + a.MINORINJURIESPASSENGER + a.UNKNOWNINJURIESPASSENGER)
            THEN (a.MAJORINJURIES_DRIVER + a.MINORINJURIES_DRIVER + a.UNKNOWNINJURIES_DRIVER + a.MAJORINJURIESPASSENGER + a.MINORINJURIESPASSENGER + a.UNKNOWNINJURIESPASSENGER)
            ELSE b.VEHICLE_INJURIES END AS VEHICLE_INJURIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.PEDESTRIAN_INJURIES < (a.MAJORINJURIES_PEDESTRIAN + a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN)
            THEN (a.MAJORINJURIES_PEDESTRIAN + a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN)
            ELSE b.PEDESTRIAN_INJURIES END AS PEDESTRIAN_INJURIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_INJURIES < (a.MAJORINJURIES_PEDESTRIAN + a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                + a.MAJORINJURIES_DRIVER + a.MINORINJURIES_DRIVER + a.UNKNOWNINJURIES_DRIVER + a.MAJORINJURIESPASSENGER + a.MINORINJURIESPASSENGER + a.UNKNOWNINJURIESPASSENGER
                + a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
            THEN (a.MAJORINJURIES_PEDESTRIAN + a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                + a.MAJORINJURIES_DRIVER + a.MINORINJURIES_DRIVER + a.UNKNOWNINJURIES_DRIVER + a.MAJORINJURIESPASSENGER + a.MINORINJURIESPASSENGER + a.UNKNOWNINJURIESPASSENGER
                + a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
            ELSE b.TOTAL_INJURIES END AS TOTAL_INJURIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_MAJOR_INJURIES < (a.MAJORINJURIES_PEDESTRIAN + a.MAJORINJURIES_DRIVER + a.MAJORINJURIESPASSENGER + a.MAJORINJURIES_BICYCLIST)
            THEN (a.MAJORINJURIES_PEDESTRIAN + a.MAJORINJURIES_DRIVER + a.MAJORINJURIESPASSENGER + a.MAJORINJURIES_BICYCLIST)
            ELSE b.TOTAL_MAJOR_INJURIES END AS TOTAL_MAJOR_INJURIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_MINOR_INJURIES < (a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                + a.MINORINJURIES_DRIVER + a.UNKNOWNINJURIES_DRIVER + a.MINORINJURIESPASSENGER + a.UNKNOWNINJURIESPASSENGER
                + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
            THEN (a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                + a.MINORINJURIES_DRIVER + a.UNKNOWNINJURIES_DRIVER + a.MINORINJURIESPASSENGER + a.UNKNOWNINJURIESPASSENGER
                + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
            ELSE b.TOTAL_MINOR_INJURIES END AS TOTAL_MINOR_INJURIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.BICYCLE_FATALITIES < a.FATAL_BICYCLIST THEN a.FATAL_BICYCLIST ELSE b.BICYCLE_FATALITIES END AS BICYCLE_FATALITIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.PEDESTRIAN_FATALITIES < a.FATAL_PEDESTRIAN THEN a.FATAL_PEDESTRIAN ELSE b.PEDESTRIAN_FATALITIES END AS PEDESTRIAN_FATALITIES
        ,CASE WHEN b.CRIMEID IS NULL OR b.VEHICLE_FATALITIES < (a.FATAL_DRIVER + a.FATALPASSENGER) THEN (a.FATAL_DRIVER + a.FATALPASSENGER) ELSE b.VEHICLE_FATALITIES END AS VEHICLE_FATALITIES
        ,CASE WHEN b.CRIMEID IS NULL or b.DRIVERS_IMPAIRED < a.DRIVERSIMPAIRED THEN a.DRIVERSIMPAIRED ELSE b.DRIVERS_IMPAIRED END AS DRIVERS_IMPAIRED
        ,CASE WHEN b.CRIMEID IS NULL or b.DRIVERS_SPEEDING < a.SPEEDING_INVOLVED THEN a.SPEEDING_INVOLVED ELSE b.DRIVERS_SPEEDING END AS DRIVERS_SPEEDING
        ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_VEHICLES < a.TOTAL_VEHICLES THEN a.TOTAL_VEHICLES ELSE b.TOTAL_VEHICLES END AS TOTAL_VEHICLES
        ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_BICYCLISTS < a.TOTAL_BICYCLES THEN a.TOTAL_BICYCLES ELSE b.TOTAL_BICYCLISTS END AS TOTAL_BICYCLISTS
        ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_PEDESTRIANS < a.TOTAL_PEDESTRIANS THEN a.TOTAL_PEDESTRIANS ELSE b.TOTAL_PEDESTRIANS END AS TOTAL_PEDESTRIANS
        ,b.DRIVERS_OVER_65
        ,b.DRIVERS_UNDER_25
        ,b.PEDS_OVER_65
        ,b.PEDS_UNDER_12
        ,b.BIKERS_OVER_65
        ,b.BIKERS_UNDER_18
        ,b.OOS_VEHICLES
        ,b.NUM_CARS
        ,b.NUM_SUVS_OR_TRUCKS
        ,b.DRIVER_TICKETS
        ,b.BICYCLE_TICKETS
        ,b.PED_TICKETS
        ,b.PERSONTYPE_ARRAY
        ,b.INVEHICLETYPE_ARRAY
        ,b.LICENSEPLATESTATE_ARRAY
        ,a.INTAPPROACHDIRECTION
        ,a.LOCATIONERROR
        ,a.LASTUPDATEDATE
        ,a.BLOCKKEY
        ,a.SUBBLOCKKEY
        ,ST_Force2D(a.geography::geometry) as geography
        FROM source_data.crashes_raw a
        LEFT JOIN tmp.crash_details_agg b on a.CRIMEID = b.CRIMEID
        WHERE date_part('year', a.fromdate) >= 2015
    );
    CREATE INDEX crashes_geom_idx ON tmp.crashes_join USING GIST (geography);
    """

    # join in the pulsepoint info
    pulsepoint_join_query = """
    DROP TABLE IF EXISTS tmp.crash_pulsepoint_join;
    CREATE TABLE tmp.crash_pulsepoint_join AS (
        SELECT * FROM (
            SELECT DISTINCT a.*
            ,b.Agency_Incident_ID as pp_agency_incident_id
            ,b.unit_status_transport as pp_total_injuries
            ,b.transport_unit_is_amr as pp_total_minor_injuries
            ,b.transport_unit_is_non_amr as pp_total_major_injuries
            ,Row_Number() over (partition by a.objectid order by ST_Distance(a.geography, b.geography)) as PP_Call_Distance_Rank
            ,Row_Number() over (partition by a.objectid order by (a.reportdate at time zone 'America/New_York') - (b.CALL_RECEIVED_DATETIME at time zone 'America/New_York')) as PP_Call_Time_Rank
            FROM tmp.crashes_join a
            LEFT JOIN analysis_data.pulsepoint b on ST_DWITHIN(a.geography, b.geography, 200)
                AND cast(fromdate as date) = cast((call_received_datetime at time zone 'America/New_York') as date)
                AND (b.CALL_RECEIVED_DATETIME at time zone 'America/New_York') < (a.reportdate at time zone 'America/New_York')
        ) tmp
        WHERE PP_Call_Distance_Rank = 1
    );
    CREATE INDEX IF NOT EXISTS crash_pulsepoint_join_geom_idx ON tmp.crash_pulsepoint_join USING GIST (geography);
    alter table tmp.crash_pulsepoint_join drop column PP_Call_Distance_Rank;
    """

    # First execute the table-specific queries
    engine.execute(add_columns_query)
    print("add columns query complete")
    engine.execute(group_by_query)
    print("group by query complete")
    engine.execute(join_query)
    print("join query complete")
    engine.execute(pulsepoint_join_query)
    print("pulsepoint join query complete")

    # Then execute the same location-info queries (roadway, schools,
    # neighborhoods) that apply to all analysis tables and create the final table
    next_tables = add_location_info(engine=engine,
                                    target_schema='tmp',
                                    target_table='crashes_nbh_ward',
                                    from_schema='tmp',
                                    from_table='crash_pulsepoint_join',
                                    partition_by_field='objectid')
    print("neighborhood-ward query complete")
    next_tables = add_school_info(engine=engine,
                                  target_schema='tmp',
                                  target_table='crashes_schools',
                                  from_schema=next_tables[0],
                                  from_table=next_tables[1])
    print("schools query complete")
    next_tables = add_walkscore_info(engine=engine,
                                     target_schema='tmp',
                                     target_table='crashes_walkscore',
                                     from_schema=next_tables[0],
                                     from_table=next_tables[1])
    print("walkscore query complete")
    next_tables = add_roadway_info(engine=engine,
                                   target_schema='tmp',
                                   target_table='crashes_roadway_info',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1],
                                   partition_by_field='objectid',
                                   within_distance=0.001)
    print("roadway info query complete")
    next_tables = add_intersection_info(
        engine=engine,
        target_schema='tmp',
        target_table='crashes_intersection_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='objectid',
        within_distance=10)
    print("intersection info query complete")
    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='dc_crashes_w_details',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)
def generate_pulsepoint_analysis_table(AWS_Credentials: dict, **kwargs):
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # flag that some records might be duplicate calls for the same incident
    dupe_check_query = """
    DROP TABLE IF EXISTS tmp.pulsepoint_dupe_check;
    CREATE TABLE tmp.pulsepoint_dupe_check AS (
        SELECT DISTINCT a.*
        ,case when b.incident_id is null then 1
              when a.num_units_responding = 0 and b.num_units_responding > 0 then 0
              when b.unit_status_transport > a.unit_status_transport then 0
              when b.num_units_responding > a.num_units_responding then 0
              when b.call_received_datetime < a.call_received_datetime then 0
              else 1 end as KEEP_RECORD_FLAG
        FROM source_data.pulsepoint a
        LEFT JOIN source_data.pulsepoint b on a.incident_id <> b.incident_id
            and date_part('day', a.call_received_datetime - b.call_received_datetime) = 0
            and date_part('hour', a.call_received_datetime - b.call_received_datetime) = 0
            and date_part('month', a.call_received_datetime - b.call_received_datetime) = 0
            and abs(date_part('minute', a.call_received_datetime - b.call_received_datetime)) <= 20
            and ST_DWithin(a.geography, b.geography, 100)
            and a.Agency_ID = b.Agency_ID
            and (a.num_units_responding = 0 or a.unit_ids && b.unit_ids)
    );
    CREATE INDEX IF NOT EXISTS pulsepoint_dupe_check_geom_idx ON tmp.pulsepoint_dupe_check USING GIST (geography);
    """

    # then join to the crashes table
    crashes_join_query = """
    DROP TABLE IF EXISTS tmp.pulsepoint_crash_join;
    CREATE TABLE tmp.pulsepoint_crash_join AS (
        SELECT * FROM (
            SELECT DISTINCT a.*
            ,concat(a.agency_id, a.incident_id) as Agency_Incident_ID
            ,b.objectid as Crash_Objectid
            ,b.geography as Crash_Geo
            ,b.total_injuries as Crash_Total_Injuries
            ,b.total_major_injuries as Crash_Total_Major_Injuries
            ,b.total_minor_injuries as Crash_Total_Minor_Injuries
            ,(b.bicycle_fatalities + b.pedestrian_fatalities + b.vehicle_fatalities) as Crash_Total_Fatalities
            ,b.bicycle_injuries as Crash_Bike_Injuries
            ,b.vehicle_injuries as Crash_Car_Injuries
            ,b.pedestrian_injuries as Crash_Ped_Injuries
            ,case when b.total_injuries is null or b.total_injuries < a.unit_status_transport then 1 else 0 end as injuries_mismatch
            ,ST_Distance(a.geography, b.geography) as Distance_To_Crash
            ,(b.reportdate at time zone 'America/New_York') - (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York') as Time_Between_Crash_And_Report
            ,b.intersectionid as Crash_Intersection_ID
            ,b.block_objectid as Crash_Block_Objectid
            ,Row_Number() over (partition by a.incident_id, a.agency_id order by ST_Distance(a.geography, b.geography)) as Crash_Distance_Rank
            ,Row_Number() over (partition by a.incident_id, a.agency_id order by (b.reportdate at time zone 'America/New_York') - (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York')) as Crash_Time_Rank
            FROM tmp.pulsepoint_dupe_check a
            LEFT JOIN analysis_data.dc_crashes_w_details b on ST_DWITHIN(a.geography, b.geography, 200)
                AND cast(b.fromdate as date) = cast((call_received_datetime at time zone 'America/New_York') as date)
                AND (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York') < (b.reportdate at time zone 'America/New_York')
            WHERE a.KEEP_RECORD_FLAG = 1
        ) tmp
        WHERE Crash_Distance_Rank = 1
            and (incident_type in ('TC', 'TCE', 'TCS')
                 or (agency_id = '16000' and incident_type in ('TC', 'TCS', 'TCE', 'RES')))
    );
    CREATE INDEX IF NOT EXISTS pulsepoint_crash_join_geom_idx ON tmp.pulsepoint_crash_join USING GIST (geography);
    alter table tmp.pulsepoint_crash_join drop column KEEP_RECORD_FLAG;
    alter table tmp.pulsepoint_crash_join drop column Crash_Distance_Rank;
    """

    # First execute the table-specific queries
    engine.execute(dupe_check_query)
    print("dupe check query complete")
    engine.execute(crashes_join_query)
    print("join to crashes query complete")

    # Then execute the same location-info queries (roadway, schools,
    # neighborhoods) that apply to all analysis tables and create the final table
    next_tables = add_location_info(engine=engine,
                                    target_schema='tmp',
                                    target_table='pulsepoint_nbh_ward',
                                    from_schema='tmp',
                                    from_table='pulsepoint_crash_join',
                                    partition_by_field='Agency_Incident_ID')
    print("neighborhood-ward query complete")
    next_tables = add_school_info(engine=engine,
                                  target_schema='tmp',
                                  target_table='pulsepoint_schools',
                                  from_schema=next_tables[0],
                                  from_table=next_tables[1])
    print("schools query complete")
    next_tables = add_walkscore_info(engine=engine,
                                     target_schema='tmp',
                                     target_table='pulsepoint_walkscore',
                                     from_schema=next_tables[0],
                                     from_table=next_tables[1])
    print("walkscore query complete")
    next_tables = add_roadway_info(engine=engine,
                                   target_schema='tmp',
                                   target_table='pulsepoint_roadway_info',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1],
                                   partition_by_field='Agency_Incident_ID',
                                   within_distance=100)
    print("roadway info query complete")
    next_tables = add_intersection_info(
        engine=engine,
        target_schema='tmp',
        target_table='pulsepoint_intersection_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='Agency_Incident_ID',
        within_distance=60)
    print("intersection info query complete")
    next_tables = is_national_park(engine=engine,
                                   target_schema='tmp',
                                   target_table='pulsepoint_national_park',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("national parks info query complete")
    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='pulsepoint',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)
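# Sketch invocation. Note the ordering dependency: this builder joins against
# analysis_data.dc_crashes_w_details, so generate_crashes_table should have
# run first.
if __name__ == "__main__":
    generate_pulsepoint_analysis_table(
        AWS_Credentials=get_connection_strings("AWS_DEV"), env='DEV')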
def refresh_test_db(env: str):
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env.upper())
    db_credentials = get_connection_strings("AWS_PostGIS")
    db_users = db_credentials[env.upper()]['USERS']
    prod_db_host = db_credentials['PROD']['HOST']
    prod_db_port = db_credentials['PROD']['PORT']
    prod_db_name = db_credentials['PROD']['DB']
    prod_engine = create_postgres_engine(destination="AWS_PostGIS", env="PROD")

    create_fdw_query = """
    BEGIN;
    CREATE EXTENSION IF NOT EXISTS postgres_fdw;
    DROP SERVER IF EXISTS prod CASCADE;
    CREATE SERVER prod FOREIGN DATA WRAPPER postgres_fdw
        OPTIONS (host '{prod_db_host}', dbname '{prod_db_name}');
    COMMIT;
    """.format(prod_db_name=prod_db_name, prod_db_host=prod_db_host)
    engine.execute(create_fdw_query)

    # create user mappings
    for user_pwd in db_users:
        user = list(user_pwd.keys())[0]
        pwd = user_pwd[user]
        map_user_query = """
        CREATE USER MAPPING FOR {user}
            SERVER prod
            OPTIONS (user '{user}', password '{pwd}');
        """.format(user=user, pwd=pwd)
        engine.execute(map_user_query)

    # pull the schemas off the viz copy of the prod database
    schemas = [('prod_' + r, r)[::-1] for (r, ) in []] or [
        (r, 'prod_' + r) for (r, ) in prod_engine.execute(
            "select distinct table_schema from information_schema.tables where is_insertable_into = 'YES' and table_schema not like 'pg_%%'"
        ).fetchall()
    ]

    # map schemas
    for source_schema, destination_schema in schemas:
        create_schema_query = """
        CREATE SCHEMA IF NOT EXISTS {destination_schema};
        GRANT ALL PRIVILEGES ON SCHEMA {destination_schema} TO PUBLIC;
        IMPORT FOREIGN SCHEMA {source_schema} FROM SERVER prod INTO {destination_schema};
        """.format(source_schema=source_schema,
                   destination_schema=destination_schema)
        engine.execute(create_schema_query)

    # pull all the tables from prod db
    schemas_tables = [(schema, table) for (schema, table) in prod_engine.execute(
        "select distinct table_schema,table_name from information_schema.tables where is_insertable_into = 'YES' and table_schema not like 'pg_%%' and table_name not like '[%%]'"
    ).fetchall()]

    # create and populate tables
    for schema, table in schemas_tables:
        create_populate_tables_query = """
        CREATE TABLE IF NOT EXISTS {schema}."{table}" (LIKE prod_{schema}."{table}");
        DELETE FROM {schema}."{table}";
        INSERT INTO {schema}."{table}" SELECT * FROM prod_{schema}."{table}";
        GRANT ALL PRIVILEGES ON {schema}."{table}" TO PUBLIC;
        """.format(schema=schema, table=table)
        print(create_populate_tables_query)
        engine.execute(create_populate_tables_query)
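# Sketch: refresh the test database from prod over the foreign data wrapper.
# The 'TEST' environment key is an assumption; use whichever key exists in
# your connection-strings file alongside 'PROD'.
if __name__ == "__main__":
    refresh_test_db(env='TEST')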
def create_test_db(env: str, test_db_name: str):
    # connect to the prod db
    engine = create_postgres_engine(destination="AWS_PostGIS", env="PROD")
    db_credentials = get_connection_strings("AWS_PostGIS")

    # get prod master credentials
    prod_db_host = db_credentials['PROD']['HOST']
    prod_db_port = db_credentials['PROD']['PORT']
    prod_db_name = db_credentials['PROD']['DB']
    prod_db_uid = db_credentials['PROD']['UID']
    prod_db_pwd = db_credentials['PROD']['PWD']

    # get test db credentials (note: this overrides the test_db_name argument
    # with the value from the credentials file)
    test_db_host = db_credentials[env.upper()]['HOST']
    test_db_port = db_credentials[env.upper()]['PORT']
    test_db_name = db_credentials[env.upper()]['DB']
    test_db_uid = db_credentials[env.upper()]['UID']
    test_db_pwd = db_credentials[env.upper()]['PWD']
    test_db_users = db_credentials[env.upper()]['USERS']

    kill_db_query = """
    SELECT pg_terminate_backend (pid)
    FROM pg_stat_activity
    WHERE pg_stat_activity.datname = '{0}';
    """.format(test_db_name)

    # kill any open connections to the test db
    engine.execute(kill_db_query)

    # drop
    command = 'DROP DATABASE IF EXISTS {}'.format(test_db_name)
    os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
        prod_db_pwd, prod_db_host, prod_db_port, prod_db_uid, prod_db_name,
        command)
    os.system(os_system_arg)

    # create
    command = 'CREATE DATABASE {}'.format(test_db_name)
    os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
        prod_db_pwd, prod_db_host, prod_db_port, prod_db_uid, prod_db_name,
        command)
    os.system(os_system_arg)

    # create users on new db
    for user_pwd in test_db_users:
        user = list(user_pwd.keys())[0]
        pwd = user_pwd[user]
        command = 'CREATE ROLE {} WITH LOGIN ENCRYPTED PASSWORD \'{}\';'.format(
            user, pwd)
        os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
            test_db_pwd, test_db_host, test_db_port, test_db_uid,
            test_db_name, command)
        print(os_system_arg)
        os.system(os_system_arg)

    # install PostGIS extensions
    command = """
    CREATE EXTENSION postgis;
    CREATE EXTENSION fuzzystrmatch;
    CREATE EXTENSION postgis_tiger_geocoder;
    CREATE EXTENSION postgis_topology;
    """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # alter schemas
    command = """
    ALTER SCHEMA tiger OWNER TO rds_superuser;
    ALTER SCHEMA tiger_data OWNER TO rds_superuser;
    ALTER SCHEMA topology OWNER TO rds_superuser;
    """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # create the exec() helper function (the \$ escapes keep the shell from
    # expanding the dollar-quoted string inside the double-quoted --command)
    command = 'CREATE FUNCTION exec(text) returns text language plpgsql volatile AS \$f\$ BEGIN EXECUTE \$1; RETURN \$1; END; \$f\$;'
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # execute the function: hand ownership of the tiger/topology objects to rds_superuser
    command = """
    SELECT exec('ALTER TABLE ' || quote_ident(s.nspname) || '.' || quote_ident(s.relname) || ' OWNER TO rds_superuser;')
    FROM (
        SELECT nspname, relname
        FROM pg_class c JOIN pg_namespace n ON (c.relnamespace = n.oid)
        WHERE nspname in ('tiger','topology') AND relkind IN ('r','S','v')
        ORDER BY relkind = 'S'
    ) s;
    """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # get all schemas on prod db
    schemas = [
        r for (r, ) in engine.execute(
            "select distinct table_schema from information_schema.tables where is_insertable_into = 'YES' and table_schema not like 'pg_%%'"
        ).fetchall()
    ]

    # create engine on test db
    test_engine = create_postgres_engine(destination="AWS_PostGIS",
                                         env=env.upper())

    # create schemas
    for schema in schemas:
        create_schema_query = """
        CREATE SCHEMA IF NOT EXISTS {0};
        GRANT ALL PRIVILEGES ON SCHEMA {0} TO PUBLIC;
        """.format(schema)
        test_engine.execute(create_schema_query)
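# Sketch: stand up a fresh test database and then copy prod data into it.
# The env key and database name are illustrative assumptions (and the
# function currently takes test_db_name from the credentials file anyway).
if __name__ == "__main__":
    create_test_db(env='TEST', test_db_name='dc_crash_bot_test')
    refresh_test_db(env='TEST')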
def generate_pulsepoint_table(AWS_Credentials: dict, **kwargs):
    # assign optional arguments
    target_schema = kwargs.get('target_schema', None)
    if target_schema is None:
        target_schema = 'source_data'
    target_table = kwargs.get('target_table', None)
    if target_table is None:
        target_table = 'pulsepoint'
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # keep only the most recent scrape of each incident, rolling the transport
    # flags up across all scrapes of that incident
    step1_query = """
    DROP TABLE IF EXISTS tmp_pulsepoint;
    CREATE TEMP TABLE tmp_pulsepoint ON COMMIT PRESERVE ROWS AS (
        SELECT Agency_ID
            ,Incident_ID
            ,Scrape_Datetime
            ,CALL_RECEIVED_DATETIME
            ,CALL_Closed_DATETIME
            ,FullDisplayAddress
            ,longitude
            ,latitude
            ,Incident_Type
            ,Unit
            ,Unit_Status_Transport
            ,Transport_Unit_Is_AMR
            ,Transport_Unit_Is_Non_AMR
            ,Unit_JSON
            ,Num_Units_Responding
            ,geography
        FROM (
            SELECT Agency_ID
                ,Incident_ID
                ,Scrape_Datetime
                ,CALL_RECEIVED_DATETIME
                ,CALL_Closed_DATETIME
                ,FullDisplayAddress
                ,longitude
                ,latitude
                ,Incident_Type
                ,Unit
                ,MAX(Unit_Status_Transport) over (Partition by Agency_ID, Incident_ID) as Unit_Status_Transport
                ,MAX(Transport_Unit_Is_AMR) over (Partition by Agency_ID, Incident_ID) as Transport_Unit_Is_AMR
                ,MAX(Transport_Unit_Is_Non_AMR) over (Partition by Agency_ID, Incident_ID) as Transport_Unit_Is_Non_AMR
                ,ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as geography
                ,cast(replace(unit,'''','"') as jsonb) as Unit_JSON
                ,JSONB_ARRAY_LENGTH(cast(replace(unit,'''','"') as jsonb)::jsonb) as Num_Units_Responding
                ,ROW_NUMBER() over (Partition by Agency_ID, Incident_ID order by Scrape_Datetime DESC) as Time_Rank
            FROM source_data.pulsepoint_stream
        ) AS tmp
        WHERE Time_Rank = 1
    ) WITH DATA;
    """

    step_2_query = """
    DROP TABLE IF EXISTS tmp_pulsepoint_units;
    CREATE TEMP TABLE tmp_pulsepoint_units ON COMMIT PRESERVE ROWS AS (
        SELECT incident_id, Agency_ID, array_agg((Units#>'{{UnitID}}')::text) as Unit_IDs
        FROM tmp_pulsepoint
        CROSS JOIN json_array_elements(unit_json::json) as Units
        GROUP BY incident_id, Agency_ID
    ) WITH DATA;
    """

    step_3_query = """
    DROP TABLE IF EXISTS tmp_pulsepoint_units_join;
    CREATE TEMP TABLE tmp_pulsepoint_units_join ON COMMIT PRESERVE ROWS AS (
        SELECT DISTINCT a.*, b.Unit_IDs
        FROM tmp_pulsepoint a
        LEFT JOIN tmp_pulsepoint_units b on a.incident_id = b.incident_id
            and a.agency_id = b.agency_id
    ) WITH DATA;
    """

    final_query = """
    DROP TABLE IF EXISTS {0}.{1};
    CREATE TABLE {0}.{1} AS
        SELECT * FROM tmp_pulsepoint_units_join;
    GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC;
    """.format(target_schema, target_table)

    engine.execute(step1_query)
    engine.execute(step_2_query)
    engine.execute(step_3_query)
    engine.execute(final_query)
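# Sketch: rebuild the deduplicated pulsepoint source table in DEV before the
# downstream analysis-table build runs; both kwargs default as shown.
if __name__ == "__main__":
    generate_pulsepoint_table(
        AWS_Credentials=get_connection_strings("AWS_DEV"),
        target_schema='source_data',
        target_table='pulsepoint',
        env='DEV')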
def extract_twitter_json(target_schema: str, source_table: str,
                         target_table: str, AWS_Credentials: dict, **kwargs):
    # assign optional arguments
    source_schema = kwargs.get('source_schema', None)
    if source_schema is None:
        source_schema = 'stg'
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # extract the json info; decode '&amp;' back to '&' and strip percent
    # signs (which would otherwise collide with paramstyle escaping)
    step1_query = """
    DROP TABLE IF EXISTS tmp.twitter;
    CREATE TABLE tmp.twitter AS (
        WITH results AS (
            SELECT jsonb_array_elements(test.tweets) AS data, source_file, load_datetime
            FROM (
                SELECT data->'data' AS tweets, source_file, load_datetime
                FROM {}."{}"
            ) AS test
        )
        SELECT (data->'created_at')::varchar::timestamptz AS created_at
            ,(data->'id_str')::varchar AS tweet_id
            ,(data->'user'->'id_str')::varchar AS user_id
            ,CASE WHEN (data ? 'retweeted_status') THEN (data->'retweeted_status'->'id_str')::varchar END AS retweeted_status_id
            ,CASE WHEN (data->'in_reply_to_status_id_str')::varchar <> 'null' THEN (data->'in_reply_to_status_id_str')::varchar END AS in_reply_to_status_id
            ,REPLACE(REPLACE(CASE WHEN (data ? 'full_text') THEN (data->'full_text')::varchar
                                  WHEN (data ? 'text') THEN (data->'text')::varchar END,
                             '&amp;', '&'), '%%', 'percent') AS tweet_text
            ,source_file
            ,load_datetime
            ,data
        FROM results
    );
    """.format(source_schema, source_table)
    engine.execute(step1_query)

    # geocode records
    records = [
        r for (r, ) in engine.execute(
            "select distinct tweet_text from tmp.twitter").fetchall()
    ]
    print(len(records), " records passed to geocode function")
    geocode_text(engine=engine,
                 records_to_geocode=records,
                 administrative_area='District of Columbia',
                 text_type='Tweet')

    # join the geocoded text back into the main table
    step_2_query = """
    DROP TABLE IF EXISTS tmp.twitter_geocode;
    CREATE TABLE tmp.twitter_geocode AS (
        SELECT DISTINCT a.*
            ,b.point_type
            ,b.point_geography
            ,b.polygon_geography
        FROM tmp.twitter a
        LEFT JOIN source_data.geocoded_text b on a.tweet_text = b.text
    );
    """

    final_query = """
    CREATE TABLE IF NOT EXISTS {0}.{1} (LIKE tmp.twitter_geocode);
    INSERT INTO {0}.{1}
        SELECT * FROM tmp.twitter_geocode;
    GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC;
    """.format(target_schema, target_table)

    engine.execute(step_2_query)
    engine.execute(final_query)

    count_query = 'SELECT COUNT(*) FROM {}.{} WHERE source_file like \'%%{}%%\''.format(
        target_schema, target_table, source_table)
    row_count = engine.execute(count_query).fetchone()[0]
    print("{} rows inserted into final table from file {}".format(
        str(row_count), source_table))

    drop_table_query = 'DROP TABLE IF EXISTS {}."{}"'.format(
        source_schema, source_table)
    engine.execute(drop_table_query)
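# Hedged driver sketch, mirroring the scanner-audio driver elsewhere in the
# repo: discover staged tweet tables and extract each one. The '%%tweets%%'
# name filter and the 'twitter_stream' target table are assumptions about
# naming, not confirmed by this file.
if __name__ == "__main__":
    engine = create_postgres_engine(destination="AWS_PostGIS", env='DEV')
    staged_tables = [
        r for (r, ) in engine.execute(
            "select distinct table_name from information_schema.tables "
            "where table_schema = 'stg' and table_name like '%%tweets%%'")
    ]
    for table in staged_tables:
        extract_twitter_json(source_table=table,
                             target_table='twitter_stream',  # assumed target
                             target_schema='source_data',
                             AWS_Credentials=get_connection_strings("AWS_DEV"),
                             env='DEV')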
import geopandas as gpd
import pandas as pd
import boto3
import os
from pathlib import Path
from connect_to_rds import get_connection_strings, create_postgres_engine

# set up S3 connection
AWS_Credentials = get_connection_strings("AWS_DEV")
s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
bucket_name = AWS_Credentials['s3_bucket']
region = AWS_Credentials['region']
engine = create_postgres_engine("AWS_PostGIS", "postgres", "DEV")

resources = {
    'crashes_raw': {
        'url': 'https://opendata.arcgis.com/datasets/70392a096a8e431381f1f692aaa06afd_24.geojson',
        'prefix': 'source-data/dc-open-data/',
        'metadata': {
            'target_schema': 'source_data',
            "dataset_info": "https://opendata.dc.gov/datasets/crashes-in-dc"
        }
    },
    'crash_details': {
        'url': 'https://opendata.arcgis.com/datasets/70248b73c20f46b0a5ee895fc91d6222_25.geojson',
        'prefix': 'source-data/dc-open-data/',
def csv_to_postGIS(folder_to_load: str, AWS_Credentials: dict, **kwargs):
    # assign optional arguments
    target_schema = kwargs.get('target_schema', None)
    if target_schema is None:
        target_schema = 'stg'
    move_to_folder = kwargs.get('move_to_folder', None)
    clean_columns = kwargs.get('clean_columns', None)
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # list of all loaded tables
    tables_created = []

    # set up RDS and S3 connections, engines, cursors
    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    bucket_name = AWS_Credentials['s3_bucket']
    bucket = s3_resource.Bucket(bucket_name)
    client = boto3.client(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    region = AWS_Credentials['region']
    connection = create_psycopg2_connection(destination="AWS_PostGIS", env=env)
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    files_to_load = [
        obj.key for obj in bucket.objects.filter(Prefix=folder_to_load)
        if '.csv' in obj.key
    ]

    for object_key in files_to_load:
        # get base file name to use as table name
        stg_tble = os.path.basename(object_key)

        # pull the header row from the target csv to create the columns
        sql_stmt = """SELECT S.* FROM s3object S LIMIT 1"""
        req = client.select_object_content(
            Bucket=bucket_name,
            Key=object_key,
            ExpressionType='SQL',
            Expression=sql_stmt,
            InputSerialization={
                'CSV': {
                    'FileHeaderInfo': 'NONE',
                    'AllowQuotedRecordDelimiter': True
                }
            },
            OutputSerialization={'CSV': {}},
        )

        # format csv headers into a list
        for event in req['Payload']:
            if 'Records' in event:
                file_str = ''.join(
                    event['Records']['Payload'].decode('utf-8')).lower()
                columns_list = file_str.split(',')

        # and then make a column-create string out of them
        create_columns_statement = ''
        for column in columns_list:
            create_columns_statement += ',"' + column.replace(
                '\n', '').lower() + '" VARCHAR NULL'

        # generate create table shell script
        create_table_query = """
        DROP TABLE IF EXISTS {0}."{1}";
        CREATE TABLE IF NOT EXISTS {0}."{1}" (
            LOAD_DATETIME TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
            ,source_file varchar DEFAULT '{2}'
            {3}
        );
        """.format(target_schema, stg_tble, object_key,
                   create_columns_statement)

        # create the table
        with connection.cursor() as cursor:
            cursor.execute(create_table_query)
            connection.commit()

        # and then execute the query to fix the column names, if that
        # parameter was passed
        if clean_columns is not None and clean_columns.lower() == 'yes':
            fix_column_names_query = """
            SELECT replace(replace(SQL_Statement::varchar, '";', ';'), 'u_"', 'u_')
            FROM (
                SELECT FORMAT(
                    'ALTER TABLE %%I.%%I.%%I RENAME %%I to u_%%I;',
                    table_catalog, table_schema, table_name, column_name,
                    lower(
                        regexp_replace(
                            replace(replace(replace(replace(replace(replace(replace(column_name, ' ', '_'),'?',''),'/','_'),'&', 'and'),'(',''),')',''),'"','')
                            ,'([[:lower:]])([[:upper:]])', '\\1_\\2', 'xg'
                        )
                    )
                ) AS SQL_Statement
                FROM information_schema.columns
                WHERE table_name = '{}'
                    and lower(column_name) != 'load_datetime'
                    and lower(column_name) != 'source_file'
            ) AS tmp;
            """.format(stg_tble)
            list_of_statements = [
                r for (r, ) in engine.execute(fix_column_names_query).fetchall()
            ]
            for statement in list_of_statements:
                engine.execute(statement)

        # get the updated list of corrected column names to pass to the copy
        # columns parameter
        get_updated_columns_query = """
        SELECT COLUMN_NAME
        FROM INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = '{0}' AND TABLE_NAME = '{1}'
        """.format(target_schema, stg_tble)
        # put column names of source table in list
        columns_list = [
            r for (r, ) in engine.execute(get_updated_columns_query).fetchall()
            if 'load_datetime' not in r.lower()
            if 'source_file' not in r.lower()
        ]
        file_str = ','.join(columns_list).lower()

        # load the target csv into memory and re-serialize it without headers
        f = client.get_object(Bucket=bucket_name, Key=object_key)
        f2 = pd.read_csv(f['Body'])
        buffer = io.StringIO()
        f2.to_csv(buffer, index=False, header=False)
        buffer.seek(0)

        # copy the file into the table
        with connection.cursor() as cursor:
            cursor.copy_expert(
                ' COPY {0}."{1}" ({2}) FROM STDIN WITH CSV; '.format(
                    target_schema, stg_tble, file_str), buffer)
            connection.commit()

        if move_to_folder is not None:
            s3_resource.Object(bucket_name,
                               move_to_folder + stg_tble).copy_from(
                                   CopySource={'Bucket': bucket_name,
                                               'Key': object_key})
            s3_resource.Object(bucket_name, object_key).delete()

        tables_created.append((target_schema, "{}".format(stg_tble)))

    return tables_created
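# Sketch: load every CSV staged under a folder, then inspect what was
# created. Folder names are illustrative only.
if __name__ == "__main__":
    tables = csv_to_postGIS(
        folder_to_load='source-data/dc-open-data/',        # assumed staging prefix
        AWS_Credentials=get_connection_strings("AWS_DEV"),
        target_schema='stg',
        move_to_folder='source-data/loaded-to-postgis/',   # assumed archive prefix
        clean_columns='yes',
        env='DEV')
    for schema, table in tables:
        print('loaded {}."{}"'.format(schema, table))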
def generate_moving_violations_table(AWS_Credentials: dict, **kwargs):
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # First move all source data records to a temp table
    step_1_query = """
    CREATE TABLE IF NOT EXISTS tmp.moving_violations_need_geo as
        SELECT * FROM source_data.moving_violations WHERE geography IS NULL;
    CREATE TABLE IF NOT EXISTS tmp.moving_violations_has_geo as
        SELECT * FROM source_data.moving_violations WHERE geography IS NOT NULL;
    CREATE INDEX IF NOT EXISTS mv_location_index ON tmp.moving_violations_need_geo (location);
    """
    engine.execute(step_1_query)
    print("temp table created")

    # geocode the locations, up to 2,000 distinct values per run
    records = [
        loc for (loc, ) in engine.execute(
            "select distinct location from tmp.moving_violations_need_geo where geography is null limit 2000"
        ).fetchall()
    ]
    print(len(records), " records passed to geocode function")
    geocode_text(engine=engine,
                 records_to_geocode=records,
                 administrative_area='District of Columbia',
                 text_type='Moving Violations location')

    # update lat and long values from new data
    step_2_query = """
    UPDATE tmp.moving_violations_need_geo
    SET geography = source_data.geocoded_text.point_geography
    FROM source_data.geocoded_text
    WHERE source_data.geocoded_text.text = location;
    INSERT INTO tmp.moving_violations_has_geo
        SELECT * FROM tmp.moving_violations_need_geo;
    CREATE INDEX IF NOT EXISTS mv_geom_idx ON tmp.moving_violations_has_geo USING GIST (geography);
    """
    engine.execute(step_2_query)
    print("geo values updated")

    # Then execute the same location-info queries (roadway, schools,
    # neighborhoods) that apply to all analysis tables and create the final table
    next_tables = add_location_info(engine=engine,
                                    target_schema='tmp',
                                    target_table='moving_violations_nbh_ward',
                                    from_schema='tmp',
                                    from_table='moving_violations_has_geo',
                                    partition_by_field='objectid')
    print("neighborhood-ward query complete")
    next_tables = add_school_info(engine=engine,
                                  target_schema='tmp',
                                  target_table='moving_violations_schools',
                                  from_schema=next_tables[0],
                                  from_table=next_tables[1])
    print("schools query complete")
    next_tables = add_roadway_info(
        engine=engine,
        target_schema='tmp',
        target_table='moving_violations_roadway_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='objectid',
        within_distance=50)
    print("roadway info query complete")
    next_tables = add_intersection_info(
        engine=engine,
        target_schema='tmp',
        target_table='moving_violations_intersection_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='objectid',
        within_distance=20)
    print("intersection info query complete")
    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='moving_violations',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)
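# Sketch invocation. Note the geocoding step above only processes up to 2,000
# distinct un-geocoded locations per run, so repeated runs may be needed to
# work through a backlog.
if __name__ == "__main__":
    generate_moving_violations_table(
        AWS_Credentials=get_connection_strings("AWS_DEV"), env='DEV')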