def extract_scanner_audio_json(target_schema: str, source_table: str,
                               target_table: str, AWS_Credentials: dict,
                               **kwargs):

    # assign optional arguments
    source_schema = kwargs.get('source_schema', None)
    if source_schema is None:
        source_schema = 'stg'
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # extract the json info
    step1_query = """
    DROP TABLE IF EXISTS tmp.citizen;
    CREATE TABLE tmp.citizen
    AS ( 
        WITH results AS 
            (
            SELECT jsonb_array_elements(test.results) AS data, source_file, load_datetime
            FROM
                (
                SELECT data->'results' as results, source_file, load_datetime
                from {0}."{1}"
                ) as test
            ) 
        SELECT 
            to_timestamp(TRUNC((data->'cs')::bigint)/1000) AS cs
            ,(data->'ll'->0)::numeric AS lat
            ,(data->'ll'->1)::numeric AS long
            ,to_timestamp(TRUNC((data->'ts')::bigint)/1000) AS ts
            ,(data->'key')::varchar as incident_key
            ,(data->'raw')::varchar as incident_desc_raw
            ,(data->'source')::varchar as incident_source
            ,(data->'categories') as incident_categories
            ,source_file
            ,load_datetime
            ,data 
        FROM results
        );
    """.format(source_schema, source_table)

    step_2_query = """
    DROP TABLE IF EXISTS tmp.citizen_geometry;
    CREATE  TABLE tmp.citizen_geometry
    AS (
        SELECT *, ST_SetSRID(ST_MakePoint(long, lat),4326)::geography as geography
        FROM tmp.citizen
        ) ; 
    """

    final_query = """
    CREATE TABLE IF NOT EXISTS {0}.{1} (LIKE tmp.citizen_geometry);

    INSERT INTO {0}.{1} 
        SELECT * FROM tmp.citizen_geometry;

    GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC;
    """.format(target_schema, target_table)

    engine.execute(step1_query)
    engine.execute(step_2_query)
    engine.execute(final_query)

    count_query = 'SELECT COUNT(*) FROM {}.{} WHERE source_file like \'%%{}%%\''.format(
        target_schema, target_table, source_table)

    row_count = engine.execute(count_query).fetchone()[0]
    print("{} rows inserted into final table from file {}".format(
        str(row_count), source_table))

    drop_table_query = 'DROP TABLE IF EXISTS {}."{}"'.format(
        source_schema, source_table)
    engine.execute(drop_table_query)
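
# A minimal usage sketch (commented out; it assumes get_connection_strings() from
# connect_to_rds is importable, as in the later examples, and that a staged table
# already exists in the stg schema; the source table name below is a placeholder):
#
#   creds = get_connection_strings("AWS_DEV")
#   extract_scanner_audio_json(target_schema='source_data',
#                              source_table='<staged scanner table>',
#                              target_table='citizen_stream',
#                              AWS_Credentials=creds,
#                              env='DEV')
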
# Example #2
def s3_to_postGIS(folder_to_load: str, AWS_Credentials: dict, format: str,
                  header: str, mode: str, move_after_loading: str,
                  move_to_folder: str):

    # set up S3 and RDS connections
    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    bucket_name = AWS_Credentials['s3_bucket']
    bucket = s3_resource.Bucket(bucket_name)
    region = AWS_Credentials['region']
    dbname = 'postgres'
    env = "DEV"
    engine = create_postgres_engine(destination="AWS_PostGIS",
                                    target_db=dbname,
                                    env=env)
    db_credentials = get_connection_strings("AWS_PostGIS")
    db_uid = db_credentials[env]['UID']
    db_pwd = db_credentials[env]['PWD']
    db_host = db_credentials[env]['HOST']
    db_port = db_credentials[env]['PORT']

    # add the psql install location to the PATH environment variable
    psql_path = subprocess.check_output(['which',
                                         'psql']).strip().decode('utf-8')
    os.environ['PATH'] += os.pathsep + os.path.dirname(psql_path)

    # grab the list of all files in the target folder that have a target table
    # URL-encode the file key so the ones with semicolons don't throw an error
    # update January 2021: the script now throws an error when fed URL-encoded object keys, so the plain-text key is used instead
    files_to_load = [(urllib.parse.quote(obj.key),
                      obj.key, obj.Object().metadata['target_schema'],
                      obj.Object().metadata['target_table'])
                     for obj in bucket.objects.filter(Prefix=folder_to_load)
                     if 'target_table' in obj.Object().metadata.keys()
                     if format in obj.key]
    # generate a distinct list of target tables so each is dropped and recreated/truncated only once
    target_tables = [(target_schema, target_table)
                     for (file_name, file_name_native, target_schema,
                          target_table) in files_to_load]
    target_tables_distinct = set(target_tables)
    target_tables = list(target_tables_distinct)

    # drop and recreate and/or truncate each target table
    for (target_schema, target_table) in target_tables:
        generate_table(engine=engine,
                       target_schema=target_schema,
                       target_table=target_table,
                       mode=mode)

    # set table import parameters that are the same for every file
    copy_parameters = '\'(FORMAT {}, HEADER {})\''.format(format, header)
    columns_to_copy = '\'\''
    aws_credentials_param = '\'{}\', \'{}\',\'\''.format(
        AWS_Credentials['aws_access_key_id'],
        AWS_Credentials['aws_secret_access_key'])
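
    # For illustration only: if format='csv' and header='true' were passed in
    # (assumed values, not taken from the source), the strings built above expand to
    #   copy_parameters       -> '(FORMAT csv, HEADER true)'
    #   columns_to_copy       -> ''          (an empty column list copies all columns)
    #   aws_credentials_param -> '<access key>', '<secret key>',''
    # and are spliced into the aws_s3.table_import_from_s3(...) statement below.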

    # create file-specific table import parameters
    for (file_name, file_name_native, target_schema,
         target_table) in files_to_load:
        destination_table = '\'{}.{}\''.format(target_schema, target_table)
        create_s3_uri_param = '\'{}\', \'{}\',\'{}\''.format(
            AWS_Credentials['s3_bucket'], file_name_native, region)
        base_file_name = os.path.basename(file_name_native)

        # create import statement
        import_table_query = 'SELECT aws_s3.table_import_from_s3({}, {},{}, aws_commons.create_s3_uri({}) ,aws_commons.create_aws_credentials({}));'.format(
            destination_table, columns_to_copy, copy_parameters,
            create_s3_uri_param, aws_credentials_param)
        # create arg to pass to os.system
        os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={}  --no-password --command=\"{}\"'.format(
            db_pwd, db_host, db_port, db_uid, dbname, import_table_query)
        # execute
        if move_after_loading != 'yes':
            os.system(os_system_arg)
        elif move_after_loading == 'yes' and move_to_folder != '':
            os.system(os_system_arg)
            try:
                s3_resource.Object(bucket_name,
                                   move_to_folder + base_file_name).copy_from(
                                       CopySource={
                                           'Bucket': bucket_name,
                                           'Key': file_name_native
                                       })
                s3_resource.Object(bucket_name, file_name_native).delete()
            except Exception:
                print(file_name_native, " could not be copied and/or deleted")
                continue
        else:
            print("please provide move-to folder")
            continue

    # after data is loaded, update the geographies
    for (target_schema, target_table) in target_tables:
        correct_geo(engine=engine,
                    target_schema=target_schema,
                    target_table=target_table,
                    mode=mode)
import argparse
import sqlalchemy
from connect_to_rds import get_connection_strings, create_postgres_engine
from add_location_info import add_location_info, add_school_info, create_final_table

dbname = 'postgres'
env = "DEV"
engine = create_postgres_engine(destination="AWS_PostGIS",
                                target_db=dbname,
                                env=env)
db_credentials = get_connection_strings("AWS_PostGIS")

geography_levels = {
    'comp_plan_area': {
        'geo_boundaries_source_table': 'source_data.comp_plan_areas',
        'orig_field_name': 'name'
    },
    'census_tract': {
        'geo_boundaries_source_table': 'source_data.census_tracts',
        'orig_field_name': 'tract'
    },
    'nbh_cluster_names': {
        'geo_boundaries_source_table': 'source_data.neighborhood_clusters',
        'orig_field_name': 'nbh_names'
    },
    'ward_name': {
        'geo_boundaries_source_table': 'source_data.ward_boundaries',
        'orig_field_name': 'name'
    },
    'anc_id': {
        'geo_boundaries_source_table': 'source_data.anc_boundaries',
        'orig_field_name': 'anc_id'
    }
}
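
# Each key names a column to derive; its value gives the boundary table to spatially
# join against and the field that supplies the label. A hedged, illustrative loop
# (not the project's actual consumer, which appears to be the add_location_info helpers):
#
#   for new_column, cfg in geography_levels.items():
#       print(new_column, '<-', cfg['geo_boundaries_source_table'],
#             '(' + cfg['orig_field_name'] + ')')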


CLI = argparse.ArgumentParser()
CLI.add_argument("--env", type=str)
CLI.add_argument("--source_schema", type=str)

# parse the command line
args = CLI.parse_args()
env = args.env
source_schema = args.source_schema

if __name__ == "__main__":
    if env is None:
        env = 'DEV'
    env = env.upper()
    # tables_to_extract = json_to_postGIS(folder_to_load='source-data/citizen/unparsed/', move_to_folder = 'source-data/citizen/loaded_to_postgis/', AWS_Credentials=get_connection_strings("AWS_DEV"))
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)
    tables_to_extract = [
        r for (r, ) in engine.execute(
            "select distinct table_name from information_schema.tables where table_schema = 'stg' and table_name like '%%transcribed_audio%%'"
        )
    ]
    for table in tables_to_extract:
        extract_citizen_json(source_table=table,
                             target_table='citizen_stream',
                             target_schema='source_data',
                             AWS_Credentials=get_connection_strings("AWS_DEV"),
                             env=env)
def generate_crashes_table(AWS_Credentials: dict, **kwargs):

    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # The queries that are specific to the crash data and are not run anywhere else
    add_columns_query = """
    DROP TABLE IF EXISTS tmp.crash_details;
    CREATE TABLE tmp.crash_details 
    AS (
        SELECT *
            ,CASE WHEN PERSONTYPE = 'Driver' AND AGE >=65 THEN 1 ELSE 0 END AS DRIVERS_OVER_65
            ,CASE WHEN PERSONTYPE = 'Driver' AND AGE <=25 THEN 1 ELSE 0 END AS DRIVERS_UNDER_25
            ,CASE WHEN PERSONTYPE = 'Pedestrian' AND AGE >=65 THEN 1 ELSE 0 END AS PEDS_OVER_65
            ,CASE WHEN PERSONTYPE = 'Pedestrian' AND AGE <=12 THEN 1 ELSE 0 END AS PEDS_UNDER_12
            ,CASE WHEN PERSONTYPE = 'Bicyclist' AND AGE >=65 THEN 1 ELSE 0 END AS BIKERS_OVER_65
            ,CASE WHEN PERSONTYPE = 'Bicyclist' AND AGE <=18 THEN 1 ELSE 0 END AS BIKERS_UNDER_18
            ,CASE WHEN PERSONTYPE = 'Driver' AND LICENSEPLATESTATE <> 'DC' AND LICENSEPLATESTATE <> ' None' THEN 1 ELSE 0 END AS OOS_VEHICLES
            ,CASE WHEN PERSONTYPE = 'Driver' AND INVEHICLETYPE = 'Passenger Car/automobile' THEN 1 ELSE 0 END AS NUM_CARS
            ,CASE WHEN PERSONTYPE = 'Driver' AND INVEHICLETYPE in ('Suv (sport Utility Vehicle)', 'Pickup Truck') THEN 1 ELSE 0 END AS NUM_SUVS_OR_TRUCKS

            ,CASE WHEN PERSONTYPE = 'Pedestrian' AND FATAL='Y' THEN 1 ELSE 0 END AS PED_FATALITIES
            ,CASE WHEN PERSONTYPE = 'Bicyclist' AND FATAL='Y' THEN 1 ELSE 0 END AS BICYCLE_FATALITIES
            ,CASE WHEN PERSONTYPE in ('Driver','Passenger') AND FATAL='Y' THEN 1 ELSE 0 END AS VEHICLE_FATALITIES

            ,CASE WHEN PERSONTYPE = 'Pedestrian' AND (MAJORINJURY='Y' OR MINORINJURY='Y') THEN 1 ELSE 0 END AS PED_INJURIES
            ,CASE WHEN PERSONTYPE = 'Bicyclist' AND (MAJORINJURY='Y' OR MINORINJURY='Y') THEN 1 ELSE 0 END AS BICYCLE_INJURIES
            ,CASE WHEN PERSONTYPE in ('Driver','Passenger') AND (MAJORINJURY='Y' OR MINORINJURY='Y') THEN 1 ELSE 0 END AS VEHICLE_INJURIES
            ,CASE WHEN PERSONTYPE = 'Driver' AND TICKETISSUED ='Y' THEN 1 ELSE 0 END AS DRIVER_TICKETS
            ,CASE WHEN PERSONTYPE = 'Driver' AND SPEEDING ='Y' THEN 1 ELSE 0 END AS DRIVERS_SPEEDING
            ,CASE WHEN PERSONTYPE = 'Driver' AND IMPAIRED ='Y' THEN 1 ELSE 0 END AS DRIVERS_IMPAIRED

            ,CASE WHEN PERSONTYPE = 'Bicyclist' AND TICKETISSUED ='Y' THEN 1 ELSE 0 END AS BICYCLE_TICKETS
            ,CASE WHEN PERSONTYPE = 'Pedestrian' AND TICKETISSUED ='Y'  THEN 1 ELSE 0 END AS PED_TICKETS
            ,CASE WHEN (MAJORINJURY='Y' OR MINORINJURY ='Y') THEN 1 ELSE 0 END AS TOTAL_INJURIES
            ,CASE WHEN MAJORINJURY='Y' THEN 1 ELSE 0 END AS TOTAL_MAJOR_INJURIES
            ,CASE WHEN MINORINJURY ='Y' THEN 1 ELSE 0 END AS TOTAL_MINOR_INJURIES

            ,CASE WHEN PERSONTYPE = 'Driver' THEN 1 ELSE 0 END AS TOTAL_VEHICLES
            ,CASE WHEN PERSONTYPE = 'Pedestrian' THEN 1 ELSE 0 END AS TOTAL_PEDESTRIANS
            ,CASE WHEN PERSONTYPE = 'Bicyclist' THEN 1 ELSE 0 END AS TOTAL_BICYCLISTS
        FROM source_data.crash_details
    )
    """
    group_by_query = """
    DROP TABLE IF EXISTS tmp.crash_details_agg;
    CREATE  TABLE tmp.crash_details_agg 
    AS (
        SELECT 
            CRIMEID
            ,SUM(DRIVERS_OVER_65) AS DRIVERS_OVER_65
            ,SUM(DRIVERS_UNDER_25) AS DRIVERS_UNDER_25
            ,SUM(PEDS_OVER_65) AS PEDS_OVER_65
            ,SUM(PEDS_UNDER_12) AS PEDS_UNDER_12
            ,SUM(BIKERS_OVER_65) AS BIKERS_OVER_65
            ,SUM(BIKERS_UNDER_18) AS BIKERS_UNDER_18
            ,SUM(OOS_VEHICLES) AS OOS_VEHICLES
            ,SUM(NUM_CARS) AS NUM_CARS
            ,SUM(NUM_SUVS_OR_TRUCKS) AS NUM_SUVS_OR_TRUCKS
            ,SUM(PED_INJURIES) AS PEDESTRIAN_INJURIES
            ,SUM(BICYCLE_INJURIES) AS BICYCLE_INJURIES
            ,SUM(VEHICLE_INJURIES) AS VEHICLE_INJURIES
            ,SUM(PED_FATALITIES) AS PEDESTRIAN_FATALITIES
            ,SUM(BICYCLE_FATALITIES) AS BICYCLE_FATALITIES
            ,SUM(VEHICLE_FATALITIES) AS VEHICLE_FATALITIES
            ,SUM(DRIVER_TICKETS) AS DRIVER_TICKETS
            ,SUM(DRIVERS_SPEEDING) AS DRIVERS_SPEEDING
            ,SUM(DRIVERS_IMPAIRED) AS DRIVERS_IMPAIRED
            ,SUM(BICYCLE_TICKETS) AS BICYCLE_TICKETS
            ,SUM(PED_TICKETS) AS PED_TICKETS
            ,SUM(TOTAL_INJURIES) AS TOTAL_INJURIES
            ,SUM(TOTAL_MAJOR_INJURIES) AS TOTAL_MAJOR_INJURIES
            ,SUM(TOTAL_MINOR_INJURIES) AS TOTAL_MINOR_INJURIES
            ,SUM(TOTAL_VEHICLES) AS TOTAL_VEHICLES
            ,SUM(TOTAL_PEDESTRIANS) AS TOTAL_PEDESTRIANS
            ,SUM(TOTAL_BICYCLISTS) AS TOTAL_BICYCLISTS
            ,ARRAY_AGG(PERSONTYPE) AS PERSONTYPE_ARRAY
            ,ARRAY_AGG(INVEHICLETYPE) AS INVEHICLETYPE_ARRAY
            ,ARRAY_AGG(LICENSEPLATESTATE) AS LICENSEPLATESTATE_ARRAY
        FROM tmp.crash_details
        GROUP BY CRIMEID
    ) ;
    create index crime_id on tmp.crash_details_agg (crimeid);
    """

    join_query = """
    DROP TABLE IF EXISTS tmp.crashes_join;
    CREATE TABLE tmp.crashes_join
    AS (
        SELECT 
            a.OBJECTID
                ,a.CRIMEID
                ,a.REPORTDATE
                ,a.FROMDATE
                ,a.TODATE 
                ,a.ADDRESS
                ,a.mpdlatitude
                ,a.mpdlongitude
                ,CASE WHEN b.CRIMEID IS NULL OR b.BICYCLE_INJURIES < (a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
                    THEN (a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
                    ELSE b.BICYCLE_INJURIES END AS BICYCLE_INJURIES
                ,CASE WHEN b.CRIMEID IS NULL OR b.VEHICLE_INJURIES < (a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER)
                    THEN (a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER)
                    ELSE b.VEHICLE_INJURIES END AS VEHICLE_INJURIES
                ,CASE WHEN b.CRIMEID IS NULL OR b.PEDESTRIAN_INJURIES < (a.MAJORINJURIES_PEDESTRIAN+ a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN)
                    THEN (a.MAJORINJURIES_PEDESTRIAN + a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN)
                    ELSE b.PEDESTRIAN_INJURIES END AS PEDESTRIAN_INJURIES
                ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_INJURIES < (a.MAJORINJURIES_PEDESTRIAN+ a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                                                                    +a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER
                                                                    +a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
                        THEN (a.MAJORINJURIES_PEDESTRIAN+ a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                                                                    +a.MAJORINJURIES_DRIVER+a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER
                                                                    +a.MAJORINJURIES_BICYCLIST + a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
                        ELSE b.TOTAL_INJURIES end as TOTAL_INJURIES 

                ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_MAJOR_INJURIES < (a.MAJORINJURIES_PEDESTRIAN
                                                                    +a.MAJORINJURIES_DRIVER+a.MAJORINJURIESPASSENGER
                                                                    +a.MAJORINJURIES_BICYCLIST)
                        THEN (a.MAJORINJURIES_PEDESTRIAN+a.MAJORINJURIES_DRIVER+a.MAJORINJURIESPASSENGER+a.MAJORINJURIES_BICYCLIST)
                        ELSE b.TOTAL_MAJOR_INJURIES end as TOTAL_MAJOR_INJURIES 

                ,CASE WHEN b.CRIMEID IS NULL OR b.TOTAL_MINOR_INJURIES < (a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                                                                    +a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER
                                                                    +a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
                    THEN (a.MINORINJURIES_PEDESTRIAN + a.UNKNOWNINJURIES_PEDESTRIAN
                                                                    +a.MINORINJURIES_DRIVER+a.UNKNOWNINJURIES_DRIVER+a.MINORINJURIESPASSENGER+a.UNKNOWNINJURIESPASSENGER
                                                                    +a.MINORINJURIES_BICYCLIST + a.UNKNOWNINJURIES_BICYCLIST)
                    ELSE b.TOTAL_MINOR_INJURIES end as TOTAL_MINOR_INJURIES     

                ,CASE WHEN b.CRIMEID IS NULL OR b.BICYCLE_FATALITIES < a.FATAL_BICYCLIST
                    THEN a.FATAL_BICYCLIST 
                    ELSE b.BICYCLE_FATALITIES END AS BICYCLE_FATALITIES
                ,CASE WHEN b.CRIMEID IS NULL OR b.PEDESTRIAN_FATALITIES < a.FATAL_PEDESTRIAN
                    THEN a.FATAL_PEDESTRIAN 
                    ELSE b.PEDESTRIAN_FATALITIES END AS PEDESTRIAN_FATALITIES
                ,CASE WHEN b.CRIMEID IS NULL OR b.VEHICLE_FATALITIES < (a.FATAL_DRIVER+a.FATALPASSENGER)
                    THEN (a.FATAL_DRIVER+a.FATALPASSENGER) 
                    ELSE b.VEHICLE_FATALITIES END AS VEHICLE_FATALITIES
                ,CASE WHEN b.CRIMEID IS NULL or b.DRIVERS_IMPAIRED < a.DRIVERSIMPAIRED THEN a.DRIVERSIMPAIRED ELSE b.DRIVERS_IMPAIRED END AS DRIVERS_IMPAIRED 
                ,CASE WHEN b.CRIMEID IS NULL or b.DRIVERS_SPEEDING < a.SPEEDING_INVOLVED THEN a.SPEEDING_INVOLVED ELSE b.DRIVERS_SPEEDING END AS DRIVERS_SPEEDING 

                ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_VEHICLES < a.TOTAL_VEHICLES THEN a.TOTAL_VEHICLES ELSE b.TOTAL_VEHICLES END AS TOTAL_VEHICLES 
                ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_BICYCLISTS < a.TOTAL_BICYCLES THEN a.TOTAL_BICYCLES ELSE b.TOTAL_BICYCLISTS END AS TOTAL_BICYCLISTS 
                ,CASE WHEN b.CRIMEID IS NULL or b.TOTAL_PEDESTRIANS < a.TOTAL_PEDESTRIANS THEN a.TOTAL_PEDESTRIANS ELSE b.TOTAL_PEDESTRIANS END AS TOTAL_PEDESTRIANS 
                ,b.DRIVERS_OVER_65
                ,b.DRIVERS_UNDER_25
                ,b.PEDS_OVER_65
                ,b.PEDS_UNDER_12
                ,b.BIKERS_OVER_65
                ,b.BIKERS_UNDER_18
                ,b.OOS_VEHICLES
                ,b.NUM_CARS
                ,b.NUM_SUVS_OR_TRUCKS
                ,b.DRIVER_TICKETS
                ,b.BICYCLE_TICKETS
                ,b.PED_TICKETS
                ,b.PERSONTYPE_ARRAY
                ,b.INVEHICLETYPE_ARRAY
                ,b.LICENSEPLATESTATE_ARRAY
                ,a.INTAPPROACHDIRECTION
                ,a.LOCATIONERROR 
                ,a.LASTUPDATEDATE
                ,a.BLOCKKEY
                ,a.SUBBLOCKKEY
                ,ST_Force2D(a.geography::geometry) as geography

        FROM source_data.crashes_raw a
        LEFT JOIN tmp.crash_details_agg b on a.CRIMEID = b.CRIMEID
        WHERE date_part('year', a.fromdate) >=2015
    ) ;
    CREATE INDEX crashes_geom_idx ON tmp.crashes_join USING GIST (geography);
    """

    # join in the pulsepoint info
    pulsepoint_join_query = """

    DROP TABLE IF EXISTS tmp.crash_pulsepoint_join;
    CREATE TABLE tmp.crash_pulsepoint_join 
    AS (SELECT * 
    FROM (
        SELECT DISTINCT a.* 
        ,b.Agency_Incident_ID as pp_agency_incident_id
        ,b.unit_status_transport as pp_total_injuries
        ,b.transport_unit_is_amr as pp_total_minor_injuries
        ,b.transport_unit_is_non_amr as pp_total_major_injuries
            ,Row_Number() over (partition by a.objectid order by ST_Distance(a.geography, b.geography)) as PP_Call_Distance_Rank
            ,Row_Number() over (partition by a.objectid order by (a.reportdate at time zone 'America/New_York')  - (b.CALL_RECEIVED_DATETIME at time zone 'America/New_York')) as PP_Call_Time_Rank
        FROM tmp.crashes_join a
        LEFT JOIN analysis_data.pulsepoint b on ST_DWITHIN(a.geography, b.geography, 200) 
            AND cast(fromdate as date) =cast((call_received_datetime at time zone 'America/New_York') as date)
            AND (b.CALL_RECEIVED_DATETIME at time zone 'America/New_York')  < (a.reportdate at time zone 'America/New_York') 
    ) tmp WHERE PP_Call_Distance_Rank = 1
    ) ;

    CREATE INDEX IF NOT EXISTS crash_pulsepoint_join_geom_idx ON tmp.crash_pulsepoint_join USING GIST (geography);

    alter table tmp.crash_pulsepoint_join drop column PP_Call_Distance_Rank;
    """

    # First execute the table-specific queries
    engine.execute(add_columns_query)
    print("add columns query complete")
    engine.execute(group_by_query)
    print("group by query complete")
    engine.execute(join_query)
    print("join query complete")
    engine.execute(pulsepoint_join_query)
    print("pulsepoint join query complete")

    # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table
    next_tables = add_location_info(engine=engine,
                                    target_schema='tmp',
                                    target_table='crashes_nbh_ward',
                                    from_schema='tmp',
                                    from_table='crash_pulsepoint_join',
                                    partition_by_field='objectid')
    print("neighborhood-ward query complete")
    next_tables = add_school_info(engine=engine,
                                  target_schema='tmp',
                                  target_table='crashes_schools',
                                  from_schema=next_tables[0],
                                  from_table=next_tables[1])
    print("schools query complete")
    next_tables = add_walkscore_info(engine=engine,
                                     target_schema='tmp',
                                     target_table='crashes_walkscore',
                                     from_schema=next_tables[0],
                                     from_table=next_tables[1])
    print("walkscore query complete")
    next_tables = add_roadway_info(engine=engine,
                                   target_schema='tmp',
                                   target_table='crashes_roadway_info',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1],
                                   partition_by_field='objectid',
                                   within_distance=0.001)
    print("roadway info query complete")
    next_tables = add_intersection_info(
        engine=engine,
        target_schema='tmp',
        target_table='crashes_intersection_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='objectid',
        within_distance=10)
    print("intersection info query complete")
    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='dc_crashes_w_details',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)
# Example #6
def generate_pulsepoint_analysis_table(AWS_Credentials: dict, **kwargs):

    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # flag that some records might be duplicate calls for the same incident
    dupe_check_query = """
    DROP TABLE IF EXISTS tmp.pulsepoint_dupe_check;
    CREATE TABLE tmp.pulsepoint_dupe_check 
    AS (
        SELECT DISTINCT a.* , 
            case 
                when b.incident_id is null then 1 
                when a.num_units_responding = 0 and b.num_units_responding >0 then 0 
                when b.unit_status_transport > a.unit_status_transport then 0
                when b.num_units_responding > a.num_units_responding then 0 
                when b.call_received_datetime < a.call_received_datetime then 0
                else 1 end as KEEP_RECORD_FLAG
        FROM source_data.pulsepoint a
        LEFT JOIN source_data.pulsepoint b on a.incident_id <> b.incident_id 
        and date_part('day', a.call_received_datetime - b.call_received_datetime) = 0
        and date_part('hour', a.call_received_datetime - b.call_received_datetime) = 0
        and date_part('month', a.call_received_datetime - b.call_received_datetime) = 0
        and abs(date_part('minute', a.call_received_datetime - b.call_received_datetime)) <=20
        and ST_DWithin(a.geography, b.geography, 100)
        and a.Agency_ID = b.Agency_ID
        and (a.num_units_responding = 0 or a.unit_ids && b.unit_ids)

    ) ;

    CREATE INDEX IF NOT EXISTS pulsepoint_dupe_check_geom_idx ON tmp.pulsepoint_dupe_check USING GIST (geography);
    """

    # then join to the crashes table
    crashes_join_query = """


    DROP TABLE IF EXISTS tmp.pulsepoint_crash_join;
    CREATE TABLE tmp.pulsepoint_crash_join 
    AS (SELECT * 
    FROM (
        SELECT DISTINCT a.* 
        ,concat(a.agency_id, a.incident_id) as Agency_Incident_ID
            ,b.objectid as Crash_Objectid 
            ,b.geography as Crash_Geo
            ,b.total_injuries as Crash_Total_Injuries
            ,b.total_major_injuries as Crash_Total_Major_Injuries 
            ,b.total_minor_injuries as Crash_Total_Minor_Injuries 
            ,(b.bicycle_fatalities + b.pedestrian_fatalities + b.vehicle_fatalities) as Crash_Total_Fatalities
            ,b.bicycle_injuries as Crash_Bike_Injuries
            ,b.vehicle_injuries as Crash_Car_Injuries
            ,b.pedestrian_injuries as Crash_Ped_Injuries
            ,case when b.total_injuries is null or b.total_injuries < a.unit_status_transport then 1 else 0 end as injuries_mismatch
            ,ST_Distance(a.geography, b.geography) as Distance_To_Crash
            ,(b.reportdate at time zone 'America/New_York')  - (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York')  as Time_Between_Crash_And_Report
            ,b.intersectionid as Crash_Intersection_ID
            ,b.block_objectid as Crash_Block_Objectid
            ,Row_Number() over (partition by a.incident_id, a.agency_id order by ST_Distance(a.geography, b.geography)) as Crash_Distance_Rank
            ,Row_Number() over (partition by a.incident_id, a.agency_id order by (b.reportdate at time zone 'America/New_York')  - (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York')) as Crash_Time_Rank
        FROM tmp.pulsepoint_dupe_check a
        LEFT JOIN analysis_data.dc_crashes_w_details b on ST_DWITHIN(a.geography, b.geography, 200) 
            AND cast(b.fromdate as date) =cast((call_received_datetime at time zone 'America/New_York') as date)
            AND (a.CALL_RECEIVED_DATETIME at time zone 'America/New_York')  < (b.reportdate at time zone 'America/New_York') 
        WHERE a.KEEP_RECORD_FLAG = 1
    ) tmp WHERE Crash_Distance_Rank = 1 and (incident_type in ('TC', 'TCE', 'TCS') or (agency_id = '16000' and incident_type in ('TC', 'TCS', 'TCE', 'RES')))
    ) ;

    CREATE INDEX IF NOT EXISTS pulsepoint_crash_join_geom_idx ON tmp.pulsepoint_crash_join USING GIST (geography);

    alter table tmp.pulsepoint_crash_join drop column KEEP_RECORD_FLAG;
    alter table tmp.pulsepoint_crash_join drop column Crash_Distance_Rank;
    """

    # First execute the table-specific queries
    engine.execute(dupe_check_query)
    print("dupe check query complete")

    engine.execute(crashes_join_query)
    print("join to crashes query complete")

    # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table
    next_tables = add_location_info(engine=engine,
                                    target_schema='tmp',
                                    target_table='pulsepoint_nbh_ward',
                                    from_schema='tmp',
                                    from_table='pulsepoint_crash_join',
                                    partition_by_field='Agency_Incident_ID')
    print("neighborhood-ward query complete")
    next_tables = add_school_info(engine=engine,
                                  target_schema='tmp',
                                  target_table='pulsepoint_schools',
                                  from_schema=next_tables[0],
                                  from_table=next_tables[1])
    print("schools query complete")
    next_tables = add_walkscore_info(engine=engine,
                                     target_schema='tmp',
                                     target_table='pulsepoint_walkscore',
                                     from_schema=next_tables[0],
                                     from_table=next_tables[1])
    print("walkscore query complete")
    next_tables = add_roadway_info(engine=engine,
                                   target_schema='tmp',
                                   target_table='pulsepoint_roadway_info',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1],
                                   partition_by_field='Agency_Incident_ID',
                                   within_distance=100)
    print("roadway info query complete")
    next_tables = add_intersection_info(
        engine=engine,
        target_schema='tmp',
        target_table='pulsepoint_intersection_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='Agency_Incident_ID',
        within_distance=60)
    print("intersection info query complete")
    next_tables = is_national_park(engine=engine,
                                   target_schema='tmp',
                                   target_table='pulsepoint_national_park',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("national parks info query complete")
    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='pulsepoint',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)
# Example #7
def refresh_test_db(env: str):
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env.upper())
    db_credentials = get_connection_strings("AWS_PostGIS")
    db_users = db_credentials[env.upper()]['USERS']
    prod_db_host = db_credentials['PROD']['HOST']
    prod_db_port = db_credentials['PROD']['PORT']
    prod_db_name = db_credentials['PROD']['DB']
    prod_engine = create_postgres_engine(destination="AWS_PostGIS", env="PROD")

    create_fdw_query = """
        BEGIN;
        CREATE EXTENSION IF NOT EXISTS postgres_fdw;
        DROP SERVER IF EXISTS prod CASCADE;
        CREATE SERVER prod FOREIGN DATA WRAPPER postgres_fdw OPTIONS (host '{prod_db_host}', dbname '{prod_db_name}');
        COMMIT;
    """.format(prod_db_name=prod_db_name, prod_db_host=prod_db_host)

    engine.execute(create_fdw_query)

    #  create user mappings
    for user_pwd in db_users:
        user = list(user_pwd.keys())[0]

        pwd = user_pwd[user]
        map_user_query = """
        CREATE USER MAPPING FOR {user}
            SERVER prod
            OPTIONS (user '{user}', password '{pwd}');
        """.format(user=user, pwd=pwd)

        engine.execute(map_user_query)

    # pull the schemas off the viz copy of the prod database
    schemas = [(r, 'prod_' + r) for (r, ) in prod_engine.execute(
        "select distinct table_schema from information_schema.tables where is_insertable_into = 'YES' and table_schema not like 'pg_%%'"
    ).fetchall()]

    # map schemas
    for source_schema, destination_schema in schemas:
        create_schema_query = """
        CREATE SCHEMA IF NOT EXISTS {destination_schema};
        GRANT ALL PRIVILEGES ON SCHEMA {destination_schema} TO PUBLIC;
        IMPORT FOREIGN SCHEMA {source_schema}
            FROM SERVER prod
            INTO {destination_schema};
        """.format(source_schema=source_schema,
                   destination_schema=destination_schema)

        engine.execute(create_schema_query)

    # pull all the tables from prod db
    schemas_tables = [(schema, table) for (
        schema, table
    ) in prod_engine.execute(
        "select distinct table_schema,table_name from information_schema.tables where is_insertable_into = 'YES' and table_schema not like 'pg_%%' and table_name not like '[%%]'"
    ).fetchall()]

    #  create and populate tables
    for schema, table in schemas_tables:
        create_populate_tables_query = """
        CREATE TABLE IF NOT EXISTS {schema}."{table}" (LIKE prod_{schema}."{table}");

        DELETE FROM {schema}."{table}";

        INSERT INTO {schema}."{table}"
            SELECT * FROM prod_{schema}."{table}";

        GRANT ALL PRIVILEGES ON {schema}."{table}" TO PUBLIC;
        """.format(schema=schema, table=table)

        print(create_populate_tables_query)
        engine.execute(create_populate_tables_query)
# Example #8
def create_test_db(env: str, test_db_name: str):
    # connect to the prod db
    engine = create_postgres_engine(destination="AWS_PostGIS", env="PROD")
    db_credentials = get_connection_strings("AWS_PostGIS")
    # get prod master credentials
    prod_db_host = db_credentials['PROD']['HOST']
    prod_db_port = db_credentials['PROD']['PORT']
    prod_db_name = db_credentials['PROD']['DB']
    prod_db_uid = db_credentials['PROD']['UID']
    prod_db_pwd = db_credentials['PROD']['PWD']
    # get testdb credentials
    test_db_host = db_credentials[env.upper()]['HOST']
    test_db_port = db_credentials[env.upper()]['PORT']
    test_db_name = db_credentials[env.upper()]['DB']
    test_db_uid = db_credentials[env.upper()]['UID']
    test_db_pwd = db_credentials[env.upper()]['PWD']
    test_db_users = db_credentials[env.upper()]['USERS']

    kill_db_query = """
    SELECT	pg_terminate_backend (pid)
    FROM	pg_stat_activity
    WHERE	pg_stat_activity.datname = '{0}';
    """.format(test_db_name)

    # kill
    engine.execute(kill_db_query)

    # drop
    command = 'DROP DATABASE IF EXISTS {}'.format(test_db_name)
    os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
        prod_db_pwd, prod_db_host, prod_db_port, prod_db_uid, prod_db_name,
        command)
    os.system(os_system_arg)

    # create
    command = 'CREATE DATABASE {}'.format(test_db_name)
    os_system_arg = 'PGPASSWORD=\'{}\' psql --host={} --port={} --username={} --dbname={} --no-password --command=\"{}\"'.format(
        prod_db_pwd, prod_db_host, prod_db_port, prod_db_uid, prod_db_name,
        command)
    os.system(os_system_arg)

    # create users on new db
    for user_pwd in test_db_users:
        user = list(user_pwd.keys())[0]
        pwd = user_pwd[user]
        command = 'CREATE ROLE {} WITH LOGIN ENCRYPTED PASSWORD \'{}\';'.format(
            user, pwd)
        os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
            test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
            command)
        print(os_system_arg)
        os.system(os_system_arg)

    # install PostGIS extensions
    command = """
    CREATE EXTENSION postgis;
    CREATE EXTENSION fuzzystrmatch;
    CREATE EXTENSION postgis_tiger_geocoder;
    CREATE EXTENSION postgis_topology;
    """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)
    # alter schemas
    command = """
    ALTER SCHEMA tiger OWNER TO rds_superuser;
    ALTER SCHEMA tiger_data OWNER TO rds_superuser;
    ALTER SCHEMA topology OWNER TO rds_superuser;
    """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)
    # create function
    command = r'CREATE FUNCTION exec(text) returns text language plpgsql volatile AS \$f\$ BEGIN EXECUTE \$1; RETURN \$1; END; \$f\$;'
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)
    # execute function
    command = """
    SELECT exec('ALTER TABLE ' || quote_ident(s.nspname) || '.' || quote_ident(s.relname) || ' OWNER TO rds_superuser;')
        FROM (
            SELECT nspname, relname
            FROM pg_class c JOIN pg_namespace n ON (c.relnamespace = n.oid) 
            WHERE nspname in ('tiger','topology') AND
            relkind IN ('r','S','v') ORDER BY relkind = 'S')
        s;
    """
    os_system_arg = 'PGPASSWORD=\'{}\' psql -h {} -p {} -U {} --dbname={} --no-password --command=\"{}\"'.format(
        test_db_pwd, test_db_host, test_db_port, test_db_uid, test_db_name,
        command)
    os.system(os_system_arg)

    # get all schemas on prod db
    schemas = [
        r for (r, ) in engine.execute(
            "select distinct table_schema from information_schema.tables where is_insertable_into = 'YES' and table_schema not like 'pg_%%'"
        ).fetchall()
    ]

    # create engine on test db
    test_engine = create_postgres_engine(destination="AWS_PostGIS",
                                         env=env.upper())

    # create schemas
    for schema in schemas:
        create_schema_query = """
        CREATE SCHEMA IF NOT EXISTS {0};
        GRANT ALL PRIVILEGES ON SCHEMA {0} TO PUBLIC;
        """.format(schema)

        test_engine.execute(create_schema_query)
# Example #9
def generate_pulsepoint_table(AWS_Credentials: dict, **kwargs):

    # assign optional arguments
    target_schema = kwargs.get('target_schema', None)
    if target_schema is None:
        target_schema = 'source_data'
    target_table = kwargs.get('target_table', None)
    if target_table is None:
        target_table = 'pulsepoint'
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)


    step1_query ="""
    DROP TABLE IF EXISTS tmp_pulsepoint;
    CREATE TEMP TABLE tmp_pulsepoint ON COMMIT PRESERVE ROWS 
    AS ( 
    SELECT 
        Agency_ID
        ,Incident_ID
        ,Scrape_Datetime
        ,CALL_RECEIVED_DATETIME
        ,CALL_Closed_DATETIME
        ,FullDisplayAddress
        ,longitude
        ,latitude
        ,Incident_Type
        ,Unit
        ,Unit_Status_Transport
        ,Transport_Unit_Is_AMR
        ,Transport_Unit_Is_Non_AMR
        ,Unit_JSON
        ,Num_Units_Responding
        ,geography
    FROM (
        SELECT 
            Agency_ID
            ,Incident_ID
            ,Scrape_Datetime
            ,CALL_RECEIVED_DATETIME
            ,CALL_Closed_DATETIME
            ,FullDisplayAddress
            ,longitude
            ,latitude
            ,Incident_Type
            ,Unit
            ,MAX(Unit_Status_Transport) over (Partition by Agency_ID, Incident_ID) as Unit_Status_Transport
            ,MAX(Transport_Unit_Is_AMR) over (Partition by Agency_ID, Incident_ID) as Transport_Unit_Is_AMR
            ,MAX(Transport_Unit_Is_Non_AMR) over (Partition by Agency_ID, Incident_ID) as Transport_Unit_Is_Non_AMR
            ,ST_SetSRID(ST_MakePoint(longitude, latitude), 4326)::geography as geography
            ,cast(replace(unit,'''','"') as jsonb) as Unit_JSON
            ,JSONB_ARRAY_LENGTH(cast(replace(unit,'''','"') as jsonb)::jsonb) as Num_Units_Responding
            ,ROW_NUMBER() over (Partition by Agency_ID, Incident_ID order by Scrape_Datetime DESC) as Time_Rank
            FROM source_data.pulsepoint_stream
        ) AS tmp
    WHERE Time_Rank = 1
    ) WITH DATA;
    """

    step_2_query = """
    DROP TABLE IF EXISTS tmp_pulsepoint_units;
    CREATE TEMP TABLE tmp_pulsepoint_units ON COMMIT PRESERVE ROWS 
    AS (
        SELECT incident_id, Agency_ID, array_agg((Units#>'{UnitID}')::text) as Unit_IDs
        FROM tmp_pulsepoint
        CROSS JOIN json_array_elements(unit_json::json) as Units
        GROUP BY incident_id, Agency_ID
        ) WITH DATA; 
    """

    step_3_query = """
    DROP TABLE IF EXISTS tmp_pulsepoint_units_join;
    CREATE TEMP TABLE tmp_pulsepoint_units_join ON COMMIT PRESERVE ROWS 
    AS (
        SELECT DISTINCT a.*, b.Unit_IDs
        FROM tmp_pulsepoint a
        LEFT JOIN tmp_pulsepoint_units b on a.incident_id = b.incident_id and a.agency_id = b.agency_id
        ) WITH DATA; 
    """

    final_query="""
    DROP TABLE IF EXISTS {0}.{1};

    CREATE TABLE {0}.{1} AS 
        SELECT * FROM tmp_pulsepoint_units_join;

    GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC;
    """.format(target_schema, target_table)

    engine.execute(step1_query)
    engine.execute(step_2_query)
    engine.execute(step_3_query)
    engine.execute(final_query)
# Example #10
def extract_twitter_json(target_schema: str, source_table: str,
                         target_table: str, AWS_Credentials: dict, **kwargs):

    # assign optional arguments
    source_schema = kwargs.get('source_schema', None)
    if source_schema is None:
        source_schema = 'stg'
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # extract the json info
    step1_query = """
    DROP TABLE IF EXISTS tmp.twitter;
    CREATE TABLE tmp.twitter
    AS ( 
    WITH results AS (
        SELECT jsonb_array_elements(test.tweets) AS data, source_file, load_datetime
        FROM
            (
            SELECT data->'data' as tweets, source_file, load_datetime
            from {}."{}"
            ) as test
        )
    SELECT 
        (data->'created_at')::varchar::timestamptz AS created_at
        ,(data->'id_str')::varchar AS tweet_id
        ,(data->'user'->'id_str')::varchar AS user_id
        ,CASE WHEN (data ? 'retweeted_status') THEN (data->'retweeted_status'->'id_str')::varchar END AS retweeted_status_id
        ,CASE WHEN (data->'in_reply_to_status_id_str')::varchar <> 'null'
            THEN (data->'in_reply_to_status_id_str')::varchar END AS in_reply_to_status_id
        ,REPLACE(REPLACE(CASE WHEN (data ? 'full_text') THEN (data->'full_text')::varchar
            WHEN (data ? 'text') THEN (data->'text')::varchar
            END, '&amp;', '&'),'%%','percent') AS tweet_text
        ,source_file
        ,load_datetime
        ,data
    FROM results
    );
    """.format(source_schema, source_table)

    engine.execute(step1_query)

    # geocode records
    records = [
        r for (r, ) in engine.execute(
            "select distinct tweet_text from tmp.twitter").fetchall()
    ]
    print(len(records), " records passed to geocode function")
    geocode_text(engine=engine,
                 records_to_geocode=records,
                 administrative_area='District of Columbia',
                 text_type='Tweet')

    # join the geocoded text back into the main table
    step_2_query = """
    DROP TABLE IF EXISTS tmp.twitter_geocode;
    CREATE  TABLE tmp.twitter_geocode
    AS (
        SELECT DISTINCT a.*
            ,b.point_type
            ,b.point_geography
            ,b.polygon_geography
        FROM tmp.twitter a
        LEFT JOIN source_data.geocoded_text b on a.tweet_text = b.text
        ) ; 
    """

    final_query = """
    CREATE TABLE IF NOT EXISTS {0}.{1} (LIKE tmp.twitter_geocode);

    INSERT INTO {0}.{1} 
        SELECT * FROM tmp.twitter_geocode;

    GRANT ALL PRIVILEGES ON {0}.{1} TO PUBLIC;
    """.format(target_schema, target_table)

    engine.execute(step_2_query)
    engine.execute(final_query)

    count_query = 'SELECT COUNT(*) FROM {}.{} WHERE source_file like \'%%{}%%\''.format(
        target_schema, target_table, source_table)

    row_count = engine.execute(count_query).fetchone()[0]
    print("{} rows inserted into final table from file {}".format(
        str(row_count), source_table))

    drop_table_query = 'DROP TABLE IF EXISTS {}."{}"'.format(
        source_schema, source_table)
    engine.execute(drop_table_query)
import geopandas as gpd
import pandas as pd
import boto3
import os
from pathlib import Path
from connect_to_rds import get_connection_strings, create_postgres_engine

# set up S3 connection
AWS_Credentials = get_connection_strings("AWS_DEV")
s3 = boto3.resource(
    's3',
    aws_access_key_id=AWS_Credentials['aws_access_key_id'],
    aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
bucket_name = AWS_Credentials['s3_bucket']
region = AWS_Credentials['region']
engine = create_postgres_engine("AWS_PostGIS", "postgres", "DEV")

resources = {
    'crashes_raw': {
        'url':
        'https://opendata.arcgis.com/datasets/70392a096a8e431381f1f692aaa06afd_24.geojson',
        'prefix': 'source-data/dc-open-data/',
        'metadata': {
            'target_schema': 'source_data',
            "dataset_info": "https://opendata.dc.gov/datasets/crashes-in-dc"
        }
    },
    'crash_details': {
        'url':
        'https://opendata.arcgis.com/datasets/70248b73c20f46b0a5ee895fc91d6222_25.geojson',
        'prefix': 'source-data/dc-open-data/',
    },
}

# Example #12
def csv_to_postGIS(folder_to_load: str, AWS_Credentials: dict, **kwargs):

    # assign optional arguments
    target_schema = kwargs.get('target_schema', None)
    if target_schema is None:
        target_schema = 'stg'
    move_to_folder = kwargs.get('move_to_folder', None)
    clean_columns = kwargs.get('clean_columns', None)
    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()
    # list of all loaded tables
    tables_created = []

    # set up RDS and S3 connections, engines, cursors
    s3_resource = boto3.resource(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    bucket_name = AWS_Credentials['s3_bucket']
    bucket = s3_resource.Bucket(bucket_name)
    client = boto3.client(
        's3',
        aws_access_key_id=AWS_Credentials['aws_access_key_id'],
        aws_secret_access_key=AWS_Credentials['aws_secret_access_key'])
    region = AWS_Credentials['region']
    connection = create_psycopg2_connection(destination="AWS_PostGIS", env=env)
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    files_to_load = [
        obj.key for obj in bucket.objects.filter(Prefix=folder_to_load)
        if '.csv' in obj.key
    ]

    for object_key in files_to_load:
        # get base file name to use as table name
        stg_tble = os.path.basename(object_key)

        # get the headers to use as column names
        sql_stmt = """SELECT S.* FROM s3object S LIMIT 1"""

        # pull the header row from target csv
        req = client.select_object_content(
            Bucket=bucket_name,
            Key=object_key,
            ExpressionType='SQL',
            Expression=sql_stmt,
            InputSerialization={
                'CSV': {
                    'FileHeaderInfo': 'NONE',
                    'AllowQuotedRecordDelimiter': True
                }
            },
            OutputSerialization={'CSV': {}},
        )

        # format csv headers into a list
        for event in req['Payload']:
            if 'Records' in event:
                file_str = ''.join(
                    event['Records']['Payload'].decode('utf-8')).lower()
                columns_list = file_str.split(',')

        # and then make a column create string out of them
        create_columns_statement = ''
        for column in columns_list:
            create_columns_statement += ',"' + column.replace(
                '\n', '').lower() + '" VARCHAR NULL'

        # generate the create table statement
        create_table_query = """
        DROP TABLE IF EXISTS {0}."{1}";
        CREATE TABLE IF NOT EXISTS {0}."{1}" (
            LOAD_DATETIME TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
            ,source_file varchar DEFAULT '{2}'
            {3}
            );
            """.format(target_schema, stg_tble, object_key,
                       create_columns_statement)
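
        # For a file whose header row is DATE,LOCATION this expands (illustration only)
        # to roughly:
        #   DROP TABLE IF EXISTS stg."<file>.csv";
        #   CREATE TABLE IF NOT EXISTS stg."<file>.csv" (
        #       LOAD_DATETIME TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP
        #       ,source_file varchar DEFAULT '<object key>'
        #       ,"date" VARCHAR NULL,"location" VARCHAR NULL
        #       );
        # i.e. every source column lands as a nullable VARCHAR in the staging table.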

        # create the table
        with connection.cursor() as cursor:
            cursor.execute(create_table_query)
            connection.commit()

        # and then execute the query to fix the column names, if that parameter was passed
        if clean_columns is not None and clean_columns.lower() == 'yes':
            fix_column_names_query = """
            SELECT replace(replace(SQL_Statement::varchar,\'";\', \';\'), \'u_"\', \'u_\') FROM (
            SELECT FORMAT(
                \'ALTER TABLE %%I.%%I.%%I RENAME %%I to u_%%I;\',
                table_catalog,
                table_schema,
                table_name,
                column_name,
                lower(
                    regexp_replace(
                    replace(replace(replace(replace(replace(replace(replace(column_name, \' \', \'_\'),\'?\',\'\'),\'/\',\'_\'),\'&\', \'and\'),\'(\',\'\'),\')\',\'\'),\'"\',\'\')
                    ,\'([[:lower:]])([[:upper:]])\',
                    \'\\1_\\2\',
                    \'xg\'
                    )
                )
                ) AS SQL_Statement
                FROM information_schema.columns
                WHERE table_name = \'{}\' and lower(column_name)!=\'load_datetime\' and lower(column_name) != \'source_file\'
                ) AS tmp;
            """.format(stg_tble)
            list_of_statements = [
                r
                for (r, ) in engine.execute(fix_column_names_query).fetchall()
            ]
            for statement in list_of_statements:
                engine.execute(statement)

            # get the updated list of corrected column names to pass to the copy columns parameter
            get_updated_columns_query = """
            SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = '{0}' AND TABLE_NAME = '{1}'
            """.format(target_schema, stg_tble)

            # put column names of source table in list
            columns_list = [
                r
                for (r,
                     ) in engine.execute(get_updated_columns_query).fetchall()
                if 'load_datetime' not in r.lower()
                if 'source_file' not in r.lower()
            ]
            file_str = ','.join(columns_list).lower()

        # load the target csv into memory as a dataframe
        f = client.get_object(Bucket=bucket_name, Key=object_key)
        f2 = pd.read_csv(f['Body'])
        # re-serialize it to an in-memory buffer without the header row
        buffer = io.StringIO()
        f2.to_csv(buffer, index=False, header=False)
        buffer.seek(0)

        # copy the file into the table
        with connection.cursor() as cursor:
            cursor.copy_expert(
                ' COPY {0}."{1}" ({2}) FROM STDIN WITH CSV; '.format(
                    target_schema, stg_tble, file_str), buffer)
            connection.commit()

        if move_to_folder is not None:
            s3_resource.Object(bucket_name, move_to_folder +
                               stg_tble).copy_from(CopySource={
                                   'Bucket': bucket_name,
                                   'Key': object_key
                               })
            s3_resource.Object(bucket_name, object_key).delete()

        tables_created.append((target_schema, "{}".format(stg_tble)))

    return tables_created
def generate_moving_violations_table(AWS_Credentials: dict, **kwargs):

    # if no environment is specified default to dev
    env = kwargs.get('env', None)
    if env is None:
        env = 'DEV'
    env = env.upper()

    # set up RDS and S3 connections, engines, cursors
    region = AWS_Credentials['region']
    engine = create_postgres_engine(destination="AWS_PostGIS", env=env)

    # First move all source data records to a temp table
    step_1_query = """

    CREATE TABLE IF NOT EXISTS tmp.moving_violations_need_geo as 
    SELECT * FROM source_data.moving_violations
    WHERE geography IS NULL;

    CREATE TABLE IF NOT EXISTS tmp.moving_violations_has_geo as 
    SELECT * FROM source_data.moving_violations
    WHERE geography IS NOT NULL;

    CREATE INDEX IF NOT EXISTS mv_location_index ON tmp.moving_violations_need_geo (location);
    """

    engine.execute(step_1_query)
    print("temp table created")

    # geocode the locations
    records = [
        loc for (loc, ) in engine.execute(
            "select distinct location from tmp.moving_violations_need_geo where geography is null limit 2000"
        ).fetchall()
    ]
    print(len(records), " records passed to geocode function")
    geocode_text(engine=engine,
                 records_to_geocode=records,
                 administrative_area='District of Columbia',
                 text_type='Moving Violations location')

    # update lat and long values from new data
    step_2_query = """
    UPDATE tmp.moving_violations_need_geo
    SET geography = source_data.geocoded_text.point_geography
    FROM source_data.geocoded_text 
    WHERE source_data.geocoded_text.text = location
    ;

    INSERT INTO tmp.moving_violations_has_geo
    SELECT * FROM tmp.moving_violations_need_geo;

    CREATE INDEX IF NOT EXISTS mv_geom_idx ON tmp.moving_violations_has_geo USING GIST (geography);
    """

    engine.execute(step_2_query)
    print("geo values updated")

    # Then execute the same location-info queries (roadway, schools, neighborhoods) that apply to all analysis tables and create the final table
    next_tables = add_location_info(engine=engine,
                                    target_schema='tmp',
                                    target_table='moving_violations_nbh_ward',
                                    from_schema='tmp',
                                    from_table='moving_violations_has_geo',
                                    partition_by_field='objectid')
    print("neighborhood-ward query complete")
    next_tables = add_school_info(engine=engine,
                                  target_schema='tmp',
                                  target_table='moving_violations_schools',
                                  from_schema=next_tables[0],
                                  from_table=next_tables[1])
    print("schools query complete")
    next_tables = add_roadway_info(
        engine=engine,
        target_schema='tmp',
        target_table='moving_violations_roadway_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='objectid',
        within_distance=50)
    print("roadway info query complete")
    next_tables = add_intersection_info(
        engine=engine,
        target_schema='tmp',
        target_table='moving_violations_intersection_info',
        from_schema=next_tables[0],
        from_table=next_tables[1],
        partition_by_field='objectid',
        within_distance=20)
    print("intersection info query complete")
    row_count = create_final_table(engine=engine,
                                   target_schema='analysis_data',
                                   target_table='moving_violations',
                                   from_schema=next_tables[0],
                                   from_table=next_tables[1])
    print("final query complete with row count ", row_count)