def make_permits_features(con, n_months, max_dist):
    """
    Make permits features.

    NOTE(review): a later function in this file re-defines
    ``make_permits_features``; at import time that later definition
    shadows this one, so this wrapper is effectively dead code.
    Confirm which implementation is intended to survive.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window parameter forwarded to the helper queries.
        max_dist: distance parameter forwarded to the helper queries.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'permits'
    date_column = 'issueddate'

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    # Build (or re-use) the inspection-to-nearby-permits linkage table
    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    freq = group_and_count_from_db(con, dataset, n_months, max_dist)

    # Rename columns to avoid spaces and capital letters
    freq.columns = format_column_names(freq.columns)

    return freq
def make_crime_features(con, n_months, max_dist):
    """
    Make crime features.

    Builds, entirely in SQL: (1) a pruning table of the 15 most frequent
    "combined" offense codes (extracted from the `orc` column with a
    regex), then (2) per-(parcel_id, inspection_date) counts of nearby
    crime events, both total and per offense level, pivoted into one
    column per level via colpivot().

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2crime linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2crime linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.

    NOTE(review): this function uses the colpivot() SQL function but,
    unlike the sibling feature builders (fire, permits, sales), does not
    call load_colpivot(con) itself -- confirm colpivot is guaranteed to
    be loaded before this runs.
    """
    dataset = 'crime'
    date_column = 'occurred_on'

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # keep only the max_rnum most frequent offense levels; everything
    # else gets lumped into 'other'
    max_rnum = 15
    coalescemissing = "'missing'"  # pre-quoted: spliced verbatim into SQL

    # make a table of the more general offense frequencies so we can prune them
    # also include a column with an array of corresponding detailed levels
    # NOTE(review): the regex ' \((\w*)\) ' lives in a non-raw Python
    # string; it works because Python passes unknown escapes through
    # unchanged, but a raw string would be safer.
    query = """
        DROP TABLE IF EXISTS public.frequentcrimes_orc;
        CREATE TABLE public.frequentcrimes_orc AS (
            WITH t as (
                SELECT coalesce(substring(orc from ' \((\w*)\) '),
                                {coalescemissing}) as orc_combined,
                       array_agg(distinct orc) as all_orcs,
                       count(*) as count
                FROM public.crime
                GROUP BY orc_combined
                ORDER BY count desc
            )
            SELECT row_number() OVER () as rnum,
                   t.orc_combined,
                   t.all_orcs,
                   CASE WHEN row_number() OVER () <= {rnum}
                        THEN t.orc_combined
                        ELSE 'other'
                   END AS level
            FROM t
        );""".format(rnum=max_rnum, coalescemissing=coalescemissing)

    cur = con.cursor()
    cur.execute(query)
    con.commit()

    query = """
        DROP TABLE IF EXISTS crimefeatures1_{n_months}months_{max_dist}m;
        DROP TABLE IF EXISTS joinedcrime_{n_months}months_{max_dist}m;

        -- join the inspections and crime
        CREATE TEMP TABLE joinedcrime_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   coalesce(substring(event.orc from ' \((\w*)\) '),
                            {coalescemissing}) as orc_combined
            FROM insp2crime_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.crime s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedcrime_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE crimefeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, count(*) as total
            FROM joinedcrime_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON crimefeatures1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE crimefeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            -- restrict crime levels to the 15 most common ones,
            -- using the tables of frequency counts for these levels
            -- that we created earlier
            -- also make sure all 15 levels appear
            SELECT t2.parcel_id, t2.inspection_date,
                   'orc_combined_'||t2.level AS categ,
                   coalesce(t1.count,0) as count
            FROM (SELECT parcel_id, inspection_date, ft.level,
                         count(*) as count
                  FROM joinedcrime_{n_months}months_{max_dist}m event
                  LEFT JOIN public.frequentcrimes_orc ft
                  ON ft.orc_combined = event.orc_combined
                  GROUP BY parcel_id, inspection_date, ft.level
                 ) t1
            RIGHT JOIN (SELECT parcel_id, inspection_date, ft.level
                        FROM parcels_inspections
                        JOIN (select distinct level
                              from public.frequentcrimes_orc) ft
                        ON true
                       ) t2
            USING (parcel_id, inspection_date,level)
        ;
        CREATE INDEX ON crimefeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('crimepivot_{n_months}months_{max_dist}m',
                        'select * from crimefeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON crimepivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS crimefeatures_{n_months}months_{max_dist}m;
        CREATE TABLE crimefeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM crimefeatures1_{n_months}months_{max_dist}m
            JOIN crimepivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM crimefeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table crimefeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_three11_features(con, n_months, max_dist):
    """
    Make three11 (311 service request) features.

    Builds, in SQL: per-(parcel_id, inspection_date) counts of nearby
    311 requests (total, web-request share) and per-service_code counts
    restricted to the 15 most frequent codes, pivoted into one column
    per code via colpivot().

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2three11 linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2three11 linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'three11'
    date_column = 'requested_datetime'

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    # NOTE: three11 links via lat/long, not address, unlike most siblings
    make_inspections_latlong_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    max_rnum = 15

    logger.info('Computing distance features for {}'.format(dataset))

    coalescemissing = "'missing'"  # needs to be double-quoted cause SQL-injection

    # frequent service_codes, so we can prune them (there are too many)
    # NOTE(review): this call spells the keyword `coalesceto=`, while the
    # permits/sales/fire builders spell it `coalesce_to=` -- one of the
    # two spellings must be a TypeError against the helper's real
    # signature; confirm which and unify.
    make_table_of_frequent_codes(
        con, col='service_code',
        intable='public.three11',
        outtable='public.frequentthree11_service_code',
        rnum=max_rnum,
        coalesceto=coalescemissing)

    cur = con.cursor()
    query = """
        DROP TABLE IF EXISTS three11features1_{n_months}months_{max_dist}m;
        DROP TABLE IF EXISTS joinedthree11_{n_months}months_{max_dist}m;

        -- join the inspections and three11
        CREATE TEMP TABLE joinedthree11_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   agency_responsible, status,
                   coalesce(service_code,{coalescemissing}) as service_code,
                   CASE WHEN description='Request entered through the Web. Refer to Intake Questions for further description.'
                        THEN 1 ELSE 0
                   END AS webrequest
            FROM insp2three11_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.three11 s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedthree11_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE three11features1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   sum(webrequest) as sum_webrequest,
                   avg(webrequest) as avg_webrequest,
                   count(*) as total
            FROM joinedthree11_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON three11features1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE three11features2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            -- restrict three11 levels to the 15 most common ones,
            -- using the tables of frequency counts for these levels
            -- that we created earlier
            -- also make sure all levels always appear
            SELECT t2.parcel_id, t2.inspection_date,
                   'service_code_'||t2.level AS categ,
                   coalesce(t1.count,0) as count
            FROM (SELECT parcel_id, inspection_date, ft.level,
                         count(*) as count
                  FROM joinedthree11_{n_months}months_{max_dist}m event
                  LEFT JOIN public.frequentthree11_service_code ft
                  ON ft.raw_level = event.service_code
                  GROUP BY parcel_id, inspection_date, ft.level
                 ) t1
            RIGHT JOIN (SELECT parcel_id, inspection_date, ft.level
                        FROM parcels_inspections
                        JOIN (select distinct level
                              from public.frequentthree11_service_code) ft
                        ON true
                       ) t2
            USING (parcel_id, inspection_date,level)
        ;
        CREATE INDEX ON three11features2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('three11pivot_{n_months}months_{max_dist}m',
                        'select * from three11features2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON three11pivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS three11features_{n_months}months_{max_dist}m;
        CREATE TABLE three11features_{n_months}months_{max_dist}m AS
            SELECT * FROM three11features1_{n_months}months_{max_dist}m
            JOIN three11pivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM three11features_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table three11features_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_permits_features(con, n_months, max_dist):
    """
    Make permits features.

    Builds, in SQL: per-(parcel_id, inspection_date) aggregates over
    nearby building permits (counts, date-interval averages, cost/size
    averages) plus dummified counts for five categorical permit columns
    (restricted to their 15 most frequent levels), pivoted into one
    column per level via colpivot().

    Fixes vs previous revision:
      * added DROP TABLE IF EXISTS before the final CREATE TABLE so
        re-runs don't fail (matches the crime/three11/sales builders);
      * removed an unused `insp2tablename` local and a duplicate
        `con.cursor()` call.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2permits linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2permits linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'permits'
    date_column = 'issueddate'

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    cur = con.cursor()

    # create a table of the most common levels for each categorical
    # permit column, so we can limit the pivot later to the 15 most
    # common types of uses
    cols = [
        'proposeduse', 'statuscurrent', 'workclass', 'permitclass',
        'permittype'
    ]
    coalescemissing = "'missing'"
    for col in cols:
        make_table_of_frequent_codes(
            con, col=col,
            intable='public.permits',
            outtable='public.frequentpermit_%s' % col,
            rnum=15,
            coalesce_to=coalescemissing)

    # Template producing, for one categorical column, a
    # (parcel_id, inspection_date, categ, count) table restricted to
    # the column's frequent levels; the per-column pieces get
    # UNION ALL'ed together and pivoted below.
    unionall_template = """
        SELECT parcel_id, inspection_date,
               '{col}_'||coalesce(t2.level,{coalescemissing}) as categ,
               coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date, fs.level, count(*) as count
            FROM joinedpermits_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentpermit_{col} fs
            ON fs.raw_level = coalesce(event.{col},{coalescemissing})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level
                   FROM public.frequentpermit_{col} ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)
    """

    # first column stands alone; the rest are appended as UNION ALL
    unionall_statements = unionall_template.format(
        col=cols[0],
        n_months=str(n_months),
        max_dist=str(max_dist),
        coalescemissing=coalescemissing) + \
        '\n'.join([
            'UNION ALL ( %s )' % unionall_template.format(
                col=col,
                n_months=str(n_months),
                max_dist=str(max_dist),
                coalescemissing=coalescemissing)
            for col in cols[1:]
        ])

    query = """
        DROP TABLE IF EXISTS permitfeatures1_{n_months}months_{max_dist}m;

        CREATE TEMP TABLE permitfeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   count(*) as total,
                   avg(completeddate-applieddate) as avg_days_applied_to_completed,
                   avg(completeddate-issueddate) as avg_days_issued_to_completed,
                   avg(issueddate-applieddate) as avg_days_applied_to_issued,
                   avg(expiresdate-issueddate) as avg_days_issued_to_expires,
                   avg(expiresdate-completeddate) as avg_days_completed_to_expires,
                   avg(CASE WHEN issueddate IS NOT NULL THEN 1 ELSE 0 END) as avg_issued,
                   avg(CASE WHEN completeddate IS NOT NULL THEN 1 ELSE 0 END) as avg_completed,
                   avg(CASE WHEN expiresdate IS NOT NULL THEN 1 ELSE 0 END) as avg_expires,
                   avg(totalsqft) as avg_sqft,
                   avg(estprojectcostdec) as avg_estcost,
                   avg(units) as avg_units,
                   avg(CASE WHEN coissueddate IS NOT NULL THEN 1 ELSE 0 END) as avg_is_coissued,
                   avg(substring(fee from 2)::real) as avg_fee,
                   avg(CASE WHEN companyname='OWNER' THEN 1 ELSE 0 END) as avg_owner_is_company
            FROM insp2permits_{n_months}months_{max_dist}m i2e
            LEFT JOIN public.permits event
            USING (id)
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON permitfeatures1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE joinedpermits_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.*
            FROM insp2permits_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.permits s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedpermits_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Join the permits with the inspections; then concatenate the
        -- inspections and the various categorical variables (we'll pivot later)
        CREATE TEMP TABLE permitfeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
        {unionall_statements};
        CREATE INDEX ON permitfeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('permitpivot_{n_months}months_{max_dist}m',
                        'select * from permitfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON permitpivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table;
        -- drop any stale copy first so re-runs don't fail
        DROP TABLE IF EXISTS permitfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE permitfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM permitfeatures1_{n_months}months_{max_dist}m
            JOIN permitpivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               unionall_statements=unionall_statements)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM permitfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table permitfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_sales_features(con, n_months, max_dist):
    """
    Make sales features.

    Builds, in SQL: per-(parcel_id, inspection_date) averages over
    numeric sales columns for nearby sales, plus dummified counts for
    several categorical columns (restricted to their 15 most frequent
    levels, with use_code handled specially because it is an int),
    pivoted into one column per level via colpivot().

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2sales linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2sales linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'sales'
    date_column = 'date_of_sale'

    # NOTE(review): insp2tablename is computed but never used below --
    # candidate for removal.
    insp2tablename = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset='sales',
                                             n_months=str(n_months),
                                             max_dist=str(max_dist))

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    # there are several columns that we need to prune in terms of codes;
    # thus, make tables of value counts
    rnum = 15
    coalescemissing_use_code = "11111"  # use_code is an int, so hack this
    coalescemissing = "'missing'"

    to_dummify_columns = [
        'instrument_type', 'garage_type', 'style', 'grade',
        'exterior_wall_type', 'basement', 'heating', 'air_conditioning'
    ]
    for col in to_dummify_columns:
        make_table_of_frequent_codes(
            con, col=col,
            intable='public.sales',
            outtable='public.frequentsales_%s' % col,
            rnum=rnum,
            coalesce_to=coalescemissing)

    # use_code needs special treatment because it's an int
    # NOTE(review): this call spells the keyword `coalesceto=`, while the
    # loop just above uses `coalesce_to=` against the same helper -- one
    # of the two spellings must be a TypeError; confirm which and unify.
    make_table_of_frequent_codes(
        con, col='use_code',
        intable='public.sales',
        outtable='public.frequentsales_use_code',
        coalesceto=coalescemissing_use_code,
        rnum=rnum,
        to_other="9999")

    cur = con.cursor()

    # let's generate all the 'simple' features we might want;
    # each column will be named similar to 'avg_total_rooms'
    coltemplate = "{fun}({col}) AS {fun}_{col}"
    cols = [
        'number_of_parcels', 'appraisal_area', 'total_sales_records',
        'sale_price', 'total_rooms', 'full_bath', 'half_bath',
        'fireplaces', 'garage_capacity',
        'num_stories', 'year_built', 'finished_sq_ft',
        'total_finish_area', 'first_floor_area', 'half_floor_area',
        'finished_basement'
    ]
    funs = [
        'avg'
    ]  # ,'sum','min','max','stddev'] # could do more, but probably not necessary
    featureselects = ',\n'.join(
        coltemplate.format(fun=f, col=c)
        for f, c in itertools.product(funs, cols))

    # This is a template for a pivot table. In the sales table, we have
    # several categorical columns. We need to pivot these into columns,
    # with counts grouped by parcel_id and inspection_date.
    # As a first step, we make a table for each categorical column that
    # we want to pivot. Each such table has columns
    # (parcel_id, inspection_date, categ, count), where categ is the
    # level of our categorical column, and count is the number of times
    # that level appears for index (parcel_id, inspection_date). (We
    # create a new level for 'null' rows.)
    # Here, we just define a template for this table query; we'll use it
    # below. {col} will be the categorical column name;
    # joinedsales_Xmonths_Ym a join between sales and
    # insp2sales_Xmonths_Ym.
    unionall_template = """
        SELECT parcel_id, inspection_date,
               '{col}_'||coalesce(t2.level,{coalescemissing}) as categ,
               coalesce(t1.count, 0) as count
        FROM (
            SELECT parcel_id, inspection_date, fs.level, count(*) as count
            FROM joinedsales_{n_months}months_{max_dist}m event
            LEFT JOIN public.frequentsales_{col} fs
            ON fs.raw_level = coalesce(event.{col},{coalescemissing})
            GROUP BY parcel_id, inspection_date, fs.level
        ) t1
        RIGHT JOIN (
            SELECT parcel_id, inspection_date, t.level
            FROM parcels_inspections
            JOIN ( SELECT distinct level
                   FROM public.frequentsales_{col} ) t
            ON true
        ) t2
        USING (parcel_id, inspection_date, level)
    """

    unionall_statements = '\n'.join([
        'UNION ALL ( %s )' % unionall_template.format(
            col=col,
            n_months=str(n_months),
            max_dist=str(max_dist),
            coalescemissing=coalescemissing)
        for col in to_dummify_columns
    ])

    query = """
        DROP TABLE IF EXISTS salesfeatures1_{n_months}months_{max_dist}m;
        DROP TABLE IF EXISTS joinedsales_{n_months}months_{max_dist}m;

        -- join the inspections and sales
        CREATE TEMP TABLE joinedsales_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.*
            FROM insp2sales_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.sales s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedsales_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the simple features
        CREATE TEMP TABLE salesfeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT parcel_id, inspection_date,
                   count(*) as total,
                   {featureselects}
            FROM joinedsales_{n_months}months_{max_dist}m event
            GROUP BY parcel_id, inspection_date;
        CREATE INDEX ON salesfeatures1_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- make the categorical (dummified) features
        CREATE TEMP TABLE salesfeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            -- now, we have a few columns with too many levels; we
            -- restrict these levels to the 15 most common ones, using
            -- the tables of frequency counts for these levels that we
            -- created earlier
            -- use_code is special, as it's an int (and we want it as varchar)
            SELECT parcel_id, inspection_date,
                   'use_code_'||coalesce(t2.level::varchar,'missing') as categ,
                   coalesce(t1.count, 0) as count
            FROM (
                SELECT parcel_id, inspection_date, fs.level, count(*) as count
                FROM joinedsales_{n_months}months_{max_dist}m event
                LEFT JOIN public.frequentsales_use_code fs
                ON fs.raw_level = coalesce(event.use_code,{coalescemissing_use_code})
                GROUP BY parcel_id, inspection_date, fs.level
            ) t1
            RIGHT JOIN (
                SELECT parcel_id, inspection_date, t.level
                FROM parcels_inspections
                JOIN ( SELECT distinct level
                       FROM public.frequentsales_use_code ) t
                ON true
            ) t2
            USING (parcel_id, inspection_date, level)
            {unionall_statements}
            -- these are all the columns that we defined above
        ;
        CREATE INDEX ON salesfeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('salespivot_{n_months}months_{max_dist}m',
                        'select * from salesfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['categ'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON salespivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS salesfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE salesfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM salesfeatures1_{n_months}months_{max_dist}m
            JOIN salespivot_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(n_months=str(n_months),
               max_dist=str(max_dist),
               featureselects=featureselects,
               coalescemissing_use_code=coalescemissing_use_code,
               unionall_statements=unionall_statements)

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM salesfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table salesfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_inspections_features(con, n_months, max_dist):
    """
    Make inspections features.

    First ensures a per-parcel count of nearby houses exists
    (insp2houses_{max_dist}m), then counts nearby past inspection events
    per (parcel_id, inspection_date, event type), both raw and
    regularized by the number of nearby houses, and pivots the events
    into columns via colpivot().

    Fix vs previous revision: uses the module-level `logger` (as every
    sibling feature builder does) instead of the root logger via
    `logging.info`.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size, spliced into the SQL interval and
            intermediate table names.
        max_dist: distance parameter, spliced into the ST_DWithin radius
            (multiplied by 3.281 -- presumably a meters-to-feet
            conversion; confirm units of the geometry columns) and into
            table names.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'inspections_views.events_parcel_id'
    date_column = 'date'

    ## ------------------------------------------------------------------------
    ## Make the parcel_id-to-nearby-houses table, if it's not there yet.
    ## ------------------------------------------------------------------------

    query = """
        CREATE TABLE insp2houses_{max_dist}m AS
            SELECT feature_y.parcel_id, count(*) as parcels
            FROM (
                SELECT t.parcel_id, p.geom
                FROM (SELECT DISTINCT parcel_id FROM parcels_inspections) t
                LEFT JOIN shape_files.parcels_cincy p
                ON t.parcel_id=p.parcelid
            ) feature_y
            LEFT JOIN shape_files.parcels_cincy parcels
            ON ST_DWithin(feature_y.geom, parcels.geom,
                          {max_dist}*3.281::double precision)
            AND feature_y.parcel_id <> parcels.parcelid
            GROUP BY feature_y.parcel_id
        ;
        CREATE INDEX ON insp2houses_{max_dist}m (parcel_id);
    """.format(max_dist=max_dist)

    # Create a cursor
    cur = con.cursor()

    # Get the current schema
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]

    # Build the table name
    table_name = 'insp2houses_{max_dist}m'.format(max_dist=max_dist)

    # check if table already exists in current schema;
    # if not, create it
    if table_name not in tables_in_schema(current_schema):
        logger.info("Table %s does not exist yet, generating." % table_name)
        cur.execute(query)
    else:
        logger.info("Table %s already exists, skipping." % table_name)
    con.commit()

    ## ------------------------------------------------------------------------
    ## Make the table of nearby events, and the features.
    ## ------------------------------------------------------------------------

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    query = """
        DROP TABLE IF EXISTS inspfeatures1_{n_months}months_{max_dist}m;
        CREATE TEMP TABLE inspfeatures1_{n_months}months_{max_dist}m
        ON COMMIT DROP AS
            SELECT t2.parcel_id, t2.inspection_date, t2.event,
                   coalesce(t1.count, 0) as count,
                   (coalesce(t1.count, 0)+1.0) / (coalesce(t2.parcels,0)+5.0)
                       as regularized_count_per_houses
            FROM (
                SELECT feature_y.parcel_id, feature_y.inspection_date,
                       coalesce(realinspections.event,'missing') as event,
                       count(*) as count
                FROM (
                    SELECT t.*, p.geom, ih.parcels
                    FROM parcels_inspections t
                    LEFT JOIN shape_files.parcels_cincy p
                    ON t.parcel_id=p.parcelid
                    LEFT JOIN insp2houses_{max_dist}m ih
                    USING (parcel_id)
                ) feature_y
                JOIN (
                    SELECT insp.*, p.geom
                    FROM inspections_views.events_parcel_id insp
                    JOIN shape_files.parcels_cincy p
                    ON insp.parcel_no=p.parcelid
                ) realinspections
                ON realinspections.date < feature_y.inspection_date
                AND (feature_y.inspection_date - '{n_months} month'::interval)
                    <= realinspections.date
                AND ST_DWithin(feature_y.geom, realinspections.geom,
                               {max_dist}*3.281::double precision)
                WHERE feature_y.inspection_date
                    BETWEEN '{min_date}' AND '{max_date}'
                GROUP BY feature_y.parcel_id, feature_y.inspection_date,
                         realinspections.event
            ) t1
            RIGHT JOIN (SELECT parcel_id, inspection_date, ft.event, parcels
                        FROM parcels_inspections
                        JOIN (select distinct coalesce(event,'missing') as event
                              from inspections_views.events_parcel_id) ft
                        ON true
                        JOIN insp2houses_{max_dist}m USING (parcel_id)
                       ) t2
            USING (parcel_id, inspection_date, event)
        ;

        CREATE TEMP TABLE inspfeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS (
            SELECT parcel_id, inspection_date, event, count
            FROM inspfeatures1_{n_months}months_{max_dist}m
            UNION ALL (
                SELECT parcel_id, inspection_date,
                       event||'_per_houses' as event,
                       regularized_count_per_houses AS count
                FROM inspfeatures1_{n_months}months_{max_dist}m
            )
        );
        CREATE INDEX ON inspfeatures2_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different inspection events
        SELECT colpivot('insppivot_{n_months}months_{max_dist}m',
                        'select * from inspfeatures2_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['event'],
                        '#.count',
                        null
        );
        -- Note: Not coalescing the counts, as the _per_houses shouldn't be
        -- set to 0. We'll have to leave it to later imputation.
        CREATE INDEX ON insppivot_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- still need to 'save' the tables into a permanent table
        DROP TABLE IF EXISTS inspfeatures_{n_months}months_{max_dist}m;
        CREATE TABLE inspfeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM insppivot_{n_months}months_{max_dist}m ip1
        ;
    """.format(n_months=str(n_months),
               max_dist=max_dist,
               min_date=str(min_insp),
               max_date=str(max_insp))

    cur.execute(query)
    con.commit()

    # fetch the data
    query = """
        SELECT * FROM inspfeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=max_dist)
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table inspfeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df
def make_fire_features(con, n_months, max_dist):
    """
    Make Fire features.

    Builds, in SQL: per-(parcel_id, inspection_date) counts of nearby
    fire incidents per incident type (restricted to the 15 most frequent
    types, pivoted into columns via colpivot()) plus simple aggregates
    of the unit clear time.

    Fix vs previous revision: the exception log used ``e.message``,
    which was removed in Python 3 (and deprecated since 2.6); it now
    uses ``str(e)``, which works on both Python 2 and 3.

    Input:
        con: connection to postgres database. "set schema ..." must have
            been called on this connection to select the correct schema
            from which to load inspections.
        n_months: time-window size; interpolated into intermediate table
            names and used to select the insp2fire linkage table.
        max_dist: distance parameter; interpolated into intermediate
            table names and used to select the insp2fire linkage table.

    Output:
        A pandas dataframe, with one row per inspection and one column
        per feature.
    """
    dataset = 'fire'
    date_column = 'incident_date'
    coalescemissing = "'missing'"

    # Get the time window for which you can generate features
    min_insp, max_insp = check_date_boundaries(con, n_months, dataset,
                                               date_column)

    make_inspections_address_nmonths_table(con, dataset, date_column,
                                           min_insp, max_insp,
                                           n_months=n_months,
                                           max_dist=max_dist, load=False)

    logger.info('Computing distance features for {}'.format(dataset))

    insp2tablename = ('insp2{dataset}_{n_months}months'
                      '_{max_dist}m').format(dataset='fire',
                                             n_months=str(n_months),
                                             max_dist=str(max_dist))

    # add the colpivot function to our Postgres schema
    load_colpivot(con)

    cur = con.cursor()

    # create a table of the most common fire types,
    # so we can limit the pivot later to the 15 most common
    # types of incidents
    make_table_of_frequent_codes(con, col='incident_type_desc',
                                 intable='public.fire',
                                 outtable='public.frequentfiretypes',
                                 coalesce_to=coalescemissing,
                                 rnum=15)

    # also make sure that the fire data has an index on the description,
    # as we want to join on it
    query = """
        CREATE INDEX firetype_idx ON public.fire (incident_type_desc);
    """
    try:
        cur.execute(query)
        con.commit()
    except (InternalError, ProgrammingError) as e:
        # index probably exists already; log and keep going
        logger.warning("Catching Exception: " + str(e))
        logger.warning(" - CONTINUING, NOT RE-RUNNING firetype_idx QUERY.....")
        con.rollback()

    # now on to the actual feature generation
    # NOTE(review): the final CREATE TABLE firefeatures_... selects FROM
    # a table of the same name -- this relies on colpivot's TEMP table
    # shadowing the new permanent table during name lookup (pg_temp is
    # searched first). Confirm, and consider giving the pivot output a
    # distinct name as the sibling builders do.
    query = """
        DROP TABLE IF EXISTS firefeatures_{n_months}months_{max_dist}m;

        -- link parcels and events within the right radius
        CREATE TEMP TABLE joinedtable ON COMMIT DROP AS
            SELECT parcel_id, inspection_date, event.*
            FROM insp2fire_{n_months}months_{max_dist}m i2e
            LEFT JOIN LATERAL (
                SELECT * FROM public.fire s where s.id=i2e.id
            ) event
            ON true
        ;
        CREATE INDEX ON joinedtable (parcel_id, inspection_date);

        -- group by inspections and fire types (we'll pivot later)
        -- make sure to include all types
        CREATE TEMP TABLE firetypes_{n_months}months_{max_dist}m
        ON COMMIT DROP AS (
            SELECT t2.parcel_id, t2.inspection_date,
                   'incident_type_'||t2.level AS incident_type_desc,
                   coalesce(t1.count, 0) as count
            FROM (
                SELECT parcel_id, inspection_date,
                       frequentfires.level, count(*) as count
                FROM joinedtable event
                LEFT JOIN public.frequentfiretypes frequentfires
                ON frequentfires.raw_level = coalesce(event.incident_type_desc,
                                                      {coalescemissing})
                GROUP BY parcel_id, inspection_date, frequentfires.level
            ) t1
            RIGHT JOIN (
                SELECT parcel_id, inspection_date, ft.level
                FROM parcels_inspections
                JOIN (SELECT DISTINCT level FROM public.frequentfiretypes) ft
                ON true
            ) t2
            USING (parcel_id, inspection_date, level)
        );
        CREATE INDEX ON firetypes_{n_months}months_{max_dist}m
            (parcel_id, inspection_date);

        -- Now call the pivot function to create columns with the
        -- different fire types
        SELECT colpivot('firefeatures_{n_months}months_{max_dist}m',
                        'select * from firetypes_{n_months}months_{max_dist}m',
                        array['parcel_id','inspection_date'],
                        array['incident_type_desc'],
                        'coalesce(#.count,0)',
                        null
        );
        CREATE INDEX ON firefeatures_{n_months}months_{max_dist}m
            (parcel_id,inspection_date);

        -- now we do some simple features
        DROP TABLE IF EXISTS firefeatures2_{n_months}months_{max_dist}m;
        CREATE TEMP TABLE firefeatures2_{n_months}months_{max_dist}m
        ON COMMIT DROP AS (
            SELECT parcel_id, inspection_date,
                   count(*) as total, -- note that total includes the non-frequent incident types
                   avg( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as avg_clear_time_minutes,
                   max( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as max_clear_time_minutes,
                   min( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as min_clear_time_minutes,
                   stddev( extract(epoch from event.unit_clear_date_time-event.alarm_date_time)::int/60 ) as stddev_clear_time_minutes
            FROM joinedtable event
            GROUP BY parcel_id, inspection_date
        );
        CREATE INDEX ON firefeatures2_{n_months}months_{max_dist}m
            (parcel_id,inspection_date);

        -- The pivot function only creates a temp table,
        -- so we still need to save it into a proper table.
        -- Also, this is a good time to join in the other
        -- features we want.
        CREATE TABLE firefeatures_{n_months}months_{max_dist}m AS
            SELECT * FROM firefeatures_{n_months}months_{max_dist}m
            JOIN firefeatures2_{n_months}months_{max_dist}m
            USING (parcel_id, inspection_date)
        ;
    """.format(insp2tablename=insp2tablename,
               n_months=str(n_months),
               max_dist=str(max_dist),
               coalescemissing=coalescemissing)

    cur.execute(query)
    con.commit()

    query = """
        SELECT * FROM firefeatures_{n_months}months_{max_dist}m;
    """.format(n_months=str(n_months), max_dist=str(max_dist))

    # fetch the data
    df = pd.read_sql(query, con, index_col=['parcel_id', 'inspection_date'])

    # clean up the column names
    df.columns = map(lambda x: x.replace(' ', '_').lower(), df.columns)
    df.columns = map(
        lambda x: ''.join(c for c in x if c.isalnum() or c == '_'),
        df.columns)

    # drop the last interim table
    query = 'drop table firefeatures_{n_months}months_{max_dist}m'.format(
        n_months=str(n_months), max_dist=str(max_dist))
    cur.execute(query)
    con.commit()

    return df