def __init__(self, schema, start_date, end_date):
    """Open a raw DB connection bound to an existing feature schema.

    Parameters
    ----------
    schema : str
        Name of the feature schema to read from; must already exist
        (validated against existing_feature_schemas()).
    start_date, end_date : datetime-like
        Date range stored for later queries.

    Raises
    ------
    SchemaMissing
        If `schema` is not among the existing feature schemas.
    """
    if schema not in existing_feature_schemas():
        raise SchemaMissing(schema)

    # All querying is done using a raw connection.
    # In this connection set to use the relevant schema.
    # This makes sure that we grab the features from the correct schema.
    engine = create_engine(uri)
    self.con = engine.raw_connection()

    # Use a named cursor and close it when done (the original leaked
    # the throwaway cursor). SET SCHEMA applies to the whole session,
    # so later cursors on self.con still see the right schema.
    cur = self.con.cursor()
    cur.execute("SET SCHEMA '{}'".format(schema))
    cur.close()

    self.start_date = start_date
    self.end_date = end_date
def generate_features(features_to_generate, n_months, max_dist,
                      inspection_date=None, insp_set='all_inspections'):
    """Generate labels and features for all inspections in the inspections
    database.

    If inspection_date is passed, features will be generated as if an
    inspection will occur on that day.

    Parameters
    ----------
    features_to_generate : iterable
        Feature definitions; each element exposes a ``table`` name and a
        ``generator_function`` called as ``generator_function(con, n_months,
        max_dist)`` for spatiotemporal features, or ``generator_function(con)``
        otherwise.
    n_months : int
        Temporal window, in months, for spatiotemporal features.
    max_dist : number
        Spatial radius, in meters, for spatiotemporal features.
    inspection_date : datetime.date or None
        If given, fake/field-test inspections are generated for that date
        and a date-suffixed schema is used.
    insp_set : str
        Either 'all_inspections' or 'field_test'; selects the schema family.

    Raises
    ------
    ValueError
        If ``insp_set`` is not one of the two recognized values.
    """
    # Select schema depending on the value of inspection date
    if insp_set == 'all_inspections':
        if inspection_date is None:
            schema = "features"
        else:
            schema = "features_{}".format(
                inspection_date.strftime('%d%b%Y')).lower()
    elif insp_set == 'field_test':
        if inspection_date is None:
            schema = "features_field_test"
        else:
            schema = "features_field_test_{}".format(
                inspection_date.strftime('%d%b%Y')).lower()
    else:
        # BUG FIX: previously an unknown insp_set left `schema` unbound
        # and caused a confusing NameError further down.
        raise ValueError("Unknown insp_set: {}".format(insp_set))

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection. in this
    # connection set to use the relevant schema
    # this makes sure that we grab the "inspections_parcels"
    # table from the correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        # Create schema here
        # Note on SQL injection: schema is either features or features_DATE;
        # the date is generated with strftime, so an injection attempt in
        # inspection_date fails before reaching SQL.
        cur = con.cursor()
        cur.execute("CREATE SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    con.cursor().execute("SET SCHEMA '{}'".format(schema))

    # Print the current schema by reading it from the db
    cur = con.cursor()
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    # BUG FIX: log message said "n_monts"
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema,
                                                    n_months, max_dist))

    # Get existing tables
    existing_tables = tables_in_schema(con, schema)

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        else:
            if insp_set == 'all_inspections':
                inspections = outcome.make_fake_inspections_all_parcels_cincy(
                    inspection_date)
            elif insp_set == 'field_test':
                inspections = outcome.load_inspections_from_field_test(
                    inspection_date)

        inspections.to_sql("parcels_inspections", engine, chunksize=50000,
                           if_exists='fail', index=False, schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))

        # Create an index to make joins with events_Xmonths_* tables faster.
        # BUG FIX: the indexes were created on the hard-coded
        # "features.parcels_inspections", so for dated schemas they hit the
        # wrong schema's table. SET SCHEMA above makes the unqualified name
        # resolve to the schema selected for this run.
        cur.execute('CREATE INDEX ON parcels_inspections (parcel_id);')
        cur.execute('CREATE INDEX ON parcels_inspections (inspection_date);')
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        # Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table,
                                                    n_months, max_dist))
            feature_data = feature.generator_function(con, n_months, max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table, max_dist,
                                                     n_months)
        # If it fails, feature is not spatiotemporal, send only connection
        # (fixed Python-2-only "except Exception, e" syntax)
        except Exception as e:
            table_to_save = feature.table
            logging.info("Failed to call function with months "
                         "and dist: {}".format(str(e)))
            feature_data = feature.generator_function(con)

        # Every generator function must have a column with parcel_id,
        # inspection_date and the correct number of rows as their
        # corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            logger.info('Features table {} already exists. '
                        'Skipping...'.format(feature.table))
        else:
            feature_data.to_sql(
                table_to_save, engine, chunksize=50000,
                if_exists='replace', index=True, schema=schema,
                # Force saving inspection_date as timestamp without timezone
                dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
            logging.debug("{} table has {} rows".format(table_to_save,
                                                        len(feature_data)))
# NOTE(review): this is a second definition of generate_features in the same
# module; at import time it shadows the first one above. The two should be
# consolidated — this later version is the corrected one (unqualified index
# names, search_path set for ST_DWithin, unconditional replace of feature
# tables).
def generate_features(features_to_generate, n_months, max_dist,
                      inspection_date=None, insp_set='all_inspections'):
    """Generate labels and features for all inspections in the inspections
    database.

    If inspection_date is passed, features will be generated as if an
    inspection will occur on that day.

    Parameters
    ----------
    features_to_generate : iterable
        Feature definitions; each element exposes a ``table`` name and a
        ``generator_function`` called as ``generator_function(con, n_months,
        max_dist)`` for spatiotemporal features, or ``generator_function(con)``
        otherwise.
    n_months : int
        Temporal window, in months, for spatiotemporal features.
    max_dist : number
        Spatial radius, in meters, for spatiotemporal features.
    inspection_date : datetime.date or None
        If given, fake/field-test inspections are generated for that date
        and a date-suffixed schema is used.
    insp_set : str
        Either 'all_inspections' or 'field_test'; selects the schema family.
    """
    # select schema depending on the value of inspection date
    if insp_set == 'all_inspections':
        if inspection_date is None:
            schema = "features"
        else:
            schema = "features_{}".format(
                inspection_date.strftime('%d%b%Y')).lower()
    elif insp_set == 'field_test':
        if inspection_date is None:
            schema = "features_field_test"
        else:
            schema = "features_field_test_{}".format(
                inspection_date.strftime('%d%b%Y')).lower()

    # use this engine for all data storing (somehow does
    # not work with the raw connection we create below)
    engine = create_engine(uri)

    # all querying is done using a raw connection. in this
    # connection set to use the relevant schema
    # this makes sure that we grab the "inspections_parcels"
    # table from the correct schema in all feature creators
    con = engine.raw_connection()

    if schema not in existing_feature_schemas():
        # Create schema here
        # Note on SQL injection: schema is either features or features_DATE;
        # the date is generated with strftime, so an injection attempt in
        # inspection_date fails before reaching SQL.
        cur = con.cursor()
        cur.execute("CREATE SCHEMA %s;" % schema)
        con.commit()
        cur.close()
        logging.info('Creating schema %s' % schema)
    else:
        logging.info('Using existing schema')

    con.cursor().execute("SET SCHEMA '{}'".format(schema))

    # Print the current schema by reading it from the db
    cur = con.cursor()
    cur.execute('SELECT current_schema;')
    current_schema = cur.fetchone()[0]
    logger.info(('Starting feature generation in {}. '
                 'n_months={}. max_dist={}').format(current_schema,
                                                    n_months, max_dist))

    # Get existing tables
    existing_tables = tables_in_schema(schema)

    # set the search path, otherwise won't find ST_DWithin()
    cur = con.cursor()
    cur.execute("SET search_path TO {schema}, public;".format(schema=schema))
    con.commit()

    # make a new table that contains one row for every parcel in Cincinnati
    # this table has three columns: parcel_id, inspection_date, viol_outcome
    # inspection_date is the one given as a parameter and
    # is the same for all parcels
    if 'parcels_inspections' not in existing_tables:
        logger.info('Creating parcels_inspections table...')

        if inspection_date is None:
            inspections = outcome.generate_labels()
        else:
            if insp_set == 'all_inspections':
                inspections = outcome.make_fake_inspections_all_parcels_cincy(
                    inspection_date)
            elif insp_set == 'field_test':
                inspections = outcome.load_inspections_from_field_test(
                    inspection_date)

        inspections.to_sql("parcels_inspections", engine, chunksize=50000,
                           if_exists='fail', index=False, schema=schema)
        logging.debug("... table has {} rows".format(len(inspections)))

        # Create an index to make joins with events_Xmonths_* tables faster
        cur.execute('CREATE INDEX ON parcels_inspections (parcel_id);')
        cur.execute('CREATE INDEX ON parcels_inspections (inspection_date);')
        cur.execute(
            'CREATE INDEX ON parcels_inspections (parcel_id, inspection_date);'
        )
        con.commit()
    else:
        logger.info('parcels_inspections table already exists, skipping...')

    for feature in features_to_generate:
        logging.info("Generating {} features".format(feature.table))
        # Try generating features with the n_months argument
        try:
            logging.info(("Generating {} "
                          "features for {} months "
                          "and within {} m").format(feature.table,
                                                    n_months, max_dist))
            feature_data = feature.generator_function(con, n_months, max_dist)
            table_to_save = '{}_{}m_{}months'.format(feature.table, max_dist,
                                                     n_months)
        # If it fails, feature is not spatiotemporal, send only connection
        # (fixed Python-2-only "except Exception, e" syntax)
        except Exception as e:
            table_to_save = feature.table
            logging.info(
                "Failed to call function with months and dist: {}".format(
                    str(e)))
            feature_data = feature.generator_function(con)

        # Every generator function must have a column with parcel_id,
        # inspection_date and the correct number of rows as their
        # corresponding parcels_inspections table in the schema being used
        # TO DO: check that feature_data has the right shape and indexes
        if table_to_save in existing_tables:
            logger.info(
                'Features table {} already exists. Replacing...'.format(
                    feature.table))
        # Always (re)write: if_exists='replace' drops and recreates the table
        feature_data.to_sql(
            table_to_save, engine, chunksize=50000,
            if_exists='replace', index=True, schema=schema,
            # Force saving inspection_date as timestamp without timezone
            dtype={'inspection_date': types.TIMESTAMP(timezone=False)})
        logging.debug("{} table has {} rows".format(table_to_save,
                                                    len(feature_data)))