def parcels_derived_features(control, transactions_df): 'return new df by merging df and the geo features' # merge in census tract features census_tract_df = pd.read_csv( control.path_in_parcels_features_census_tract, index_col=0) check_feature_names(transactions_df) check_feature_names(census_tract_df) m1 = transactions_df.merge( census_tract_df, how='inner', left_on=transactions_df[transactions.census_tract], right_on=census_tract_df.census_tract, ) check_feature_names(m1) print 'm1 shape', m1.shape cc('commercial', m1) # merge in zip5 features zip5_df = pd.read_csv(control.path_in_parcels_features_zip5, index_col=0) check_feature_names(m1) check_feature_names(zip5_df) m2 = m1.merge( zip5_df, how='inner', left_on=m1[transactions.zip5], right_on=zip5_df.zip5, ) # remove duplicated field zip5_x, zip5_y assert 'zip5_x' in m2.columns assert 'zip5_y' in m2.columns assert (m2.zip5_x == m2.zip5_y).all() assert 'zip5' not in m2.columns pu.df_remove_column(m2, 'zip5_x') pu.df_rename_column(m2, 'zip5_y', 'zip5') check_feature_names(m2) print 'm2 shape', m2.shape return m2
def parcels_derived_features(control, transactions_df): 'return new df by merging df and the geo features' # merge in census tract features census_tract_df = pd.read_csv(control.path_in_parcels_features_census_tract, index_col=0) check_feature_names(transactions_df) check_feature_names(census_tract_df) m1 = transactions_df.merge( census_tract_df, how='inner', left_on=transactions_df[transactions.census_tract], right_on=census_tract_df.census_tract, ) check_feature_names(m1) print 'm1 shape', m1.shape cc('commercial', m1) # merge in zip5 features zip5_df = pd.read_csv(control.path_in_parcels_features_zip5, index_col=0) check_feature_names(m1) check_feature_names(zip5_df) m2 = m1.merge( zip5_df, how='inner', left_on=m1[transactions.zip5], right_on=zip5_df.zip5, ) # remove duplicated field zip5_x, zip5_y assert 'zip5_x' in m2.columns assert 'zip5_y' in m2.columns assert (m2.zip5_x == m2.zip5_y).all() assert 'zip5' not in m2.columns pu.df_remove_column(m2, 'zip5_x') pu.df_rename_column(m2, 'zip5_y', 'zip5') check_feature_names(m2) print 'm2 shape', m2.shape return m2
def main(argv):
    '''Build the merged transactions file: join deeds, parcels, derived
    parcel features, census features, and geocoding, then write the
    result to control.path_out_transactions as CSV.

    NOTE(review): main() is defined twice in this file with identical
    bodies; the later definition shadows this one at import time.

    Args:
        argv: command-line arguments, forwarded to make_control.
    '''
    control = make_control(argv)
    # tee stdout into the log file
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control
    # NOTE: Organize the computation to minimize memory usage
    # so that this code can run on smaller-memory systems

    def ps(name, value):
        # print a labeled (rows, cols) shape line for a DataFrame
        s = value.shape
        print ' %20s shape (%d, %d)' % (name, s[0], s[1])

    # create dataframes
    n_read_if_test = 10000
    deeds_g_al = deeds.read_g_al(
        control.path,
        n_read_if_test if control.test else None,
    )
    parcels_sfr = parcels.read(
        control.path,
        10000 if control.test else None,
        just_sfr=True,
    )
    ps('original deeds g al', deeds_g_al)
    ps('original parcels sfr', parcels_sfr)
    # augment parcels to include a zip5 field (5-digit zip code)
    # drop samples without a zipcode
    # rationale: we use the zip5 to join the features derived from parcels
    # and zip5 is derived from zipcode
    zipcode_present = parcels_sfr[parcels.zipcode].notnull()
    parcels_sfr = parcels_sfr[zipcode_present]
    parcels.add_zip5(parcels_sfr)
    # augment parcels and deeds to include a better APN
    print 'adding best apn column for parcels'
    new_column_parcels = best_apn(parcels_sfr, parcels.apn_formatted, parcels.apn_unformatted)
    parcels_sfr.loc[:, parcels.best_apn] = new_column_parcels  # generates an ignorable warning
    print 'adding best apn column for deeds'
    new_column_deeds = best_apn(deeds_g_al, deeds.apn_formatted, deeds.apn_unformatted)
    deeds_g_al.loc[:, deeds.best_apn] = new_column_deeds
    ps('revised deeds_g_al', deeds_g_al)
    ps('revised parcels_sfr', parcels_sfr)
    # join the deeds and parcels files
    print 'starting to merge'
    check_feature_names(deeds_g_al)
    check_feature_names(parcels_sfr)
    m1 = deeds_g_al.merge(parcels_sfr, how='inner',
                          left_on=deeds.best_apn, right_on=parcels.best_apn,
                          suffixes=('_deed', '_parcel'))
    check_feature_names(m1)
    # free the source frames as soon as they are merged to limit peak memory
    del deeds_g_al
    del parcels_sfr
    ps('m1 merge deed + parcels', m1)
    # add in derived parcels features
    m2 = parcels_derived_features(control, m1)
    check_feature_names(m2)
    ps('ms added parcels_derived', m2)  # NOTE(review): label 'ms' looks like a typo for 'm2'
    del m1
    # add in census data
    census_features_df = read_census_features(control)
    m3 = m2.merge(
        census_features_df,
        left_on=transactions.census_tract,
        right_on="census_tract",
    )
    # both sides carried a census_tract column; verify the _x/_y pair agree
    # and collapse them back to a single 'census_tract' column
    assert 'census_tract_x' in m3.columns
    assert 'census_tract_y' in m3.columns
    assert (m3.census_tract_x == m3.census_tract_y).all()
    assert 'census_tract' not in m3.columns
    pu.df_remove_column(m3, 'census_tract_x')
    pu.df_rename_column(m3, 'census_tract_y', 'census_tract')
    check_feature_names(m3)
    del m2
    ps('m3 merged census features', m3)
    # add in GPS coordinates
    geocoding_df = read_geocoding(control)
    m4 = m3.merge(
        geocoding_df,
        left_on="best_apn",
        right_on="G APN",
    )
    del geocoding_df
    del m3
    ps('m4 merged geocoding', m4)
    final = m4
    print 'final columns'
    for c in final.columns:
        print c,
    print
    cc('fraction', final)  # verify that fraction_owner_occupied is in the output
    print 'final shape', final.shape
    # write merged,augmented dataframe
    print 'writing final dataframe to csv file'
    final.to_csv(control.path_out_transactions)
    # write out all the column names
    # a '_y' suffix would mean an unexpected column collision survived a
    # merge, so drop into the debugger for inspection
    print 'all column names in final dataframe'
    for name in final.columns:
        print name
        if '_y' in name:
            print 'found strange suffix'
            pdb.set_trace()
    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'
    return
def main(argv):
    '''Build the merged transactions file: join deeds, parcels, derived
    parcel features, census features, and geocoding, then write the
    result to control.path_out_transactions as CSV.

    NOTE(review): this is a verbatim duplicate of an earlier definition
    of main() in this file; this later copy is the one that takes effect
    at import time.  One copy should be deleted.

    Args:
        argv: command-line arguments, forwarded to make_control.
    '''
    control = make_control(argv)
    # tee stdout into the log file
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control
    # NOTE: Organize the computation to minimize memory usage
    # so that this code can run on smaller-memory systems

    def ps(name, value):
        # print a labeled (rows, cols) shape line for a DataFrame
        s = value.shape
        print ' %20s shape (%d, %d)' % (name, s[0], s[1])

    # create dataframes
    n_read_if_test = 10000
    deeds_g_al = deeds.read_g_al(
        control.path,
        n_read_if_test if control.test else None,
    )
    parcels_sfr = parcels.read(
        control.path,
        10000 if control.test else None,
        just_sfr=True,
    )
    ps('original deeds g al', deeds_g_al)
    ps('original parcels sfr', parcels_sfr)
    # augment parcels to include a zip5 field (5-digit zip code)
    # drop samples without a zipcode
    # rationale: we use the zip5 to join the features derived from parcels
    # and zip5 is derived from zipcode
    zipcode_present = parcels_sfr[parcels.zipcode].notnull()
    parcels_sfr = parcels_sfr[zipcode_present]
    parcels.add_zip5(parcels_sfr)
    # augment parcels and deeds to include a better APN
    print 'adding best apn column for parcels'
    new_column_parcels = best_apn(parcels_sfr, parcels.apn_formatted, parcels.apn_unformatted)
    parcels_sfr.loc[:, parcels.best_apn] = new_column_parcels  # generates an ignorable warning
    print 'adding best apn column for deeds'
    new_column_deeds = best_apn(deeds_g_al, deeds.apn_formatted, deeds.apn_unformatted)
    deeds_g_al.loc[:, deeds.best_apn] = new_column_deeds
    ps('revised deeds_g_al', deeds_g_al)
    ps('revised parcels_sfr', parcels_sfr)
    # join the deeds and parcels files
    print 'starting to merge'
    check_feature_names(deeds_g_al)
    check_feature_names(parcels_sfr)
    m1 = deeds_g_al.merge(parcels_sfr, how='inner',
                          left_on=deeds.best_apn, right_on=parcels.best_apn,
                          suffixes=('_deed', '_parcel'))
    check_feature_names(m1)
    # free the source frames as soon as they are merged to limit peak memory
    del deeds_g_al
    del parcels_sfr
    ps('m1 merge deed + parcels', m1)
    # add in derived parcels features
    m2 = parcels_derived_features(control, m1)
    check_feature_names(m2)
    ps('ms added parcels_derived', m2)  # NOTE(review): label 'ms' looks like a typo for 'm2'
    del m1
    # add in census data
    census_features_df = read_census_features(control)
    m3 = m2.merge(census_features_df,
                  left_on=transactions.census_tract,
                  right_on="census_tract",
                  )
    # both sides carried a census_tract column; verify the _x/_y pair agree
    # and collapse them back to a single 'census_tract' column
    assert 'census_tract_x' in m3.columns
    assert 'census_tract_y' in m3.columns
    assert (m3.census_tract_x == m3.census_tract_y).all()
    assert 'census_tract' not in m3.columns
    pu.df_remove_column(m3, 'census_tract_x')
    pu.df_rename_column(m3, 'census_tract_y', 'census_tract')
    check_feature_names(m3)
    del m2
    ps('m3 merged census features', m3)
    # add in GPS coordinates
    geocoding_df = read_geocoding(control)
    m4 = m3.merge(geocoding_df,
                  left_on="best_apn",
                  right_on="G APN",
                  )
    del geocoding_df
    del m3
    ps('m4 merged geocoding', m4)
    final = m4
    print 'final columns'
    for c in final.columns:
        print c,
    print
    cc('fraction', final)  # verify that fraction_owner_occupied is in the output
    print 'final shape', final.shape
    # write merged,augmented dataframe
    print 'writing final dataframe to csv file'
    final.to_csv(control.path_out_transactions)
    # write out all the column names
    # a '_y' suffix would mean an unexpected column collision survived a
    # merge, so drop into the debugger for inspection
    print 'all column names in final dataframe'
    for name in final.columns:
        print name
        if '_y' in name:
            print 'found strange suffix'
            pdb.set_trace()
    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'
    return