def run_sd():
    """Build the San Diego business-location flat file and panel."""
    sd_bus = clean_sd_bus()
    sd_add = pd.read_csv(data_dict['intermediate']['sd']['parcel'] + 'addresses.csv')

    # attach parcel coordinates/IDs to businesses via fuzzy address matching
    sd_bus = merge_addresses(
        sd_bus,
        sd_add[sd_add['parsed_city'].isin(['san diego', 'la jolla'])],
        fuzzy=True,
        nearest_n1=True,
        fuzzy_threshold=90,
        n1_threshold=5,
        add_merge_cols=['parsed_addr_n1', 'parsed_addr_sn', 'parsed_addr_ss'],
        bus_merge_cols=['primary_cleaned_addr_n1', 'primary_cleaned_addr_sn', 'primary_cleaned_addr_ss'],
        cols_to_merge=['lat', 'long', 'parcelID', 'parsed_city', 'parsed_addr_zip']
    )
    sd_bus.to_csv(data_dict['intermediate']['sd']['business_location'] + '/business_location_addresses_merged.csv',
                  index=False)

    # read in census data; GEOID10 is zero-padded back to 11 digits in case leading zeros were dropped
    ca_shp = gpd.read_file(census_data_dict['ca bg shp'])
    con = sqlite3.connect(dataPrefix + "/data/census.db")
    census_tracts = pd.read_sql_query("SELECT GEOID10 from acs", con)
    con.close()
    census_tracts['GEOID10'] = census_tracts['GEOID10'].str.pad(width=11, side='left', fillchar='0')

    # spatial merge onto California block groups, then write the flat table
    sd_bus = merge_GEOID(sd_bus, ca_shp, reset=True)
    business_loc_to_sql(sd_bus, table="business_locations_flat", mode="append")

    # expand to a location-year panel and write it out
    sd_bus = make_business_panel_wrapper(sd_bus, n_cores=4)
    sd_bus = make_location_vars(sd_bus)
    sd_bus.to_csv(data_dict['final']['sd']['business_location'] + '/business_location_panel.csv', index=False)
    business_loc_to_sql(sd_bus, table="business_locations_panel", mode="append")
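# Hedged sketch (not part of the original pipeline): the block-group shapefile /
# GEOID step above is repeated verbatim in every city function below, so a small
# helper like this hypothetical one could factor it out. `_attach_block_groups`
# and `shp_key` are illustrative names only; `shp_key` is whichever key in
# census_data_dict points at the state's block-group shapefile
# (e.g. 'ca bg shp', 'il bg shp', 'la bg shp').
def _attach_block_groups(bus_df, shp_key):
    """Spatially attach block-group GEOIDs to a business-location frame."""
    state_shp = gpd.read_file(census_data_dict[shp_key])
    return merge_GEOID(bus_df, state_shp, reset=True)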
def run_long_beach():
    """Build the Long Beach business-location flat file and panel."""
    print("running long_beach")
    long_beach_bus = clean_long_beach_bus()
    print(long_beach_bus.shape[0])

    # Long Beach addresses come from the LA County parcel address file
    long_beach_add = pd.read_csv(data_dict['intermediate']['la']['parcel'] + 'addresses.csv')

    # attach parcel coordinates/IDs to businesses via fuzzy address matching
    long_beach_bus = merge_addresses(
        long_beach_bus,
        long_beach_add[long_beach_add['parsed_city'] == "long beach"],
        fuzzy=True,
        nearest_n1=True,
        fuzzy_threshold=90,
        n1_threshold=5,
        add_merge_cols=['parsed_addr_n1', 'parsed_addr_sn', 'parsed_addr_ss'],
        bus_merge_cols=['primary_cleaned_addr_n1', 'primary_cleaned_addr_sn', 'primary_cleaned_addr_ss'],
        cols_to_merge=['lat', 'long', 'parcelID', 'parsed_city', 'parsed_addr_zip']
    )
    print(long_beach_bus.shape[0])
    long_beach_bus.to_csv(data_dict['intermediate']['long_beach']['business_location'] + '/business_location_addresses_merged.csv',
                          index=False)
    print("merged addresses")

    # sanity check: the address merge should not multiply rows
    if long_beach_bus.shape[0] > 150000:
        raise ValueError("Long Beach business file has more than 150,000 rows after the address merge")

    # read in census data; GEOID10 is zero-padded back to 11 digits in case leading zeros were dropped
    ca_shp = gpd.read_file(census_data_dict['ca bg shp'])
    con = sqlite3.connect(dataPrefix + "/data/census.db")
    census_tracts = pd.read_sql_query("SELECT GEOID10 from acs", con)
    con.close()
    census_tracts['GEOID10'] = census_tracts['GEOID10'].str.pad(width=11, side='left', fillchar='0')

    # spatial merge onto California block groups, then write the flat table
    long_beach_bus = merge_GEOID(long_beach_bus, ca_shp, reset=True)
    business_loc_to_sql(long_beach_bus, table="business_locations_flat", mode="append")
    print("exported flat table")

    # expand to a location-year panel and write it out
    long_beach_bus = make_business_panel_wrapper(long_beach_bus, n_cores=4)
    long_beach_bus = long_beach_bus.drop_duplicates(subset=['location_id', 'year'])
    long_beach_bus = make_location_vars(long_beach_bus)
    long_beach_bus.to_csv(data_dict['final']['long_beach']['business_location'] + '/business_location_panel.csv',
                          index=False)
    business_loc_to_sql(long_beach_bus, table="business_locations_panel", mode="append")
def run_chicago():
    """Build the Chicago business-location flat file and panel."""
    print("running chicago")
    chicago_bus = clean_chicago_bus()
    print("cleaned businesses")

    chicago_add = pd.read_csv(data_dict['intermediate']['chicago']['parcel'] + 'addresses_concat.csv')
    chicago_add, chicago_bus = misc_chi_cleaning(bus_df=chicago_bus, add_df=chicago_add)
    print("misc cleaning done")

    # attach parcel coordinates/IDs to businesses via fuzzy address matching
    chicago_bus = merge_addresses(
        chicago_bus,
        chicago_add,
        fuzzy=True,
        nearest_n1=True,
        fuzzy_threshold=90,
        n1_threshold=5,
        add_merge_cols=['parsed_addr_n1', 'parsed_addr_sn', 'parsed_addr_ss'],
        bus_merge_cols=['primary_cleaned_addr_n1', 'primary_cleaned_addr_sn', 'primary_cleaned_addr_ss'],
        cols_to_merge=['lat', 'long', 'parcelID', 'parsed_city', 'parsed_addr_zip']
    )
    chicago_bus.to_csv(data_dict['intermediate']['chicago']['business_location'] + '/business_location_addresses_merged.csv',
                       index=False)
    print("merged addresses")

    # read in census data; GEOID10 is zero-padded back to 11 digits in case leading zeros were dropped
    il_shp = gpd.read_file(census_data_dict['il bg shp'])
    con = sqlite3.connect(dataPrefix + "/data/census.db")
    census_tracts = pd.read_sql_query("SELECT GEOID10 from acs", con)
    con.close()
    census_tracts['GEOID10'] = census_tracts['GEOID10'].str.pad(width=11, side='left', fillchar='0')

    # take absolute values of the address-derived coordinates before the spatial merge
    chicago_bus['long_from_address'] = chicago_bus['long_from_address'].abs()
    chicago_bus['lat_from_address'] = chicago_bus['lat_from_address'].abs()

    # spatial merge onto Illinois block groups, then write the flat table
    chicago_bus = merge_GEOID(chicago_bus, il_shp, reset=True)
    business_loc_to_sql(chicago_bus, table="business_locations_flat", mode="append")
    print("exported flat table")

    # expand to a location-year panel and write it out
    chicago_bus = make_business_panel_wrapper(chicago_bus, n_cores=4)
    chicago_bus = chicago_bus.drop_duplicates(subset=['location_id', 'year'])
    chicago_bus = make_location_vars(chicago_bus)
    chicago_bus.to_csv(data_dict['final']['chicago']['business_location'] + '/business_location_panel.csv', index=False)
    business_loc_to_sql(chicago_bus, table="business_locations_panel", mode="append")
def run_sac():
    """Build the Sacramento business-location flat file and panel.

    Sacramento runs with mode="replace", so it (re)creates the SQL tables
    that the other city pipelines append to.
    """
    sac_bus = clean_sac_bus()

    # build the Sacramento address file from the raw parcel address table and the
    # address-to-parcel crosswalk
    sac_add = pd.read_csv(data_dict['raw']['sac']['parcel'] + 'Address.csv')
    sac_xwalk = pd.read_csv(data_dict['raw']['sac']['parcel'] + 'Address_parcel_xwalk.csv')
    sac_add = merge_sac_parcel_id(sac_add=sac_add, xwalk=sac_xwalk)
    sac_add = clean_sac_add(sac_add)
    sac_add.to_csv(data_dict['intermediate']['sac']['parcel'] + 'addresses_concat.csv', index=False)

    # attach parcel coordinates/IDs to businesses via fuzzy address matching
    sac_bus = merge_addresses(
        sac_bus,
        sac_add,
        fuzzy=True,
        nearest_n1=True,
        fuzzy_threshold=90,
        n1_threshold=5,
        add_merge_cols=['parsed_addr_n1', 'parsed_addr_sn', 'parsed_addr_ss'],
        bus_merge_cols=['primary_cleaned_addr_n1', 'primary_cleaned_addr_sn', 'primary_cleaned_addr_ss'],
        cols_to_merge=['lat', 'long', 'parcelID', 'parsed_city', 'parsed_addr_zip']
    )
    sac_bus.to_csv(data_dict['intermediate']['sac']['business_location'] + '/business_location_addresses_merged.csv',
                   index=False)

    sac_bus = make_business_vars_wrapper(sac_bus, n_cores=4)
    sac_bus.to_csv(data_dict['final']['sac']['business_location'] + '/business_location_flat.csv', index=False)

    # read in census data; GEOID10 is zero-padded back to 11 digits in case leading zeros were dropped
    ca_shp = gpd.read_file(census_data_dict['ca bg shp'])
    con = sqlite3.connect(dataPrefix + "/data/census.db")
    census_tracts = pd.read_sql_query("SELECT GEOID10 from acs", con)
    con.close()
    census_tracts['GEOID10'] = census_tracts['GEOID10'].str.pad(width=11, side='left', fillchar='0')
    print(census_tracts['GEOID10'].str.len().value_counts())

    # spatial merge onto California block groups, then write the flat table
    sac_bus = merge_GEOID(sac_bus, ca_shp, reset=True)
    business_loc_to_sql(sac_bus, table="business_locations_flat", mode="replace")

    # expand to a location-year panel and write it out
    sac_bus = make_business_panel_wrapper(sac_bus, n_cores=4)
    sac_bus = make_location_vars(sac_bus)
    sac_bus.to_csv(data_dict['final']['sac']['business_location'] + '/business_location_panel.csv', index=False)
    business_loc_to_sql(sac_bus, table="business_locations_panel", mode="replace")
    print(sac_bus['CT_ID_10'].str.len().value_counts())

    # merge ACS tract data onto the panel and report how many rows are missing population
    sac_bus = merge_census_tract_data(sac_bus, census_tracts, panel=True)
    print(sum(sac_bus['pop'].isna()))
    sac_bus.to_csv(data_dict['final']['sac']['business_location'] + '/business_location_panel_census_merge.csv',
                   index=False)

    # free memory before the next city runs
    sac_bus = None
    sac_add = None
    gc.collect()
def run_baton_rouge():
    """Build the Baton Rouge business-location flat file and panel."""
    baton_rouge_bus = clean_baton_rouge_bus()
    baton_rouge_add = pd.read_csv(data_dict['intermediate']['baton_rouge']['parcel'] + 'addresses.csv')
    print(baton_rouge_add.columns)
    print(baton_rouge_bus.shape[0])

    baton_rouge_bus = misc_baton_rouge_cleaning(bus_df=baton_rouge_bus, add_df=baton_rouge_add)

    # attach parcel coordinates/IDs to businesses via fuzzy address matching
    baton_rouge_bus = merge_addresses(
        baton_rouge_bus,
        baton_rouge_add,
        fuzzy=True,
        nearest_n1=True,
        fuzzy_threshold=90,
        n1_threshold=5,
        add_merge_cols=['parsed_addr_n1', 'parsed_addr_sn', 'parsed_addr_ss'],
        bus_merge_cols=['primary_cleaned_addr_n1', 'primary_cleaned_addr_sn', 'primary_cleaned_addr_ss'],
        cols_to_merge=['lat', 'long', 'parcelID', 'parsed_city', 'parsed_addr_zip']
    )
    baton_rouge_bus.to_csv(data_dict['intermediate']['baton_rouge']['business_location'] + '/business_location_addresses_merged.csv',
                           index=False)

    # read in census data; GEOID10 is zero-padded back to 11 digits in case leading zeros were dropped
    la_shp = gpd.read_file(census_data_dict['la bg shp'])
    con = sqlite3.connect(dataPrefix + "/data/census.db")
    census_tracts = pd.read_sql_query("SELECT GEOID10 from acs", con)
    con.close()
    census_tracts['GEOID10'] = census_tracts['GEOID10'].str.pad(width=11, side='left', fillchar='0')

    # spatial merge onto Louisiana block groups, then write the flat table
    baton_rouge_bus = merge_GEOID(baton_rouge_bus, la_shp, reset=True)
    business_loc_to_sql(baton_rouge_bus, table="business_locations_flat", mode="append")

    # expand to a location-year panel and write it out
    baton_rouge_bus = make_business_panel_wrapper(baton_rouge_bus, n_cores=4)
    baton_rouge_bus = make_location_vars(baton_rouge_bus)
    baton_rouge_bus.to_csv(data_dict['final']['baton_rouge']['business_location'] + '/business_location_panel.csv',
                           index=False)
    business_loc_to_sql(baton_rouge_bus, table="business_locations_panel", mode="append")
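# Hedged sketch (not in the original module): a minimal entry point for running the
# city pipelines in sequence. It assumes run_sac() should go first because it writes
# the business_locations_flat / business_locations_panel tables with mode="replace",
# while the other cities append to them; adjust the order as needed.
if __name__ == '__main__':
    run_sac()
    run_sd()
    run_long_beach()
    run_chicago()
    run_baton_rouge()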