H = pd.read_csv('../data/vegas/restaurant_establishments.csv', delimiter=';', header=None, skiprows=1) H.columns = ['permit_number', 'facility_id', 'PE', 'restaurant_name', 'location_name', 'address', 'latitude', 'longitude', 'city_id', 'city_name', 'zip_code', 'nciaa', 'plan_review', 'record_status', 'current_grade', 'current_demerits', 'date_current', 'previous_grade', 'date_previous', 'misc','empty'] H['id_'] = H.facility_id H['name_'] = H.restaurant_name.apply(lib.standard_name) H['city_'] = H.city_name.apply(lambda x: '_'.join(x.lower().strip().split())) H['zip'] = H.zip_code.apply(lambda x: np.nan if type(x)==float else x[:5]) H = pd.concat([H, H.address.apply(split_address)], axis=1) return H if __name__ == '__main__': B_NV = lib.preprocess_yelp(lib.get_yelp_businesses(), 'NV') NV = preprocess_NV() merge_level1 = lib.merge_exact_match(NV, B_NV) merge_level2 = lib.merge_partial_match(NV, B_NV, merge_level1, merge_level=2, dump_tag='vegas_32') # merge_level3 = lib.merge_partial_match(NV, B_NV, merge_level2, merge_level=3, dump_tag='vegas_33') # merge_level4 = lib.merge_partial_match(NV, B_NV, merge_level3, merge_level=4, dump_tag='vegas_34') merge_level2.to_csv('../data/vegas/merge_dump_22.csv', encoding='utf-8') #merge_level3.to_csv('../data/vegas/merge_dump_23.csv', encoding='utf-8') lib.save_to_pickle(merge_level1, '../data/vegas/vegas_yelp_merge.pkl')
def preprocess_WI():
    """Load the Madison (WI) records and add the columns used for matching.

    Reads ``../data/mad/mad_health_FINAL.pkl`` through ``lib.open_pickle``
    and returns that frame with these additions:

    * ``name_`` -- ``lib.standard_name`` applied to the ``name`` column
    * ``id_``   -- a copy of the frame's index
    * every column produced by ``lib.parse_address(address, ampersand=True)``,
      with the first of those columns relabelled ``city_``
    """
    records = lib.open_pickle('../data/mad/mad_health_FINAL.pkl')
    records['name_'] = records['name'].apply(lib.standard_name)
    records['id_'] = records.index

    parsed = records['address'].apply(
        lambda raw: lib.parse_address(raw, ampersand=True)
    )
    # Relabel via a fresh list.  Writing into ``parsed.columns.values`` would
    # mutate the backing array of an Index, which pandas treats as immutable.
    labels = parsed.columns.tolist()
    labels[0] = 'city_'
    parsed.columns = labels

    return pd.concat([records, parsed], axis=1)


if __name__ == '__main__':
    # Madison (WI) run: exact merge first, then partial merges at levels
    # 2, 3 and 4, each one fed the result of the previous step.
    yelp_wi = lib.preprocess_yelp(lib.get_yelp_businesses(), 'WI', ampersand=True)
    records_wi = preprocess_WI()

    merged_exact = lib.merge_exact_match(records_wi, yelp_wi)
    merged_l2 = lib.merge_partial_match2(
        records_wi, yelp_wi, merged_exact, merge_level=2, dump_tag='mad_32'
    )
    merged_l3 = lib.merge_partial_match2(
        records_wi, yelp_wi, merged_l2, merge_level=3, dump_tag='mad_33'
    )
    merged_l4 = lib.merge_partial_match2(
        records_wi, yelp_wi, merged_l3, merge_level=4, dump_tag='mad_34'
    )

    # Optional CSV dumps of the intermediate levels (disabled):
    # merged_l2.to_csv('../data/mad/merge_dump_22.csv', encoding='utf-8')
    # merged_l3.to_csv('../data/mad/merge_dump_23.csv', encoding='utf-8')

    lib.save_to_pickle(merged_l4, '../data/mad/madison_yelp_merge.pkl')
import numpy as np
import pickle
import re
import string
import merge_main as lib

# NOTE(review): ``pd`` is used below but its import is not visible in this
# chunk -- presumably ``import pandas as pd`` precedes these lines; confirm.
# Turn off pandas' chained-assignment warning (pandas default is 'warn').
pd.options.mode.chained_assignment = None


def preprocess_AZ():
    """Load the Phoenix (AZ) records and add the columns used for matching.

    Reads ``../data/phx/phoenix_R_full.pkl`` through ``lib.open_pickle`` and
    returns that frame with these additions:

    * every column produced by ``lib.parse_address(address)``, with the
      first of those columns relabelled ``city_``
    * ``name_`` -- ``lib.standard_name`` applied to the ``name`` column
    * ``id_``   -- a copy of the ``permit_id`` column
    """
    records = lib.open_pickle('../data/phx/phoenix_R_full.pkl')

    parsed = records['address'].apply(lib.parse_address)
    # Give the parsed frame the source frame's index so the concat below
    # lines the rows up.
    parsed = parsed.set_index(records.index)
    # Relabel via a fresh list.  Writing into ``parsed.columns.values`` would
    # mutate the backing array of an Index, which pandas treats as immutable.
    labels = parsed.columns.tolist()
    labels[0] = 'city_'
    parsed.columns = labels

    combined = pd.concat([records, parsed], axis=1)
    combined['name_'] = combined['name'].apply(lib.standard_name)
    combined['id_'] = combined['permit_id']
    return combined


if __name__ == '__main__':
    # Phoenix (AZ) run: exact merge first, then partial merges at levels
    # 2, 3 and 4, each one fed the result of the previous step.
    yelp_az = lib.preprocess_yelp(lib.get_yelp_businesses(), 'AZ')
    records_az = preprocess_AZ()

    merged_exact = lib.merge_exact_match(records_az, yelp_az)
    merged_l2 = lib.merge_partial_match2(
        records_az, yelp_az, merged_exact, merge_level=2, dump_tag='phx_32'
    )
    merged_l3 = lib.merge_partial_match2(
        records_az, yelp_az, merged_l2, merge_level=3, dump_tag='phx_33'
    )
    merged_l4 = lib.merge_partial_match2(
        records_az, yelp_az, merged_l3, merge_level=4, dump_tag='phx_34'
    )

    lib.save_to_pickle(merged_l4, '../data/phx/phoenix_yelp_merge.pkl')
address[key] = re.sub(r"\s+", " ", value).strip() return pd.Series(address) def preprocess_NC(): H = lib.open_pickle("../data/char/char_FULL_04.pkl") H = pd.DataFrame.from_records(H).T H["city_"] = H.city.apply(lambda x: x.lower().strip()) H["name_"] = H.name.apply(lib.standard_name) H["id_"] = H.index H = pd.concat([H, H.address.apply(split_address)], axis=1) return H if __name__ == "__main__": B_NC = lib.preprocess_yelp(lib.get_yelp_businesses(), "NC") NC = preprocess_NC() merge_level1 = lib.merge_exact_match(NC, B_NC) merge_level2 = lib.merge_partial_match2(NC, B_NC, merge_level1, merge_level=2, dump_tag="char_32") merge_level3 = lib.merge_partial_match2(NC, B_NC, merge_level2, merge_level=3, dump_tag="char_33") merge_level4 = lib.merge_partial_match2(NC, B_NC, merge_level3, merge_level=4, dump_tag="char_34") merge_level2.to_csv("../data/char/merge_dump_22.csv", encoding="utf-8") merge_level3.to_csv("../data/char/merge_dump_23.csv", encoding="utf-8") lib.save_to_pickle(merge_level4, "../data/char/charlotte_yelp_merge.pkl")
return pd.Series(address) def preprocess_PA(): PA = lib.open_pickle('../data/pitt/pitt_inspections_FINAL.pkl') PA['name_'] = PA.name.apply(lib.standard_name) PA['city_'] = PA.city.apply(lambda x: x.lower()) PA['id_'] = PA.client_id PA = pd.concat([PA, PA.address.apply(split_address)], axis=1) return PA if __name__ == '__main__': B_PA = lib.preprocess_yelp(lib.get_yelp_businesses(), 'PA') PA = preprocess_PA() merge_level1 = lib.merge_exact_match(PA, B_PA) merge_level2 = lib.merge_partial_match2(PA, B_PA, merge_level1, merge_level=2, dump_tag='pitt_32') merge_level3 = lib.merge_partial_match2(PA, B_PA, merge_level2, merge_level=3, dump_tag='pitt_33') merge_level4 = lib.merge_partial_match2(PA, B_PA, merge_level3, merge_level=4, dump_tag='pitt_34') # merge_level2.to_csv('../data/pitt/merge_dump_22.csv', encoding='utf-8') # merge_level3.to_csv('../data/pitt/merge_dump_23.csv', encoding='utf-8') # merge_level4.to_csv('../data/pitt/merge_dump_24.csv', encoding='utf-8') lib.save_to_pickle(merge_level4, '../data/pitt/pittsburgh_yelp_merge.pkl')