示例#1
0
    H = pd.read_csv('../data/vegas/restaurant_establishments.csv', delimiter=';', header=None, skiprows=1)
    H.columns = ['permit_number', 'facility_id', 'PE', 'restaurant_name',  'location_name',
                 'address', 'latitude', 'longitude', 'city_id', 
                 'city_name', 'zip_code', 'nciaa', 'plan_review', 'record_status',
                 'current_grade', 'current_demerits', 'date_current', 'previous_grade', 
                 'date_previous', 'misc','empty']
    
    H['id_'] = H.facility_id
    H['name_'] = H.restaurant_name.apply(lib.standard_name)
    H['city_'] = H.city_name.apply(lambda x: '_'.join(x.lower().strip().split()))
    H['zip'] = H.zip_code.apply(lambda x: np.nan if type(x)==float else x[:5])

    H = pd.concat([H, H.address.apply(split_address)], axis=1)
    return H


if __name__ == '__main__':
    # Prepare both sides of the merge: the Yelp businesses (preprocessed with
    # the 'NV' argument) and the health-inspection frame.
    yelp_nv = lib.preprocess_yelp(lib.get_yelp_businesses(), 'NV')
    health_nv = preprocess_NV()

    # Exact matching first; its result seeds the level-2 partial-match pass.
    exact_merge = lib.merge_exact_match(health_nv, yelp_nv)
    level2_merge = lib.merge_partial_match(
        health_nv,
        yelp_nv,
        exact_merge,
        merge_level=2,
        dump_tag='vegas_32',
    )
    # Partial-match levels 3 and 4 (dump tags 'vegas_33' / 'vegas_34') and the
    # level-3 CSV dump ('../data/vegas/merge_dump_23.csv') are disabled here.

    level2_merge.to_csv('../data/vegas/merge_dump_22.csv', encoding='utf-8')

    # NOTE(review): the pickle stores the exact-match result, not the level-2
    # result that was just written to CSV -- confirm this is intended.
    lib.save_to_pickle(exact_merge, '../data/vegas/vegas_yelp_merge.pkl')
示例#2
0
def preprocess_WI():
    """Load the health-inspection pickle and add the columns used for merging.

    Reads ``../data/mad/mad_health_FINAL.pkl`` through ``lib.open_pickle`` and
    appends:

    * ``name_`` -- ``lib.standard_name`` applied to the ``name`` column,
    * ``id_``   -- the frame's index,
    * the columns produced by applying ``lib.parse_address`` (with
      ``ampersand=True``) to every ``address``; the first of those columns is
      relabelled ``city_``.

    Returns:
        The loaded frame with the extra columns concatenated on the right.
    """
    H = lib.open_pickle('../data/mad/mad_health_FINAL.pkl')

    H['name_'] = H.name.apply(lib.standard_name)
    H['id_'] = H.index

    H_address = H.address.apply(lambda x: lib.parse_address(x, ampersand=True))
    # Relabel the first parsed column by assigning a fresh list of labels.
    # Writing into ``columns.values`` would mutate the backing array of the
    # (immutable) pandas Index in place, which pandas does not support.
    H_address.columns = ['city_'] + list(H_address.columns[1:])

    H = pd.concat([H, H_address], axis=1)
    return H


if __name__ == '__main__':
    # Prepare both sides of the merge: the Yelp businesses (preprocessed with
    # the 'WI' argument and ampersand=True) and the health-inspection frame.
    yelp_wi = lib.preprocess_yelp(lib.get_yelp_businesses(), 'WI', ampersand=True)
    health_wi = preprocess_WI()

    # Start from the exact matches, then run the partial-match passes in
    # order, each one receiving the previous pass's result.
    merged = lib.merge_exact_match(health_wi, yelp_wi)
    for level, tag in ((2, 'mad_32'), (3, 'mad_33'), (4, 'mad_34')):
        merged = lib.merge_partial_match2(
            health_wi, yelp_wi, merged, merge_level=level, dump_tag=tag
        )

    # CSV dumps of the level-2 / level-3 results
    # ('../data/mad/merge_dump_22.csv', '../data/mad/merge_dump_23.csv')
    # are disabled here.

    lib.save_to_pickle(merged, '../data/mad/madison_yelp_merge.pkl')
示例#3
0
import numpy as np
import pickle
import re
import string
import merge_main as lib
# Set pandas' chained-assignment check to None (instead of the default
# 'warn') so the SettingWithCopy warning is not emitted by this script.
pd.options.mode.chained_assignment = None  # default='warn'

def preprocess_AZ():
    """Load the inspection pickle and add the columns used for merging.

    Reads ``../data/phx/phoenix_R_full.pkl`` through ``lib.open_pickle``,
    applies ``lib.parse_address`` to every ``address`` and concatenates the
    parsed columns (first one relabelled ``city_``) onto the loaded frame.
    It then adds:

    * ``name_`` -- ``lib.standard_name`` applied to the ``name`` column,
    * ``id_``   -- a copy of the ``permit_id`` column.

    Returns:
        The combined frame.
    """
    AZ = lib.open_pickle('../data/phx/phoenix_R_full.pkl')
    H_address = AZ.address.apply(lib.parse_address)
    # Give the parsed frame the same index as AZ before the column-wise concat.
    H_address.set_index(AZ.index, inplace=True)
    # Relabel the first parsed column by assigning a fresh list of labels.
    # Writing into ``columns.values`` would mutate the backing array of the
    # (immutable) pandas Index in place, which pandas does not support.
    H_address.columns = ['city_'] + list(H_address.columns[1:])

    H_AZ = pd.concat([AZ, H_address], axis=1)
    H_AZ['name_'] = H_AZ.name.apply(lib.standard_name)
    H_AZ['id_'] = H_AZ.permit_id

    return H_AZ

if __name__ == '__main__':
    # Prepare both sides of the merge: the Yelp businesses (preprocessed with
    # the 'AZ' argument) and the inspection frame.
    yelp_az = lib.preprocess_yelp(lib.get_yelp_businesses(), 'AZ')
    health_az = preprocess_AZ()

    # Start from the exact matches, then run the partial-match passes in
    # order, each one receiving the previous pass's result.
    merged = lib.merge_exact_match(health_az, yelp_az)
    for level, tag in ((2, 'phx_32'), (3, 'phx_33'), (4, 'phx_34')):
        merged = lib.merge_partial_match2(
            health_az, yelp_az, merged, merge_level=level, dump_tag=tag
        )

    lib.save_to_pickle(merged, '../data/phx/phoenix_yelp_merge.pkl')
示例#4
0
        address[key] = re.sub(r"\s+", " ", value).strip()

    return pd.Series(address)


def preprocess_NC():
    """Load the inspection records and add the columns used for merging.

    The object read from ``../data/char/char_FULL_04.pkl`` is turned into a
    DataFrame via ``DataFrame.from_records`` and transposed. The frame then
    gains ``city_`` (lower-cased, stripped ``city``), ``name_``
    (``lib.standard_name`` of ``name``), ``id_`` (the index) and whatever
    columns ``split_address`` yields for each ``address``.

    Returns:
        The frame with the extra columns concatenated on the right.
    """

    def _clean_city(raw):
        # Lower-case first, then trim surrounding whitespace.
        return raw.lower().strip()

    records = lib.open_pickle("../data/char/char_FULL_04.pkl")
    frame = pd.DataFrame.from_records(records).T

    frame["city_"] = frame.city.apply(_clean_city)
    frame["name_"] = frame.name.apply(lib.standard_name)
    frame["id_"] = frame.index

    address_parts = frame.address.apply(split_address)
    return pd.concat([frame, address_parts], axis=1)


if __name__ == "__main__":
    # Prepare both sides of the merge: the Yelp businesses (preprocessed with
    # the "NC" argument) and the inspection frame.
    yelp_nc = lib.preprocess_yelp(lib.get_yelp_businesses(), "NC")
    health_nc = preprocess_NC()

    # stages[k] holds the result after merge level k; level 1 is the exact
    # match and every later level is seeded with the previous one.
    stages = {1: lib.merge_exact_match(health_nc, yelp_nc)}
    for level, tag in ((2, "char_32"), (3, "char_33"), (4, "char_34")):
        stages[level] = lib.merge_partial_match2(
            health_nc, yelp_nc, stages[level - 1], merge_level=level, dump_tag=tag
        )

    # Dump the intermediate results only after every pass has finished.
    stages[2].to_csv("../data/char/merge_dump_22.csv", encoding="utf-8")
    stages[3].to_csv("../data/char/merge_dump_23.csv", encoding="utf-8")

    lib.save_to_pickle(stages[4], "../data/char/charlotte_yelp_merge.pkl")
示例#5
0
    return pd.Series(address)


def preprocess_PA():
    """Load the inspection pickle and add the columns used for merging.

    Reads ``../data/pitt/pitt_inspections_FINAL.pkl`` through
    ``lib.open_pickle`` and adds ``name_`` (``lib.standard_name`` of
    ``name``), ``city_`` (lower-cased ``city``), ``id_`` (copy of
    ``client_id``) plus whatever columns ``split_address`` yields for each
    ``address``.

    Returns:
        The frame with the extra columns concatenated on the right.
    """

    def _lowercase(raw):
        return raw.lower()

    inspections = lib.open_pickle('../data/pitt/pitt_inspections_FINAL.pkl')

    inspections['name_'] = inspections.name.apply(lib.standard_name)
    inspections['city_'] = inspections.city.apply(_lowercase)
    inspections['id_'] = inspections.client_id

    address_parts = inspections.address.apply(split_address)
    return pd.concat([inspections, address_parts], axis=1)


if __name__ == '__main__':
    # Prepare both sides of the merge: the Yelp businesses (preprocessed with
    # the 'PA' argument) and the inspection frame.
    yelp_pa = lib.preprocess_yelp(lib.get_yelp_businesses(), 'PA')
    health_pa = preprocess_PA()

    # Start from the exact matches, then run the partial-match passes in
    # order, each one receiving the previous pass's result.
    merged = lib.merge_exact_match(health_pa, yelp_pa)
    for level, tag in ((2, 'pitt_32'), (3, 'pitt_33'), (4, 'pitt_34')):
        merged = lib.merge_partial_match2(
            health_pa, yelp_pa, merged, merge_level=level, dump_tag=tag
        )

    # CSV dumps of the level-2 / level-3 / level-4 results
    # ('../data/pitt/merge_dump_22.csv', '../data/pitt/merge_dump_23.csv',
    # '../data/pitt/merge_dump_24.csv') are disabled here.

    lib.save_to_pickle(merged, '../data/pitt/pittsburgh_yelp_merge.pkl')