예제 #1
0
def state_yelp_reviews(df, d, tag):
    """Fetch Yelp reviews for the businesses in ``df`` and pickle them.

    Args:
        df: Frame exposing a ``business_id`` column.
        d: Value forwarded unchanged as the second argument of
            ``get_yelp_reviews_afterdate``.
        tag: Label interpolated into the output pickle's file name.

    Returns:
        The object returned by ``get_yelp_reviews_afterdate``; the same object
        is also written to ``../data/yelp/yelp_reviews_<tag>.pkl``.
    """
    ids_ = df.business_id.values.tolist()
    R = get_yelp_reviews_afterdate(ids_, d)
    # Assumes R is a pandas DataFrame: info() writes its report to stdout
    # itself and returns None, so wrapping it in a print only added a stray
    # "None" line (and a print statement is a syntax error on Python 3).
    R.info()

    save_to_pickle(R, '../data/yelp/yelp_reviews_%s.pkl' % tag)

    return R
예제 #2
0
    H = pd.read_csv('../data/vegas/restaurant_establishments.csv', delimiter=';', header=None, skiprows=1)
    H.columns = ['permit_number', 'facility_id', 'PE', 'restaurant_name',  'location_name',
                 'address', 'latitude', 'longitude', 'city_id', 
                 'city_name', 'zip_code', 'nciaa', 'plan_review', 'record_status',
                 'current_grade', 'current_demerits', 'date_current', 'previous_grade', 
                 'date_previous', 'misc','empty']
    
    H['id_'] = H.facility_id
    H['name_'] = H.restaurant_name.apply(lib.standard_name)
    H['city_'] = H.city_name.apply(lambda x: '_'.join(x.lower().strip().split()))
    H['zip'] = H.zip_code.apply(lambda x: np.nan if type(x)==float else x[:5])

    H = pd.concat([H, H.address.apply(split_address)], axis=1)
    return H


if __name__ == '__main__':
    # Inputs for the matcher: the Yelp frame preprocessed with the 'NV' tag
    # and the frame built by preprocess_NV().
    yelp_nv = lib.preprocess_yelp(lib.get_yelp_businesses(), 'NV')
    health_nv = preprocess_NV()

    # Exact matching runs first; its result seeds the level-2 partial pass.
    exact_matches = lib.merge_exact_match(health_nv, yelp_nv)
    level2_matches = lib.merge_partial_match(
        health_nv, yelp_nv, exact_matches,
        merge_level=2, dump_tag='vegas_32')
    # Levels 3 and 4 are currently disabled:
    # level3_matches = lib.merge_partial_match(health_nv, yelp_nv, level2_matches, merge_level=3, dump_tag='vegas_33')
    # level4_matches = lib.merge_partial_match(health_nv, yelp_nv, level3_matches, merge_level=4, dump_tag='vegas_34')

    level2_matches.to_csv('../data/vegas/merge_dump_22.csv', encoding='utf-8')
    # level3_matches.to_csv('../data/vegas/merge_dump_23.csv', encoding='utf-8')

    # NOTE(review): the exact-match result (not the level-2 result) is what
    # gets pickled here -- confirm this is intentional.
    lib.save_to_pickle(exact_matches, '../data/vegas/vegas_yelp_merge.pkl')
예제 #3
0
def preprocess_WI():
    """Load the pickled ``mad`` health records and add the merge-key columns.

    Adds ``name_`` (``lib.standard_name`` applied to ``name``), ``id_`` (the
    row index) and the columns produced by applying
    ``lib.parse_address(..., ampersand=True)`` to ``address``, with the first
    of those columns relabelled ``city_``.

    Returns:
        The loaded frame with the parsed-address columns concatenated on.
    """
    H = lib.open_pickle('../data/mad/mad_health_FINAL.pkl')

    H['name_'] = H.name.apply(lib.standard_name)
    H['id_'] = H.index

    H_address = H.address.apply(lambda x: lib.parse_address(x, ampersand=True))
    # Relabel through a fresh list. ``columns.values`` hands back the array
    # backing the (immutable) Index, so writing into it mutates the Index
    # behind pandas' back before the reassignment happens.
    col = list(H_address.columns)
    col[0] = 'city_'
    H_address.columns = col

    H = pd.concat([H, H_address], axis=1)
    return H


if __name__ == '__main__':
    # Inputs for the matcher: the Yelp frame preprocessed with the 'WI' tag
    # and the frame built by preprocess_WI().
    yelp_wi = lib.preprocess_yelp(lib.get_yelp_businesses(), 'WI', ampersand=True)
    health_wi = preprocess_WI()

    # stages[k] holds the result of merge level k; level 1 is the exact match
    # and every later level is seeded with the one before it.
    stages = {1: lib.merge_exact_match(health_wi, yelp_wi)}
    for level, tag in ((2, 'mad_32'), (3, 'mad_33'), (4, 'mad_34')):
        stages[level] = lib.merge_partial_match2(
            health_wi, yelp_wi, stages[level - 1],
            merge_level=level, dump_tag=tag)

    # CSV dumps of the intermediate levels are currently disabled:
    # stages[2].to_csv('../data/mad/merge_dump_22.csv', encoding='utf-8')
    # stages[3].to_csv('../data/mad/merge_dump_23.csv', encoding='utf-8')

    lib.save_to_pickle(stages[4], '../data/mad/madison_yelp_merge.pkl')
예제 #4
0
        address[key] = re.sub(r"\s+", " ", value).strip()

    return pd.Series(address)


def preprocess_NC():
    """Build the ``char`` health frame used as input to the Yelp matcher.

    The pickled records are turned into a frame via ``from_records`` and
    transposed, then given ``city_`` (lower-cased, stripped ``city``),
    ``name_`` (``lib.standard_name`` of ``name``), ``id_`` (the row index) and
    the columns produced by applying ``split_address`` to ``address``.

    Returns:
        The frame with the address columns concatenated alongside.
    """
    records = lib.open_pickle("../data/char/char_FULL_04.pkl")
    frame = pd.DataFrame.from_records(records).T

    def _normalise_city(raw):
        # Lower-case first, then trim surrounding whitespace.
        return raw.lower().strip()

    frame["city_"] = frame["city"].apply(_normalise_city)
    frame["name_"] = frame["name"].apply(lib.standard_name)
    frame["id_"] = frame.index

    address_parts = frame["address"].apply(split_address)
    return pd.concat([frame, address_parts], axis=1)


if __name__ == "__main__":
    # Inputs for the matcher: the Yelp frame preprocessed with the "NC" tag
    # and the frame built by preprocess_NC().
    yelp_nc = lib.preprocess_yelp(lib.get_yelp_businesses(), "NC")
    health_nc = preprocess_NC()

    # stages[k] holds the result of merge level k; level 1 is the exact match
    # and every later level is seeded with the one before it.
    stages = {1: lib.merge_exact_match(health_nc, yelp_nc)}
    for level, tag in ((2, "char_32"), (3, "char_33"), (4, "char_34")):
        stages[level] = lib.merge_partial_match2(
            health_nc, yelp_nc, stages[level - 1],
            merge_level=level, dump_tag=tag)

    # Dump the intermediate levels only after every level has been computed.
    stages[2].to_csv("../data/char/merge_dump_22.csv", encoding="utf-8")
    stages[3].to_csv("../data/char/merge_dump_23.csv", encoding="utf-8")

    lib.save_to_pickle(stages[4], "../data/char/charlotte_yelp_merge.pkl")
예제 #5
0
import pickle
import re
import string
import merge_main as lib
pd.options.mode.chained_assignment = None  # default='warn'

def preprocess_AZ():
    """Load the pickled ``phx`` records and add the merge-key columns.

    Applies ``lib.parse_address`` to ``address``, relabels the first resulting
    column ``city_``, concatenates those columns onto the loaded frame, and
    adds ``name_`` (``lib.standard_name`` of ``name``) and ``id_`` (a copy of
    ``permit_id``).

    Returns:
        The combined frame.
    """
    AZ = lib.open_pickle('../data/phx/phoenix_R_full.pkl')
    H_address = AZ.address.apply(lib.parse_address)
    H_address.set_index(AZ.index, inplace=True)
    # Relabel through a fresh list. ``columns.values`` hands back the array
    # backing the (immutable) Index, so writing into it mutates the Index
    # behind pandas' back before the reassignment happens.
    col = list(H_address.columns)
    col[0] = 'city_'
    H_address.columns = col

    H_AZ = pd.concat([AZ, H_address], axis=1)
    H_AZ['name_'] = H_AZ.name.apply(lib.standard_name)
    H_AZ['id_'] = H_AZ.permit_id

    return H_AZ

if __name__ == '__main__':
    # Inputs for the matcher: the Yelp frame preprocessed with the 'AZ' tag
    # and the frame built by preprocess_AZ().
    yelp_az = lib.preprocess_yelp(lib.get_yelp_businesses(), 'AZ')
    health_az = preprocess_AZ()

    # Start from the exact matches, then feed each partial-match level the
    # result of the previous one.
    merged = lib.merge_exact_match(health_az, yelp_az)
    for level, tag in ((2, 'phx_32'), (3, 'phx_33'), (4, 'phx_34')):
        merged = lib.merge_partial_match2(
            health_az, yelp_az, merged,
            merge_level=level, dump_tag=tag)

    # Only the final (level-4) result is persisted.
    lib.save_to_pickle(merged, '../data/phx/phoenix_yelp_merge.pkl')
예제 #6
0
    
    PA['name_'] = PA.name.apply(lib.standard_name)
    PA['city_'] = PA.city.apply(lambda x: x.lower())
    PA['id_'] = PA.client_id

    PA = pd.concat([PA, PA.address.apply(split_address)], axis=1)
    return PA


if __name__ == '__main__':
    # Inputs for the matcher: the Yelp frame preprocessed with the 'PA' tag
    # and the frame built by preprocess_PA().
    yelp_pa = lib.preprocess_yelp(lib.get_yelp_businesses(), 'PA')
    health_pa = preprocess_PA()

    # stages[k] holds the result of merge level k; level 1 is the exact match
    # and every later level is seeded with the one before it.
    stages = {1: lib.merge_exact_match(health_pa, yelp_pa)}
    for level, tag in ((2, 'pitt_32'), (3, 'pitt_33'), (4, 'pitt_34')):
        stages[level] = lib.merge_partial_match2(
            health_pa, yelp_pa, stages[level - 1],
            merge_level=level, dump_tag=tag)

    # CSV dumps of the individual levels are currently disabled:
    # stages[2].to_csv('../data/pitt/merge_dump_22.csv', encoding='utf-8')
    # stages[3].to_csv('../data/pitt/merge_dump_23.csv', encoding='utf-8')
    # stages[4].to_csv('../data/pitt/merge_dump_24.csv', encoding='utf-8')

    lib.save_to_pickle(stages[4], '../data/pitt/pittsburgh_yelp_merge.pkl')






예제 #7
0
    df['v_foundation'] = v_count(df, V, 'n_foundation')
    df['sum_foundation'] = v_sum(df, V, 'n_foundation')
    df['v_priority'] = v_count(df, V, 'n_priority')
    df['sum_priority'] = v_sum(df, V, 'n_priority')

    return V, df

def get_features_AZ(df, min_date, city_tag, i_cols, routine=False):
    """Join inspection rows with summarised Yelp reviews.

    Args:
        df: Merged business frame. If it has no ``id_`` column one is added
            in place as a copy of ``permit_id``, so the caller's frame is
            modified.
        min_date: Forwarded to ``lib.state_yelp_reviews`` as its second
            argument.
        city_tag: Forwarded to ``lib.state_yelp_reviews`` as its tag.
        i_cols: Forwarded to ``lib.merge_inspec_dates``.
        routine: Forwarded to ``get_AZ_inspections``.

    Returns:
        Inner merge of the first frame returned by ``lib.merge_inspec_dates``
        with the review summary, matching its ``inspec_id``, ``business_id``
        and ``id_`` columns against the summary's index.
    """
    if 'id_' not in df.columns:
        df['id_'] = df.permit_id
    I = get_AZ_inspections(df, routine=routine)
    # The first return value of get_AZ_violations is not needed here.
    _, I = get_AZ_violations(I)
    R = lib.state_yelp_reviews(df, min_date, city_tag)
    y, x = lib.merge_inspec_dates(I, df, R, i_cols)
    X = lib.summarize_reviews(x)
    # Assumes X is a pandas DataFrame: info() writes its report to stdout
    # itself and returns None, so wrapping it in a print only added a stray
    # "None" line (and a print statement is a syntax error on Python 3).
    X.info()
    return pd.merge(y, X, left_on=['inspec_id', 'business_id', 'id_'], right_index=True, how='inner')


# -----------MAIN-----------------------------
###############################################

if __name__ == '__main__':
    merged_az = open_pickle('../data/phx/phoenix_yelp_merge.pkl')

    # Columns handed to get_features_AZ as its i_cols argument.
    inspection_cols = [
        'n_priority', 'grade', 'purpose', 'n_violations',
        'v_core', 'sum_core',
        'v_foundation', 'sum_foundation',
        'v_priority', 'sum_priority',
    ]
    features_az = get_features_AZ(
        merged_az, '2012-04-01', 'phoenix', inspection_cols, routine=True)

    save_to_pickle(features_az, '../data/phx/phoenix_yelp_features.pkl')