Example #1
def preprocess_PA():
    """Load the Pittsburgh inspection data and add normalized lookup columns.

    Adds a standardized business name (``name_``), a lowercased city
    (``city_``), an ``id_`` column copied from ``client_id``, and the
    address split into its component columns.
    """
    df = lib.open_pickle('../data/pitt/pitt_inspections_FINAL.pkl')

    df['name_'] = df['name'].map(lib.standard_name)
    df['city_'] = df['city'].map(str.lower)
    df['id_'] = df['client_id']

    address_parts = df['address'].apply(split_address)
    return pd.concat([df, address_parts], axis=1)
Example #2
def unpack_data(fname="../data/phx/model_data_phx.pkl"):
    """Unpickle the Phoenix model bundle and return its components.

    Returns a tuple ``(df_AZ, tfs, tfs_vocab, tfs_h, tfs_h_vocab,
    A_labels, A_vocab)`` pulled from the pickled dict.
    """
    bundle = open_pickle(fname)
    # Keys in the pickle, in the order the caller expects them back.
    keys = ("df_AZ", "tfs", "tfidf_vocab", "tfs_h",
            "tfidf_h", "labels", "vocab")
    return tuple(bundle[key] for key in keys)
Example #3
def preprocess_NC():
    """Load the Charlotte health data and add normalized lookup columns.

    Builds a frame from the pickled records, then adds a lowercased,
    stripped city (``city_``), a standardized name (``name_``), the index
    copied into ``id_``, and the address split into component columns.
    """
    records = lib.open_pickle("../data/char/char_FULL_04.pkl")
    df = pd.DataFrame.from_records(records).T

    df["city_"] = df["city"].map(lambda c: c.lower().strip())
    df["name_"] = df["name"].map(lib.standard_name)
    df["id_"] = df.index

    address_parts = df["address"].apply(split_address)
    return pd.concat([df, address_parts], axis=1)
Example #4
def preprocess_WI():
    """Load the Madison health data, normalize names, and parse addresses.

    Adds ``name_`` (standardized) and ``id_`` (copied from the index),
    then parses each address (with ampersand handling) and joins the
    parsed columns, renaming the first one to ``city_`` so the frame
    matches the other preprocess_* helpers.
    """
    df = lib.open_pickle('../data/mad/mad_health_FINAL.pkl')

    df['name_'] = df['name'].map(lib.standard_name)
    df['id_'] = df.index

    parsed = df['address'].apply(lambda a: lib.parse_address(a, ampersand=True))
    # parse_address's first output column holds the city; rename it in place
    # to the shared `city_` convention.
    renamed = parsed.columns.values
    renamed[0] = 'city_'
    parsed.columns = renamed

    return pd.concat([df, parsed], axis=1)
Example #5
def preprocess_AZ():
    """Load the Phoenix restaurant records and add normalized lookup columns.

    Parses each address, renames the first parsed column to ``city_``,
    joins the parsed columns back onto the records, and adds ``name_``
    (standardized) and ``id_`` (copied from ``permit_id``).
    """
    restaurants = lib.open_pickle('../data/phx/phoenix_R_full.pkl')

    parsed = restaurants['address'].apply(lib.parse_address)
    parsed.set_index(restaurants.index, inplace=True)
    # First parsed column holds the city; rename it to the shared convention.
    renamed = parsed.columns.values
    renamed[0] = 'city_'
    parsed.columns = renamed

    merged = pd.concat([restaurants, parsed], axis=1)
    merged['name_'] = merged['name'].map(lib.standard_name)
    merged['id_'] = merged['permit_id']
    return merged
Example #6
def get_AZ_inspections(df, routine=False, drop_flag=True):
    """Load Phoenix inspections restricted to the permits present in `df`.

    Parameters:
        df: frame with a `permit_id` column; only inspections for those
            permits are kept.
        routine: if True, keep only "Routine Inspection" rows.
        drop_flag: if True, drop duplicate inspection rows in place.

    Returns:
        The filtered inspections frame with an `id_` column copied from
        `permit_id` and 'NA' priority counts replaced by -1.
    """
    I = open_pickle('../data/phx/phoenix_I_full.pkl')

    if routine:
        I = I[I.purpose == "Routine Inspection"]

    # BUG FIX: the original chained assignment
    #     I[I.n_priority == 'NA'].n_priority = -1
    # wrote to a temporary copy and silently left I unchanged
    # (pandas SettingWithCopy). Use .loc so the -1 sentinel actually lands.
    I.loc[I.n_priority == 'NA', 'n_priority'] = -1
    I['id_'] = I.permit_id

    I = I[I.permit_id.isin(df.permit_id.unique())]
    if drop_flag:
        I.drop_duplicates(inplace=True)

    return I
Example #7
def get_AZ_violations(df):
    """Load Phoenix violations and attach per-inspection violation stats to `df`.

    NOTE(review): mutates the passed-in inspections frame `df` in place —
    its index is replaced by `inspec_id` and seven count/sum columns are
    added — and the mutated frame is also returned.

    Parameters:
        df: inspections frame with `inspec_id` (see get_AZ_inspections).

    Returns:
        (V, df): the violations frame (with `id_`, `type`, and per-type
        count columns added) and the augmented inspections frame.
    """
    V = open_pickle('../data/phx/phoenix_V_full.pkl')
    V['id_'] = V.permit_id

    # Classify each violation from its free-text comments, then append the
    # per-type counts as extra columns, realigned to V's index.
    V['type'] = V.comments.apply(viol_type)
    V = pd.concat([V, V.comments.apply(count_viol_type).set_index(V.index)], axis=1)
    
    # Re-index the inspections by inspection id so the grouped counts below
    # align row-for-row when assigned.
    df.index = df.inspec_id

    # Total violations per (permit, inspection); 'code' here is just a
    # column to count on — presumably non-null per violation row (TODO confirm).
    df['n_violations'] = V.groupby(['id_','inspec_id']).count().reset_index(level=0)['code']
    # Per-category occurrence counts and sums via the project v_count/v_sum
    # helpers (semantics defined elsewhere in the project).
    df['v_core'] = v_count(df, V, 'n_core')
    df['sum_core'] = v_sum(df, V, 'n_core')
    df['v_foundation'] = v_count(df, V, 'n_foundation')
    df['sum_foundation'] = v_sum(df, V, 'n_foundation')
    df['v_priority'] = v_count(df, V, 'n_priority')
    df['sum_priority'] = v_sum(df, V, 'n_priority')

    return V, df
Example #8
def get_tf_pickled():
    """Return the pickled Yelp tf review data for Phoenix."""
    path = '../data/yelp/yelp_tf_reviews_phx.pkl'
    return open_pickle(path)
Example #9
def get_reviews():
    """Return the pickled Phoenix Yelp reviews."""
    path = '../data/yelp/yelp_reviews_phoenix.pkl'
    return open_pickle(path)
Example #10
    # NOTE(review): orphaned tail of a duplicated get_AZ_violations — the
    # enclosing `def` header is missing from this fragment, and these lines
    # repeat the end of the full function above. Likely a scrape/paste
    # artifact; candidate for removal once confirmed.
    df['v_foundation'] = v_count(df, V, 'n_foundation')
    df['sum_foundation'] = v_sum(df, V, 'n_foundation')
    df['v_priority'] = v_count(df, V, 'n_priority')
    df['sum_priority'] = v_sum(df, V, 'n_priority')

    return V, df

def get_features_AZ(df, min_date, city_tag, i_cols, routine=False):
    """Assemble the Phoenix feature matrix: inspections, violations, reviews.

    Parameters:
        df: merged restaurant frame; gets an `id_` column (from
            `permit_id`) added in place if missing.
        min_date: earliest review date passed to lib.state_yelp_reviews.
        city_tag: city tag for the Yelp review lookup.
        i_cols: inspection columns forwarded to lib.merge_inspec_dates.
        routine: if True, restrict to routine inspections.

    Returns:
        The inner merge of the inspection targets with the summarized
        review features, keyed on (inspec_id, business_id, id_).
    """
    if 'id_' not in df.columns:
        df['id_'] = df.permit_id
    I = get_AZ_inspections(df, routine=routine)
    V, I = get_AZ_violations(I)
    R = lib.state_yelp_reviews(df, min_date, city_tag)
    y, x = lib.merge_inspec_dates(I, df, R, i_cols)
    X = lib.summarize_reviews(x)
    # FIX: `print X.info()` is Python 2-only syntax; the single-argument
    # function form below prints identically on Python 2 and 3.
    print(X.info())
    return pd.merge(y, X, left_on=['inspec_id','business_id','id_'], right_index=True, how='inner')


# -----------MAIN-----------------------------
###############################################

if __name__ == '__main__':
    # Inspection-outcome columns carried through to the feature matrix.
    feature_cols = ['n_priority', 'grade', 'purpose', 'n_violations',
                    'v_core', 'sum_core', 'v_foundation', 'sum_foundation',
                    'v_priority', 'sum_priority']
    # Build the routine-inspection feature matrix for Phoenix and persist it.
    merged = open_pickle('../data/phx/phoenix_yelp_merge.pkl')
    features = get_features_AZ(merged, '2012-04-01', 'phoenix',
                               feature_cols, routine=True)
    save_to_pickle(features, '../data/phx/phoenix_yelp_features.pkl')