import pandas as pd

import lib
# open_pickle, save_to_pickle, split_address, viol_type, count_viol_type,
# v_count and v_sum are used unqualified below; they are assumed to be defined
# earlier in this module or imported from the project's helper library.


def preprocess_PA():
    """Load and normalize the Pittsburgh inspection records."""
    PA = lib.open_pickle('../data/pitt/pitt_inspections_FINAL.pkl')
    PA['name_'] = PA.name.apply(lib.standard_name)
    PA['city_'] = PA.city.apply(lambda x: x.lower())
    PA['id_'] = PA.client_id
    # split_address returns one Series per row, so apply() yields a DataFrame
    # of address components that is concatenated column-wise.
    PA = pd.concat([PA, PA.address.apply(split_address)], axis=1)
    return PA
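
# The pd.concat([..., df.address.apply(split_address)], axis=1) pattern above
# relies on the row-wise helper (split_address, defined elsewhere) returning a
# Series per row, so that .apply() produces a DataFrame of new address columns.
# Minimal illustrative sketch of that pattern with a stand-in splitter:
def _apply_concat_sketch():
    df = pd.DataFrame({"address": ["123 MAIN ST", "9 ELM AVE"]})
    parts = df.address.apply(
        lambda a: pd.Series({"street_num": a.split()[0],
                             "street": " ".join(a.split()[1:])}))
    # parts is a DataFrame aligned on df's index, so axis=1 concat adds columns.
    return pd.concat([df, parts], axis=1)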

def unpack_data(fname="../data/phx/model_data_phx.pkl"):
    """Unpack the pickled Phoenix model-data dict into its components."""
    data = open_pickle(fname)
    df_AZ = data["df_AZ"]
    tfs = data["tfs"]
    tfs_vocab = data["tfidf_vocab"]
    tfs_h = data["tfs_h"]
    tfs_h_vocab = data["tfidf_h"]
    A_labels = data["labels"]
    A_vocab = data["vocab"]
    return df_AZ, tfs, tfs_vocab, tfs_h, tfs_h_vocab, A_labels, A_vocab
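
# Hypothetical caller of unpack_data(); the pickle is assumed to be a dict
# holding exactly the keys read above. Illustrative only, never called here.
def _unpack_data_usage():
    (df_AZ, tfs, tfs_vocab, tfs_h, tfs_h_vocab,
     A_labels, A_vocab) = unpack_data()
    return df_AZ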

def preprocess_NC():
    """Load and normalize the Charlotte (NC) inspection records."""
    H = lib.open_pickle("../data/char/char_FULL_04.pkl")
    # Transpose so each row is one establishment (record ids become the index).
    H = pd.DataFrame.from_records(H).T
    H["city_"] = H.city.apply(lambda x: x.lower().strip())
    H["name_"] = H.name.apply(lib.standard_name)
    H["id_"] = H.index
    H = pd.concat([H, H.address.apply(split_address)], axis=1)
    return H

def preprocess_WI():
    """Load and normalize the Madison (WI) health inspection records."""
    H = lib.open_pickle('../data/mad/mad_health_FINAL.pkl')
    H['name_'] = H.name.apply(lib.standard_name)
    H['id_'] = H.index
    H_address = H.address.apply(lambda x: lib.parse_address(x, ampersand=True))
    # Rename the first parsed-address column to 'city_'.
    col = H_address.columns.values
    col[0] = 'city_'
    H_address.columns = col
    H = pd.concat([H, H_address], axis=1)
    return H

def preprocess_AZ():
    """Load and normalize the Phoenix (AZ) restaurant records."""
    AZ = lib.open_pickle('../data/phx/phoenix_R_full.pkl')
    H_address = AZ.address.apply(lib.parse_address)
    H_address.set_index(AZ.index, inplace=True)
    # Rename the first parsed-address column to 'city_'.
    col = H_address.columns.values
    col[0] = 'city_'
    H_address.columns = col
    H_AZ = pd.concat([AZ, H_address], axis=1)
    H_AZ['name_'] = H_AZ.name.apply(lib.standard_name)
    H_AZ['id_'] = H_AZ.permit_id
    return H_AZ
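
# The two preprocessors above rename the first parse_address column to 'city_'
# by mutating .columns.values in place. An equivalent, more explicit spelling
# (illustrative sketch; `parsed` stands in for lib.parse_address's output):
def _rename_first_column_sketch(parsed):
    return parsed.rename(columns={parsed.columns[0]: 'city_'})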

def get_AZ_inspections(df, routine=False, drop_flag=True):
    """Load Phoenix inspections, keeping only permits present in df."""
    I = open_pickle('../data/phx/phoenix_I_full.pkl')
    if routine:
        I = I[I.purpose == "Routine Inspection"]
    # Replace the 'NA' sentinel with -1; .loc is required so the assignment
    # hits the original frame rather than a temporary copy.
    I.loc[I.n_priority == 'NA', 'n_priority'] = -1
    I['id_'] = I.permit_id
    I = I[I.permit_id.isin(df.permit_id.unique())]
    if drop_flag:
        I.drop_duplicates(inplace=True)
    return I
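
# Why .loc above: chained indexing like I[mask].n_priority = -1 assigns to a
# temporary copy and leaves the original frame untouched. Toy demonstration:
def _loc_assignment_demo():
    toy = pd.DataFrame({'n_priority': ['NA', '2', 'NA']})
    toy[toy.n_priority == 'NA'].n_priority = -1          # no effect on toy
    toy.loc[toy.n_priority == 'NA', 'n_priority'] = -1   # updates toy in place
    return toy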

def get_AZ_violations(df):
    """Load Phoenix violations and attach per-inspection violation counts to df."""
    V = open_pickle('../data/phx/phoenix_V_full.pkl')
    V['id_'] = V.permit_id
    V['type'] = V.comments.apply(viol_type)
    V = pd.concat([V, V.comments.apply(count_viol_type).set_index(V.index)], axis=1)
    df.index = df.inspec_id
    # Count violation rows per (permit, inspection); after reset_index(level=0)
    # the result is indexed by inspec_id and aligns with df on assignment.
    df['n_violations'] = V.groupby(['id_', 'inspec_id']).count().reset_index(level=0)['code']
    df['v_core'] = v_count(df, V, 'n_core')
    df['sum_core'] = v_sum(df, V, 'n_core')
    df['v_foundation'] = v_count(df, V, 'n_foundation')
    df['sum_foundation'] = v_sum(df, V, 'n_foundation')
    df['v_priority'] = v_count(df, V, 'n_priority')
    df['sum_priority'] = v_sum(df, V, 'n_priority')
    return V, df
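
# How the n_violations assignment above aligns: counting rows per
# (permit, inspection) and dropping the permit level leaves a Series indexed
# by inspec_id, which pandas aligns against df's inspec_id index. Toy sketch:
def _violation_count_sketch():
    V_toy = pd.DataFrame({'id_': [1, 1, 2],
                          'inspec_id': [10, 10, 20],
                          'code': ['4-501', '3-302', '6-101']})
    I_toy = pd.DataFrame({'grade': ['A', 'B']}, index=[10, 20])
    counts = V_toy.groupby(['id_', 'inspec_id']).count().reset_index(level=0)['code']
    I_toy['n_violations'] = counts
    return I_toy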

def get_tf_pickled():
    """Load the pickled term-frequency data for Phoenix Yelp reviews."""
    return open_pickle('../data/yelp/yelp_tf_reviews_phx.pkl')


def get_reviews():
    """Load the pickled Phoenix Yelp reviews."""
    return open_pickle('../data/yelp/yelp_reviews_phoenix.pkl')

def get_features_AZ(df, min_date, city_tag, i_cols, routine=False):
    """Build the merged inspection + Yelp review feature table for Phoenix."""
    if 'id_' not in df.columns:
        df['id_'] = df.permit_id
    I = get_AZ_inspections(df, routine=routine)
    V, I = get_AZ_violations(I)
    R = lib.state_yelp_reviews(df, min_date, city_tag)
    y, x = lib.merge_inspec_dates(I, df, R, i_cols)
    X = lib.summarize_reviews(x)
    # DataFrame.info() prints its summary directly to stdout.
    X.info()
    return pd.merge(y, X, left_on=['inspec_id', 'business_id', 'id_'],
                    right_index=True, how='inner')


# -----------MAIN-----------------------------
###############################################
if __name__ == '__main__':
    AZ = open_pickle('../data/phx/phoenix_yelp_merge.pkl')
    df_AZ = get_features_AZ(AZ, '2012-04-01', 'phoenix',
                            ['n_priority', 'grade', 'purpose', 'n_violations',
                             'v_core', 'sum_core', 'v_foundation', 'sum_foundation',
                             'v_priority', 'sum_priority'],
                            routine=True)
    save_to_pickle(df_AZ, '../data/phx/phoenix_yelp_features.pkl')
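
# The final merge in get_features_AZ assumes lib.summarize_reviews returns a
# frame indexed by an (inspec_id, business_id, id_) MultiIndex, so the left_on
# columns pair with the right index levels. Toy sketch of that join shape
# (column names here are placeholders, not the real feature columns):
def _merge_on_multiindex_sketch():
    y = pd.DataFrame({'inspec_id': [10], 'business_id': ['b1'],
                      'id_': [1], 'grade': ['A']})
    X = pd.DataFrame({'n_reviews': [42]},
                     index=pd.MultiIndex.from_tuples(
                         [(10, 'b1', 1)],
                         names=['inspec_id', 'business_id', 'id_']))
    return pd.merge(y, X, left_on=['inspec_id', 'business_id', 'id_'],
                    right_index=True, how='inner')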