Пример #1
0
	I = []
	for inspec_item, id_ in zip(inspec.values.flatten().tolist(), df.id_.values.flatten().tolist()):
	    temp = pd.DataFrame(inspec_item, columns=['date','score','grade','inspector'])
	    temp['id_'] = id_
	    temp['inspec_id'] = pd.Series(zip(temp.id_,temp.index)).apply(lambda x: '%s_%s' % (x[0],x[1]))
	    I.append(temp)
	    
	I = pd.concat(I, axis=0).reset_index(drop=True)
	if drop_flag:
		I.drop_duplicates(inplace=True)

	return I

def get_features_NC(df, min_date, city_tag, i_cols):
	"""Build the combined inspection/review feature table for the NC data.

	Args:
		df: Merged business frame; handed to get_NC_inspections,
			lib.state_yelp_reviews and lib.merge_inspec_dates.
		min_date: Forwarded to lib.state_yelp_reviews.
		city_tag: Forwarded to lib.state_yelp_reviews.
		i_cols: Forwarded to lib.merge_inspec_dates.

	Returns:
		The outer merge of the first frame returned by
		lib.merge_inspec_dates (on its 'inspec_id', 'business_id' and
		'id_' columns) with the index of the summarized review frame.
	"""
	inspections = get_NC_inspections(df)
	reviews = lib.state_yelp_reviews(df, min_date, city_tag)
	y, x = lib.merge_inspec_dates(inspections, df, reviews, i_cols)
	# DataFrame.info() writes its summary to stdout itself and returns None,
	# so it is called directly; printing its result only added a stray "None"
	# (and the print statement does not parse on Python 3).
	y.info()
	review_summary = lib.summarize_reviews(x)
	# Outer join: rows present on only one side are kept.
	# Assumes the summary is indexed by the three key fields -- TODO confirm.
	return pd.merge(y, review_summary, left_on=['inspec_id','business_id','id_'], right_index=True, how='outer')


# -----------MAIN-----------------------------
###############################################

if __name__ == '__main__':
	# Script entry point: read the merged Charlotte pickle, derive the
	# feature table from it, and pass the result to save_to_pickle.
	NC = open_pickle('../data/char/charlotte_yelp_merge.pkl')
	df_NC = get_features_NC(
		NC,
		min_date='2011-06-30',
		city_tag='charlotte',
		i_cols=['score','grade']
	)
	save_to_pickle(df_NC, '../data/char/charlotte_yelp_features.pkl')

Пример #2
0
    I.violations.fillna('', inplace=True)

    I['n_violations'] = I.violations.apply(lambda x: len(x.split(',')) if len(x) > 0 else 0)
    I['id_'] = I.permit_number
    I['inspec_id'] = I.serial_number

    I = I[I.permit_number.isin(df.permit_number.unique())]
    if drop_flag:
        I.drop_duplicates(inplace=True) 

    return I[I.permit_number.isin(df.permit_number.unique())]

def get_features_NV(df, min_date, city_tag, i_cols):
    """Build the combined inspection/review feature table for the NV data.

    Args:
        df: Merged business frame. If it has no 'id_' column, one is added
            in place from 'permit_number' (the caller's frame is modified).
        min_date: Forwarded to lib.state_yelp_reviews.
        city_tag: Forwarded to lib.state_yelp_reviews.
        i_cols: Forwarded to lib.merge_inspec_dates.

    Returns:
        The inner merge of the first frame returned by
        lib.merge_inspec_dates (on its 'inspec_id', 'business_id' and
        'id_' columns) with the index of the summarized review frame.
    """
    merge_keys = ['inspec_id', 'business_id', 'id_']

    # Downstream steps key on 'id_'; default it to the permit number.
    if 'id_' not in df.columns:
        df['id_'] = df.permit_number

    inspections = get_NV_inspections(df)
    reviews = lib.state_yelp_reviews(df, min_date, city_tag)
    targets, matched_reviews = lib.merge_inspec_dates(inspections, df, reviews, i_cols)
    review_summary = lib.summarize_reviews(matched_reviews)

    # Assumes the summary is indexed by the three key fields -- TODO confirm.
    return pd.merge(
        targets,
        review_summary,
        left_on=merge_keys,
        right_index=True,
        how='inner'
    )


# -----------MAIN-----------------------------
###############################################

if __name__ == '__main__':
    # Script entry point: read the merged Vegas pickle, derive the feature
    # table from it, and pass the result to save_to_pickle.
    NV = open_pickle('../data/vegas/vegas_yelp_merge.pkl')
    df_NV = get_features_NV(
        NV,
        min_date='1989-07-01',
        city_tag='vegas',
        i_cols=['demerits','grade', 'n_violations']
    )
    save_to_pickle(df_NV, '../data/vegas/vegas_yelp_features.pkl')

Пример #3
0
	    
	V = pd.concat(V, axis=0)
	V['cdc risk factor'].fillna('', inplace=True)

	V['critical'] = (~V['cdc risk factor'].isin(['','good retail practice'])).astype(int)
	df['n_violations'] = V.groupby(['id_','inspec_id']).count().reset_index(level=0).critical
	df['n_critical'] = V.groupby(['id_','inspec_id']).sum().reset_index(level=0).critical

	df['n_critical'].fillna(0, inplace=True)
	df['n_violations'].fillna(0, inplace=True)

	return V, df

def get_features_WI(df, min_date, city_tag, i_cols):
	"""Build the combined inspection/review feature table for the WI data.

	Args:
		df: Merged business frame; handed to get_WI_inspections,
			lib.state_yelp_reviews and lib.merge_inspec_dates.
		min_date: Forwarded to lib.state_yelp_reviews.
		city_tag: Forwarded to lib.state_yelp_reviews.
		i_cols: Forwarded to lib.merge_inspec_dates.

	Returns:
		The inner merge of the first frame returned by
		lib.merge_inspec_dates (on its 'inspec_id', 'business_id' and
		'id_' columns) with the index of the summarized review frame.
	"""
	merge_keys = ['inspec_id', 'business_id', 'id_']

	inspections = get_WI_inspections(df)
	# get_WI_violations returns a pair; only its second element (which
	# replaces the inspections frame) is used below.
	_violations, inspections = get_WI_violations(inspections)

	reviews = lib.state_yelp_reviews(df, min_date, city_tag)
	targets, matched_reviews = lib.merge_inspec_dates(inspections, df, reviews, i_cols)
	review_summary = lib.summarize_reviews(matched_reviews)

	# Assumes the summary is indexed by the three key fields -- TODO confirm.
	return pd.merge(
		targets,
		review_summary,
		left_on=merge_keys,
		right_index=True,
		how='inner'
	)


# -----------MAIN-----------------------------
###############################################

if __name__ == '__main__':
	# Script entry point: read the merged Madison pickle, derive the
	# feature table from it, and pass the result to save_to_pickle.
	WI = open_pickle('../data/mad/madison_yelp_merge.pkl')
	df_WI = get_features_WI(
		WI,
		min_date='2011-06-30',
		city_tag='madison',
		i_cols=['n_critical', 'n_violations']
	)
	save_to_pickle(df_WI, '../data/mad/madison_yelp_features.pkl')