def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000  # crime rate per 10,000 residents

    f2 = np.dot(W2, Y)         # geographic spatial lag
    ftaxi = np.dot(F_taxi, Y)  # taxi-flow lag
    f = np.concatenate((D, f2, ftaxi, poi_dist), axis=1)
    mms = MinMaxScaler(copy=False)
    mms.fit(f)
    mms.transform(f)

    header = C[0] + ['spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
         'POI outdoors recreation', 'POI education', 'POI nightlife',
         'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)

    # fit the NB model in R and parse the space-separated coefficients
    nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca', 'coefficient'])
    print nbres
    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print coef
    return coef, header
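# The negative binomial regression fitted by nbr_eval.R uses a log link, so the
# expected crime rate of a region is exp(intercept + x . beta).  Below is a
# minimal sketch of turning the returned coefficients back into fitted rates; it
# assumes the first element of `coef` is the intercept and the remaining elements
# align with `header` (an assumption about the R script's output order, not part
# of the original module).
import numpy as np

def nb_fitted_rates(coef, f):
    """coef: [intercept, beta_1, ..., beta_p]; f: (n, p) scaled feature matrix."""
    beta = np.asarray(coef, dtype=float)
    X = np.hstack([np.ones((f.shape[0], 1)), f])  # prepend the intercept column
    return np.exp(X.dot(beta))                    # log link => exp of linear predictor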
def leaveOneOut_Input_v4(leaveOut):
    """
    Generate observation matrix and vectors Y, F.

    The observations are trimmed for the leave-one-out evaluation: leaveOut is
    the CA id to be left out, ranging from 1 to 77.
    """
    des, X = generate_corina_features('ca')
    X = np.delete(X, leaveOut - 1, 0)
    popul = X[:, 0].reshape(X.shape[0], 1)
    pvt = X[:, 2]  # poverty index of each CA
#    poi_cnt = getFourSquareCount(leaveOut)
#    poi_cnt = np.divide(poi_cnt, popul) * 10000
    poi_dist = getFourSquarePOIDistribution(leaveOut)
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca(leaveOut=leaveOut)
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca',
                                           leaveOut=leaveOut)
    F_taxi = getTaxiFlow(leaveOut=leaveOut)

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.delete(Y, leaveOut - 1, 0)
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    Yd = []
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array([F_dist[i, j],
                                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                                F_taxi[i, j]])
#                fij = np.concatenate((X[i], poi_dist[i], wij * Y[j][0]), 0)
                fij = np.concatenate((X[i], wij * Y[j][0]), 0)
                F.append(fij)
                Yd.append(Y[i])
    F = np.array(F)
    F = np.append(F, np.ones((F.shape[0], 1)), axis=1)  # np.append returns a copy
    Yd = np.array(Yd)
    Yd.resize((Yd.size, 1))
    return Yd, F
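# actualFlowInteraction(pvt[i], pvt[j]) is called above but not defined in this
# section.  The stand-in below is only a sketch of one plausible form, mirroring
# the poverty-difference weighting applied to the social lag in
# leaveOneOut_evaluation_onChicagoCrimeData (exp(-|poverty_i - poverty_j| / wC));
# it is an assumption, not the original definition.
import numpy as np

def _assumed_flow_interaction(pvt_i, pvt_j, wC=28.0):
    """Down-weight flow between CAs whose poverty indices differ greatly."""
    return np.exp(-abs(pvt_i - pvt_j) / wC)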
def explore_POI_dist():
    des, X = generate_corina_features('ca')
    popul = X[:, 0].reshape(X.shape[0], 1)
    poi_dist = getFourSquarePOIDistribution()
#    poi_dist = np.divide(poi_dist, popul) * 10000
    avgd = np.sum(poi_dist, axis=0) / poi_dist.shape[0]
#    plot(avgd)
    cnt = 0
    for row in poi_dist:
        if cnt % 5 == 0:
            figure()
            title('{0} - {1}'.format(cnt, cnt + 4))
        plot(row)
        cnt += 1
def correlation_POIdist_crime():
    """
    Calculate the correlation between the POI distribution and crime within
    each community area (CA). Within each CA, the crime count is the number of
    crimes in each tract, and the POI count is the number of POIs in each tract.
    """
    tracts = Tract.createAllTractObjects()
    ordkey = sorted(tracts.keys())
    CAs = {}
    for key, val in tracts.items():
        if val.CA not in CAs:
            CAs[val.CA] = [key]
        else:
            CAs[val.CA].append(key)

    Y = retrieve_crime_count(2010, col=['total'], region='tract')
    poi_dist = getFourSquarePOIDistribution(gridLevel='tract')

    Pearson = {}
    for cakey, calist in CAs.items():
        crime = []
        pois = []
        for tractkey in calist:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        # calculate the correlation between each POI category and crime
        pois = np.array(pois)
        crime = np.array(crime)
        pearson = []
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pearson.append(np.corrcoef(r)[0, 1])
        Pearson[cakey] = np.nan_to_num(pearson)

    P = []
    for key in range(1, 78):
        P.append(Pearson[key])
    np.savetxt("../R/poi_correlation_ca.csv", P, delimiter=",")
    return np.array(P)
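# correlation_POIdist_crime writes a 77 x 10 matrix to ../R/poi_correlation_ca.csv:
# one row per community area, one column per POI category, each entry a Pearson
# correlation between tract-level crime and that category.  A quick sketch for
# inspecting the file; the plotting layout is an assumption, not part of the
# original analysis.
import numpy as np
import matplotlib.pyplot as plt

def plot_poi_correlation_heatmap(fname="../R/poi_correlation_ca.csv"):
    """Heatmap of the per-CA POI/crime correlations saved above (a sketch)."""
    P = np.loadtxt(fname, delimiter=",")
    plt.imshow(P, aspect="auto", cmap="RdBu_r", vmin=-1, vmax=1)
    plt.xlabel("POI category")
    plt.ylabel("Community area")
    plt.colorbar(label="Pearson correlation with tract-level crime")
    plt.show()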
def extract_raw_samples(year=2010, crime_t=['total'], crime_rate=True):
    """
    Extract all samples with raw labels and features. Return None if the
    corresponding feature is not selected.

    This function is called only once, to avoid unnecessary disk I/O.

    Input:
        year       - which year to study
        crime_t    - crime types of interest, e.g. 'total'
        crime_rate - predict crime rate (True) or crime count (False)

    Output:
        Y  - crime rate / count
        D  - demo feature
        P  - POI feature
        Tf - taxi flow matrix (count)
        Gd - geo weight matrix
    """
    # Crime count
    y_cnt = retrieve_crime_count(year, col=crime_t)

    # Crime rate / count
    demo = generate_corina_features()
    population = demo[1][:, 0].reshape(demo[1].shape[0], 1)
    Y = y_cnt / population * 10000 if crime_rate else y_cnt
    assert Y.shape == (77, 1)

    # Demo features
    D = demo[1]

    # POI features
    P = getFourSquarePOIDistribution(useRatio=False)

    # Taxi flow matrix
    Tf = getTaxiFlow(normalization="none")

    # Geo weight matrix
    Gd = generate_geographical_SpatialLag_ca()

    return Y, D, P, Tf, Gd
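# A minimal usage sketch of extract_raw_samples: downstream code (e.g.
# NB_coefficients above) combines the raw pieces by turning the geographic and
# taxi-flow matrices into lag features (W * Y) and stacking them with the
# demographic and POI features.  The helper name and variable names below are
# illustrative only.
import numpy as np

def build_feature_matrix(year=2013):
    Y, D, P, Tf, Gd = extract_raw_samples(year=year, crime_t=['total'])
    spatial_lag = np.dot(Gd, Y)   # geographic spatial-lag feature
    taxi_lag = np.dot(Tf, Y)      # taxi-flow lag feature
    X = np.concatenate((D, P, spatial_lag, taxi_lag), axis=1)
    return X, Y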
def generateInput_v4(fout=False):
    """
    Generate the complete observation matrix.
    """
    des, X = generate_corina_features('ca')
    pvt = X[:, 2]  # poverty index of each CA
    popul = X[:, 0].reshape(X.shape[0], 1)
#    poi_cnt = getFourSquareCount()
#    poi_cnt = np.divide(poi_cnt, popul) * 10000
    poi_dist = getFourSquarePOIDistribution()
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca()
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca')
    F_taxi = getTaxiFlow()

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array([F_dist[i, j],
                                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                                F_taxi[i, j]])
#                fij = np.concatenate((X[i], poi_dist[i], wij * Y[j][0]), 0)
                fij = np.concatenate((X[i], wij * Y[j, 0]), 0)
                F.append(fij)
    F = np.array(F)
    F = np.append(F, np.ones((F.shape[0], 1)), axis=1)  # np.append returns a copy
    if fout:
        np.savetxt('../matlab/F.csv', F, delimiter=',')
    return Y, F
def correlation_POI_crime(gridLevel='tract', poiRatio=False):
    """
    Calculate the crime correlation for each POI category.
    """
    Y = retrieve_crime_count(2010, col=['total'], region=gridLevel)
    h, D = generate_corina_features(region='ca')
    popul = D[:, 0].reshape(D.shape[0], 1)
    poi_dist = getFourSquarePOIDistribution(gridLevel=gridLevel, useRatio=poiRatio)
    cate_label = ['Food', 'Residence', 'Travel', 'Arts & Entertainment',
                  'Outdoors & Recreation', 'College & Education', 'Nightlife',
                  'Professional', 'Shops', 'Event']

    if gridLevel == 'tract':
        tracts = Tract.createAllTractObjects()
        ordkey = sorted(tracts.keys())

        crime = []
        pois = []
        for tractkey in ordkey:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])

        pois = np.array(pois)
        crime = np.array(crime)
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pcc = np.corrcoef(r)[0, 1]
            print pcc
    elif gridLevel == 'ca':
        Y = np.divide(Y, popul) * 10000
        Y = Y.reshape((len(Y),))
        poi_dist = np.transpose(poi_dist)
        for i in range(poi_dist.shape[0]):
            poi = np.reshape(poi_dist[i, :], Y.shape)
            r, p = pearsonr(poi, Y)
            print cate_label[i], r, p
def line_POI_crime():
    d = getFourSquarePOIDistribution(gridLevel='ca')
    y = retrieve_crime_count(2010, col=['total'], region='ca')
    h, D = generate_corina_features(region='ca')
    popul = D[:, 0].reshape(D.shape[0], 1)
    hd = getFourSquarePOIDistributionHeader()
    yhat = np.divide(y, popul) * 10000

    for i in range(6, 8):
        plt.figure()
        plt.scatter(d[:, i], y)
        plt.xlim(0, 1000)
        plt.xlabel('POI count -- {0} category'.format(hd[i]))
        plt.ylabel('Crime count')

        plt.figure()
        plt.scatter(d[:, i], yhat)
        plt.xlim(0, 1000)
        plt.xlabel('POI count -- {0} category'.format(hd[i]))
        plt.ylabel('Crime rate (per 10,000)')
def permutationTest_accuracy(iters, permute='taxiflow'):
    """
    Evaluate crime-rate prediction with the full feature set (Corina
    demographics, spatial lag, taxi flow, POI distribution) on 2013 data at the
    CA level, with leave-one-out evaluation and a permutation test on one
    feature.

    Permuting one feature 1000 times takes roughly 30-40 minutes.
    The results are dumped as "permute-{feature}.pickle".
    """
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=2013)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    nb_mae = []
    nb_mre = []
    lr_mae = []
    lr_mre = []
    for i in range(iters):
        if permute == 'corina':
            D = np.random.permutation(D)
        elif permute == 'spatiallag':
            yhat = np.random.permutation(Y)
            f2 = np.dot(W2, yhat)
        elif permute == 'taxiflow':
            yhat = np.random.permutation(Y)
            ftaxi = np.dot(F_taxi, yhat)  # use the permuted Y for the taxi-flow lag
        elif permute == 'POIdist':
            poi_dist = np.random.permutation(poi_dist)

        f = np.ones(f2.shape)
        f = np.concatenate((f, D, f2, ftaxi, poi_dist), axis=1)
        header = ['intercept'] + C[0] + ['spatiallag', 'taxiflow'] + \
            ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
             'POI outdoors recreation', 'POI education', 'POI nightlife',
             'POI professional', 'POI shops', 'POI event']
        df = pd.DataFrame(f, columns=header)
        np.savetxt("Y.csv", Y, delimiter=",")
        df.to_csv("f.csv", sep=",", index=False)

        # NB permute
        nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca'])
        ls = nbres.split(' ')
        nb_mae.append(float(ls[0]))
        nb_mre.append(float(ls[2]))

        mae2, mre2 = permutation_Test_LR(Y, f)
        lr_mae.append(mae2)
        lr_mre.append(mre2)
        if i % 10 == 0:
            print i

    print '{0} iterations finished.'.format(iters)
    print pvalue(412.305, lr_mae), pvalue(0.363, lr_mre), \
        pvalue(319.86, nb_mae), pvalue(0.281, nb_mre)
    return nb_mae, nb_mre, lr_mae, lr_mre
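# pvalue(observed, permuted) is used above but not defined in this section.  For
# a permutation test, the usual empirical one-sided p-value is the fraction of
# permuted-feature errors that are at least as good as (<=) the observed error.
# The sketch below assumes that convention; it is not the original implementation.
import numpy as np

def _empirical_pvalue(observed_error, permuted_errors):
    permuted_errors = np.asarray(permuted_errors, dtype=float)
    return np.mean(permuted_errors <= observed_error)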
def leaveOneOut_evaluation_onChicagoCrimeData(year=2010, features=["all"],
                                              crime_t=['total'], flow_type=0,
                                              verboseoutput=False, region='ca',
                                              weightSocialFlow=True, useRate=True,
                                              logFeatures=[]):
    """
    Generate the social lag from the previous year; use income/race/education of
    the current year.
    """
    warnings.warn("The leave one out in nbr_eval.R is unfair")
    if 'sociallag' in features:
        W = generate_transition_SocialLag(year, lehd_type=flow_type, region=region,
                                          normalization='pair')

    # add POI distribution and taxi flow
    poi_dist = getFourSquarePOIDistribution(useRatio=False, gridLevel=region)
    F_taxi = getTaxiFlow(normalization="bydestination", gridLevel=region)

    if region == 'ca':
        W2 = generate_geographical_SpatialLag_ca()
        Yhat = retrieve_crime_count(year - 1, col=crime_t)
#        h = retrieve_health_data()
#        Y = h[0].reshape((77,1))
        Y = retrieve_crime_count(year, col=crime_t)
        C = generate_corina_features()
        popul = C[1][:, 0].reshape(C[1].shape[0], 1)

        if 'sociallag' in features:
            # use poverty demographics to weight the social lag
            wC = 28  # 130.0 if useRate else 32.0  # constant parameter
            if weightSocialFlow:
                poverty = C[1][:, 2]
                for i in range(W.shape[0]):
                    for j in range(W.shape[1]):
                        W[i][j] *= np.exp(-np.abs(poverty[i] - poverty[j]) / wC)

        # crime count is normalized by the total population as crime rate;
        # here we use the crime count per 10 thousand residents
        if useRate:
            Y = np.divide(Y, popul) * 10000
            Yhat = np.divide(Yhat, popul) * 10000
    elif region == 'tract':
        W2, tractkey = generate_geographical_SpatialLag()
        Yhat_map = retrieve_crime_count(year - 1, col=crime_t, region='tract')
        Yhat = np.array([Yhat_map[k] for k in tractkey]).reshape(len(Yhat_map), 1)
        Y_map = retrieve_crime_count(year, col=crime_t, region='tract')
        Y = np.array([Y_map[k] for k in tractkey]).reshape(len(Y_map), 1)

        C = generate_corina_features(region='tract')
        C_mtx = []
        cnt = 0
        for k in tractkey:
            if k in C[1]:
                C_mtx.append(C[1][k])
            else:
                cnt += 1
                C_mtx.append([0 for i in range(7)])
        C = (C[0], np.array(C_mtx))

        # at the tract level we don't normalize by population, since a tract is
        # defined as a region with around 2000 population
        if useRate:
            pass

    i = retrieve_income_features()
    e = retrieve_education_features()
    r = retrieve_race_features()

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # add intercept
    columnName = ['intercept']
    f = np.ones(f2.shape)
    lrf = np.copy(f)

    if "all" in features:
        f = np.concatenate((f, f1, i[1], e[1], r[1]), axis=1)
        f = pd.DataFrame(f, columns=['social lag'] + i[0] + e[0] + r[0])
    if "sociallag" in features:
        f1 = np.dot(W, Y)
        if 'sociallag' in logFeatures:
            f = np.concatenate((f, np.log(f1)), axis=1)
        else:
            f = np.concatenate((f, f1), axis=1)
        lrf = np.concatenate((f, f1), axis=1)
        columnName += ['social lag']
    if "income" in features:
        f = np.concatenate((f, i[1]), axis=1)
        lrf = np.concatenate((f, i[1]), axis=1)
        columnName += i[0]
    if "race" in features:
        f = np.concatenate((f, r[1]), axis=1)
        lrf = np.concatenate((f, r[1]), axis=1)
        columnName += r[0]
    if "education" in features:
        f = np.concatenate((f, e[1]), axis=1)
        lrf = np.concatenate((f, e[1]), axis=1)
        columnName += e[0]
    if 'corina' in features:
        f = np.concatenate((f, C[1]), axis=1)
        lrf = np.concatenate((f, C[1]), axis=1)
        columnName += C[0]
    if 'spatiallag' in features:
        if 'spatiallag' in logFeatures:
            f = np.concatenate((f, np.log(f2)), axis=1)
        else:
            f = np.concatenate((f, f2), axis=1)
        lrf = np.concatenate((f, f2), axis=1)
        columnName += ['spatial lag']
    if 'taxiflow' in features:
        if 'taxiflow' in logFeatures:
            f = np.concatenate((f, np.log(ftaxi)), axis=1)
        else:
            f = np.concatenate((f, ftaxi), axis=1)
        lrf = np.concatenate((f, ftaxi), axis=1)
        columnName += ['taxi flow']
    if 'POIdist' in features:
        f = np.concatenate((f, poi_dist), axis=1)
        lrf = np.concatenate((f, poi_dist), axis=1)
        columnName += ['POI food', 'POI residence', 'POI travel',
                       'POI arts entertainment', 'POI outdoors recreation',
                       'POI education', 'POI nightlife', 'POI professional',
                       'POI shops', 'POI event']
    if 'temporallag' in features:
        f = np.concatenate((f, np.log(Yhat)), axis=1)
        lrf = np.concatenate((f, Yhat), axis=1)
        columnName += ['temporal lag']

    nbres = NB_training_R(f, columnName, Y, region, verboseoutput)
    print NB_training_python(f, Y)
    mae2, var2, mre2 = LR_training_python(lrf, Y, verboseoutput)

    if verboseoutput:
        print "Linear Regression MAE", mae2, "std", var2, "MRE", mre2
    else:
        print nbres
        print mae2, var2, mre2
    return np.array([[float(ele) for ele in nbres.split(" ")],
                     [mae2, var2, mre2]])
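# A usage sketch (not part of the original module): evaluate the full CA-level
# model against a demographics-only baseline.  Only feature names that appear in
# leaveOneOut_evaluation_onChicagoCrimeData are used; the specific combinations
# shown here are assumptions.
def compare_full_vs_demographic_baseline(year=2010):
    full = leaveOneOut_evaluation_onChicagoCrimeData(
        year=year, features=['corina', 'spatiallag', 'taxiflow', 'POIdist'],
        crime_t=['total'], region='ca', useRate=True)
    baseline = leaveOneOut_evaluation_onChicagoCrimeData(
        year=year, features=['corina'], crime_t=['total'], region='ca')
    return full, baseline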