def main_evaluate_different_years(year):
    import pickle
    # N is the module-level number of community areas (77 for Chicago)
    Y, D, P, Tf, Gd = extract_raw_samples(year, crime_t=['total'])
    # use hourly crime rate as label
    # Yh = pickle.load(open("chicago-hourly-crime-{0}.pickle".format(year)))
    # Yh = Yh / D[:,0] * 10000
    # use average income as label
    # header, income = retrieve_income_features()
    # Yh = np.repeat(income[:,0,None], 24, axis=1)
    # Yh = Yh.T
    # use average house price as label
    Yh = retrieve_averge_house_price()
    Yh = np.repeat(Yh[:, None], 24, axis=1)
    Yh = Yh.T
    assert Yh.shape == (24, N)

    MAE = []
    MRE = []
    for h in range(24):
        Tf = getTaxiFlow(filename="/taxi-CA-h{0}.matrix".format(h))
        mae, mre = leaveOneOut_error(Yh[h, :].reshape((N, 1)), D, P, Tf,
                                     Yh[h, :].reshape((N, 1)), Gd,
                                     Yh[h, :].reshape((N, 1)),
                                     features=['demo', 'poi'],
                                     taxi_norm="bydestination")
        print h, mae, mre
        MAE.append(mae)
        MRE.append(mre)
    # report per-year averages (the stale loop variable h was dropped here)
    print year, np.mean(MAE), np.mean(MRE)
    with open("kdd16-eval-{0}.pickle".format(year), "w") as fout:
        pickle.dump(MAE, fout)
        pickle.dump(MRE, fout)
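# leaveOneOut_error is not defined in this file. A minimal sketch of the two
# metrics it reports, assuming the usual definitions (mean absolute error,
# and total absolute error relative to total ground truth); the per-CA model
# fitting of the real leaveOneOut_error is not reproduced here:
def mae_mre_sketch(y_true, y_pred):
    """MAE and MRE of two aligned (N, 1) label vectors."""
    abs_err = np.abs(y_true - y_pred)
    mae = np.mean(abs_err)
    mre = np.sum(abs_err) / np.sum(y_true)
    return mae, mre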
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]
    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    f = np.concatenate((D, f2, ftaxi, poi_dist), axis=1)
    mms = MinMaxScaler(copy=False)
    mms.fit(f)
    mms.transform(f)
    header = C[0] + ['spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
         'POI outdoors recreation', 'POI education', 'POI nightlife',
         'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)

    # NB permute
    nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca', 'coefficient'])
    print nbres

    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print coef
    return coef, header
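# NB_coefficients delegates the actual regression to nbr_eval.R. A rough
# Python-side equivalent is sketched below with statsmodels' negative
# binomial GLM; this is an illustrative assumption -- the R script (likely
# MASS::glm.nb) estimates the dispersion parameter differently.
def nb_coefficients_python_sketch(f, Y):
    import statsmodels.api as sm
    X = sm.add_constant(f)                                   # intercept column
    res = sm.GLM(Y, X, family=sm.families.NegativeBinomial()).fit()
    return res.params                                        # coefficient vector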
def leaveOneOut_Input_v4(leaveOut):
    """
    Generate observation matrix and vectors Y, F.

    The observations are trimmed for the leave-one-out evaluation;
    leaveOut indicates the CA id to be left out, ranging from 1-77.
    """
    des, X = generate_corina_features('ca')
    X = np.delete(X, leaveOut - 1, 0)
    popul = X[:, 0].reshape(X.shape[0], 1)
    pvt = X[:, 2]    # poverty index of each CA
    # poi_cnt = getFourSquareCount(leaveOut)
    # poi_cnt = np.divide(poi_cnt, popul) * 10000
    poi_dist = getFourSquarePOIDistribution(leaveOut)
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca(leaveOut=leaveOut)
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0,
                                           region='ca', leaveOut=leaveOut)
    F_taxi = getTaxiFlow(leaveOut=leaveOut)

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.delete(Y, leaveOut - 1, 0)
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    Yd = []
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array([F_dist[i, j],
                                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                                F_taxi[i, j]])
                # fij = np.concatenate( (X[i], poi_dist[i], wij * Y[j][0]), 0)
                fij = np.concatenate((X[i], wij * Y[j][0]), 0)
                F.append(fij)
                Yd.append(Y[i])
    F = np.array(F)
    # np.append returns a new array; without the reassignment below the
    # intercept column was silently discarded.
    F = np.append(F, np.ones((F.shape[0], 1)), axis=1)
    Yd = np.array(Yd)
    Yd.resize((Yd.size, 1))
    return Yd, F
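# Typical use of leaveOneOut_Input_v4: loop over all 77 CAs, fit on the
# trimmed pairwise observations, and score the held-out CA. The estimator
# call below is a hypothetical placeholder, not a function in this file.
# for ca_id in range(1, 78):
#     Yd, F = leaveOneOut_Input_v4(ca_id)
#     fit_and_score(Yd, F, ca_id)    # hypothetical helper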
def correlation_taxiflow_crime(flowPercentage=True, crimeRate=True):
    """
    Correlation between taxi flow and crime.
    """
    s = getTaxiFlow(usePercentage=flowPercentage)
    Y = retrieve_crime_count(2010, region='ca')
    if crimeRate:
        h, D = generate_corina_features(region='ca')
        popul = D[:, 0].reshape(D.shape[0], 1)
        Y = np.divide(Y, popul) * 10000
    f1 = np.dot(s, Y)

    r = np.hstack((f1, Y))
    r = np.transpose(r)
    pcc = np.corrcoef(r)
    print pcc
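# np.corrcoef treats each row of r as one variable, so the printed 2x2 matrix
# carries the taxi-flow/crime Pearson correlation in its off-diagonal entry
# pcc[0, 1]. A self-contained helper capturing just that scalar (an added
# illustration, not part of the original code):
def pearson_sketch(u, v):
    """Pearson correlation between two equal-length vectors."""
    return np.corrcoef(np.vstack((u.ravel(), v.ravel())))[0, 1]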
def generateInput_v4(fout=False):
    """
    Generate the complete observation matrix.
    """
    des, X = generate_corina_features('ca')
    pvt = X[:, 2]    # poverty index of each CA
    popul = X[:, 0].reshape(X.shape[0], 1)
    # poi_cnt = getFourSquareCount()
    # poi_cnt = np.divide(poi_cnt, popul) * 10000
    poi_dist = getFourSquarePOIDistribution()
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca()
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca')
    F_taxi = getTaxiFlow()

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i != j:
                wij = np.array([F_dist[i, j],
                                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                                F_taxi[i, j]])
                # fij = np.concatenate( (X[i], poi_dist[i], wij * Y[j][0]), 0)
                fij = np.concatenate((X[i], wij * Y[j, 0]), 0)
                F.append(fij)
    F = np.array(F)
    # reassign: np.append does not modify F in place
    F = np.append(F, np.ones((F.shape[0], 1)), axis=1)
    if fout:
        np.savetxt('../matlab/F.csv', F, delimiter=',')
    return Y, F
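# Note on shape: generateInput_v4 emits one row per ordered CA pair, i.e.
# n * (n - 1) = 5852 rows for the 77 Chicago CAs. Each row holds the source
# CA's demographics plus the three pairwise weights (geo distance, poverty-
# weighted LEHD flow, taxi flow) scaled by the destination CA's crime rate.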
def extract_raw_samples(year=2010, crime_t=['total'], crime_rate=True):
    """
    Extract all samples with raw labels and features. Return None if the
    corresponding feature is not selected.

    This function is called once only, to avoid unnecessary disk I/O.

    Input:
        year       - which year to study
        crime_t    - crime types of interest, e.g. 'total'
        crime_rate - predict crime rate (True) or crime count (False)
    Output:
        Y  - crime rate / count
        D  - demo feature
        P  - POI feature
        Tf - taxi flow matrix (count)
        Gd - geo weight matrix
    """
    # Crime count
    y_cnt = retrieve_crime_count(year, col=crime_t)

    # Crime rate / count
    demo = generate_corina_features()
    population = demo[1][:, 0].reshape(demo[1].shape[0], 1)
    Y = y_cnt / population * 10000 if crime_rate else y_cnt
    assert Y.shape == (N, 1)    # N = 77 community areas

    # Demo features
    D = demo[1]
    # POI features
    P = getFourSquarePOIDistribution(useRatio=False)
    # Taxi flow matrix
    Tf = getTaxiFlow(normalization="none")
    # Geo weight matrix
    Gd = generate_geographical_SpatialLag_ca()
    return Y, D, P, Tf, Gd
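# The raw pieces returned by extract_raw_samples are typically combined into
# one design matrix the same way NB_coefficients and prepare_features do:
# demographics, geo-weighted crime (spatial lag), taxi-weighted crime (taxi
# flow), and POI distribution. A minimal sketch of that combination step:
def build_design_matrix(Y, D, P, Tf, Gd):
    f_spatial = np.dot(Gd, Y)        # spatial lag feature
    f_taxi = np.dot(Tf, Y)           # taxi flow feature
    intercept = np.ones(Y.shape)
    return np.concatenate((intercept, D, f_spatial, f_taxi, P), axis=1)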
def prepare_features(features=["poi", "taxi", "demos", "spatiallag"], leaveOneOut=-1): Y = retrieve_crime_count(year=2013) Y = Y.reshape((-1, 1)) if leaveOneOut > 0: Y = np.delete(Y, leaveOneOut - 1, 0) if "poi" in features: poi_dist = getFourSquareCount(leaveOut=leaveOneOut) if "taxi" in features: F_taxi = getTaxiFlow(leaveOut=leaveOneOut, normalization="bysource") if "demos" in features: C = generate_corina_features(leaveOut=leaveOneOut) demos = [ 'total population', 'population density', 'disadvantage index', 'residential stability', 'ethnic diversity' ] demos_idx = [C[0].index(ele) for ele in demos] D = C[1][:, demos_idx] popul = C[1][:, 0].reshape(C[1].shape[0], 1) Y = np.divide(Y, popul) * 10000 if "spatiallag" in features: W2 = generate_geographical_SpatialLag_ca(leaveOut=leaveOneOut) f2 = np.dot(W2, Y) ftaxi = np.dot(F_taxi, Y) f = np.ones(f2.shape) f = np.concatenate((f, D, f2, ftaxi, poi_dist), axis=1) S = np.zeros(W2.shape) S[W2 > 0] = 1 # S = np.ones(W2.shape) return Y, f, S
def prepare_features(features=["poi", "taxi", "demos", "spatiallag"], leaveOneOut=-1): Y = retrieve_crime_count(year=2013) Y = Y.reshape((-1,1)) if leaveOneOut > 0: Y = np.delete(Y, leaveOneOut-1, 0) if "poi" in features: poi_dist = getFourSquareCount(leaveOut=leaveOneOut) if "taxi" in features: F_taxi = getTaxiFlow(leaveOut=leaveOneOut, normalization="bysource") if "demos" in features: C = generate_corina_features(leaveOut=leaveOneOut) demos = ['total population', 'population density', 'disadvantage index', 'residential stability', 'ethnic diversity'] demos_idx = [C[0].index(ele) for ele in demos] D = C[1][:,demos_idx] popul = C[1][:,0].reshape(C[1].shape[0],1) Y = np.divide(Y, popul) * 10000 if "spatiallag" in features: W2 = generate_geographical_SpatialLag_ca(leaveOut=leaveOneOut) f2 = np.dot(W2, Y) ftaxi = np.dot(F_taxi, Y) f = np.ones(f2.shape) f = np.concatenate( (f, D, f2, ftaxi, poi_dist), axis=1 ) S = np.zeros(W2.shape) S[W2 > 0] = 1 # S = np.ones(W2.shape) return Y, f, S
def line_taxiflow_crime():
    s = getTaxiFlow(normalization='bydestination')
    Y = retrieve_crime_count(2010, col=['total'], region='ca')
    h, D = generate_corina_features(region='ca')
    popul = D[:, 0].reshape(D.shape[0], 1)
    Y = np.divide(Y, popul) * 10000
    f1 = np.dot(s, Y)

    plt.figure()
    plt.scatter(f1, Y)
    plt.axis([0, 6000, 0, 6000])
    idx = [31, 46]    # 0-based indices of CA #32 and CA #47
    sf1 = f1[idx]
    sY = Y[idx]
    plt.scatter(sf1, sY, edgecolors='red', s=50, linewidths=2)
    plt.figtext(0.33, 0.8, '#32', fontsize='large')
    plt.figtext(0.75, 0.34, '#47', fontsize='large')
    plt.xlabel('Hyperlink by taxi flow feature value', fontsize='x-large')
    plt.ylabel('Crime rate', fontsize='x-large')
    plt.savefig('taxi-flow-percent.pdf', format='pdf')
    return f1
def coefficients_pvalue(lagsFlag, tempflag="templag", selfflow="selfflow",
                        itersN="10", exposure="exposure", year=2010,
                        lehdType="total", crimeType='total'):
    """Return the p-value of the Negative Binomial model coefficients.

    Permutation test + leave-one-out evaluation. Retrieve the leave-one-out
    error distribution to determine the p-value.

    The model to be evaluated is the NB model. The features used in this
    model only include spatial lag, social lag, and demographics.

    Keyword arguments:
    lehdType  -- the type of LEHD flow (default "total", alternative "lowincome")
    crimeType -- the type of predicted crime (default "total", alternative "violent")
    exposure  -- exposure or noexposure
    lagsFlag  -- social lag, spatial lag, social lag disadv, spatial lag disadv
    """
    C = generate_corina_features('ca')
    demo = pd.DataFrame(data=C[1], columns=C[0], dtype="float")
    W1 = generate_geographical_SpatialLag_ca()
    # the LEHD type
    if lehdType == "lowincome":
        W2 = generate_transition_SocialLag(year=year, lehd_type=4, region='ca',
                                           normalization='none')
    elif lehdType == "total":
        W2 = generate_transition_SocialLag(year=year, lehd_type=0, region='ca',
                                           normalization='none')
    elif lehdType == "taxi":
        W2 = getTaxiFlow(normalization="none")

    if selfflow == 'selfflow':
        s = [W2[i, i] for i in range(W2.shape[0])]
        np.savetxt(here + "/../R/pvalue-selfflow.csv", s)
    for i in range(W2.shape[0]):
        W2[i, i] = 0

    # the predicted crime type
    violentCrime = ['HOMICIDE', 'CRIM SEXUAL ASSAULT', 'BATTERY', 'ROBBERY',
                    'ARSON', 'DOMESTIC VIOLENCE', 'ASSAULT']
    if crimeType == 'total':
        Y = retrieve_crime_count(year=year, col=['total'], region='ca')
        if tempflag == "templag":
            ystart = (year - 3) if year - 3 >= 2003 else 2003
            tlag = []
            for ytmp in range(ystart, year):
                yt = retrieve_crime_count(year=ytmp, col=['total'], region='ca')
                tlag.append(yt)
            yt = np.mean(tlag, axis=0)
            assert yt.shape == Y.shape
            np.savetxt(here + "/../R/pvalue-templag.csv", yt)
    elif crimeType == 'violent':
        Y = retrieve_crime_count(year=year, col=violentCrime, region='ca')
        if tempflag == "templag":
            ystart = (year - 3) if year - 3 >= 2003 else 2003
            tlag = []
            for ytmp in range(ystart, year):
                yt = retrieve_crime_count(year=ytmp, col=violentCrime, region='ca')
                tlag.append(yt)
            yt = np.mean(tlag, axis=0)
            assert yt.shape == Y.shape
            np.savetxt(here + "/../R/pvalue-templag.csv", yt)

    demo.to_csv(here + "/../R/pvalue-demo.csv", index=False)
    np.savetxt(here + "/../R/pvalue-spatiallag.csv", W1, delimiter=",")
    np.savetxt(here + "/../R/pvalue-sociallag.csv", W2, delimiter=",")
    np.savetxt(here + "/../R/pvalue-crime.csv", Y)

    # use a multiprocessing Pool to run the R subprocesses in parallel
    socialNorm = ['bydestination', 'bysource', 'bypair']
    os.chdir(here + "/../R")
    from multiprocessing import Pool, cpu_count
    subProcessPool = Pool(cpu_count() / 2)
    for sn in socialNorm[1:2]:
        for logpop in ["logpop", "pop"][0:1]:
            for logpopden in ["logpopdensty", "popdensty"][0:1]:
                subProcessPool.apply_async(subPworker,
                                           (lehdType, crimeType, sn, exposure,
                                            logpop, lagsFlag, itersN,
                                            logpopden, tempflag, selfflow))
                # p = subprocess.Popen(['Rscript', 'pvalue-evaluation.R',
                #                       lehdType+"lehd", crimeType+"crime", sn,
                #                       exposure, logpop, lagsFlag, itersN,
                #                       logpopden, tempflag, selfflow])
                # p.wait()
    subProcessPool.close()
    subProcessPool.join()
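# subPworker is not defined in this file. Judging from the commented-out
# subprocess call above, it is assumed to be a thin wrapper that invokes
# pvalue-evaluation.R with the same argument order; a sketch:
def subPworker_sketch(lehdType, crimeType, sn, exposure, logpop,
                      lagsFlag, itersN, logpopden, tempflag, selfflow):
    import subprocess
    p = subprocess.Popen(['Rscript', 'pvalue-evaluation.R',
                          lehdType + "lehd", crimeType + "crime", sn,
                          exposure, logpop, lagsFlag, itersN,
                          logpopden, tempflag, selfflow])
    return p.wait()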
def permutationTest_accuracy(iters, permute='taxiflow'):
    """
    Evaluate crime rate with the full feature set:
        Corina, spatiallag, taxiflow, POIdist
    evaluated on 2013 at CA level with leave-one-out.

    Permutation: permute one feature. 1000 iterations take roughly 30-40
    minutes. The results are dumped as "permute-{feature}.pickle".
    """
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=2013)
    C = generate_corina_features()
    D = C[1]
    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    nb_mae = []
    nb_mre = []
    lr_mae = []
    lr_mre = []
    for i in range(iters):
        if permute == 'corina':
            D = np.random.permutation(D)
        elif permute == 'spatiallag':
            yhat = np.random.permutation(Y)
            f2 = np.dot(W2, yhat)
        elif permute == 'taxiflow':
            yhat = np.random.permutation(Y)
            # bug fix: was np.dot(F_taxi, Y), which left the taxi flow
            # feature unpermuted and made this branch a no-op
            ftaxi = np.dot(F_taxi, yhat)
        elif permute == 'POIdist':
            poi_dist = np.random.permutation(poi_dist)

        f = np.ones(f2.shape)
        f = np.concatenate((f, D, f2, ftaxi, poi_dist), axis=1)
        header = ['intercept'] + C[0] + ['spatiallag', 'taxiflow'] + \
            ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
             'POI outdoors recreation', 'POI education', 'POI nightlife',
             'POI professional', 'POI shops', 'POI event']
        df = pd.DataFrame(f, columns=header)
        np.savetxt("Y.csv", Y, delimiter=",")
        df.to_csv("f.csv", sep=",", index=False)

        # NB permute
        nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca'])
        ls = nbres.split(' ')
        nb_mae.append(float(ls[0]))
        nb_mre.append(float(ls[2]))

        mae2, mre2 = permutation_Test_LR(Y, f)
        lr_mae.append(mae2)
        lr_mre.append(mre2)
        if i % 10 == 0:
            print i
    print '{0} iterations finished.'.format(iters)
    # the constants are presumably the observed (unpermuted) model errors
    # hard-coded from a previous run
    print pvalue(412.305, lr_mae), pvalue(0.363, lr_mre), \
        pvalue(319.86, nb_mae), pvalue(0.281, nb_mre)
    return nb_mae, nb_mre, lr_mae, lr_mre
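# pvalue is not defined in this file. It is assumed to be the standard
# empirical one-sided p-value of a permutation test: the fraction of
# permutation errors at least as good as (<=) the observed model error.
# Exact tie handling in the original is unknown; a minimal sketch:
def pvalue_sketch(observed, permuted_errors):
    permuted_errors = np.asarray(permuted_errors, dtype=float)
    return np.mean(permuted_errors <= observed)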
def leaveOneOut_evaluation_onChicagoCrimeData(year=2010, features=["all"],
                                              crime_t=['total'], flow_type=0,
                                              verboseoutput=False, region='ca',
                                              weightSocialFlow=True,
                                              useRate=True, logFeatures=[]):
    """
    Generate the social lag from the previous year; use income/race/education
    of the current year.
    """
    warnings.warn("The leave one out in nbr_eval.R is unfair")
    if 'sociallag' in features:
        W = generate_transition_SocialLag(year, lehd_type=flow_type,
                                          region=region, normalization='pair')

    # add POI distribution and taxi flow
    poi_dist = getFourSquarePOIDistribution(useRatio=False, gridLevel=region)
    F_taxi = getTaxiFlow(normalization="bydestination", gridLevel=region)

    if region == 'ca':
        W2 = generate_geographical_SpatialLag_ca()
        Yhat = retrieve_crime_count(year - 1, col=crime_t)
        # h = retrieve_health_data()
        # Y = h[0].reshape((77,1))
        Y = retrieve_crime_count(year, col=crime_t)
        C = generate_corina_features()
        popul = C[1][:, 0].reshape(C[1].shape[0], 1)

        if 'sociallag' in features:
            # use poverty demographics to weight the social lag
            wC = 28    # constant parameter; 130.0 if useRate else 32.0
            if weightSocialFlow:
                poverty = C[1][:, 2]
                for i in range(W.shape[0]):
                    for j in range(W.shape[1]):
                        W[i][j] *= np.exp(-np.abs(poverty[i] - poverty[j]) / wC)

        # crime count is normalized by the total population as crime rate,
        # i.e. the crime count per 10 thousand residents
        if useRate:
            Y = np.divide(Y, popul) * 10000
            Yhat = np.divide(Yhat, popul) * 10000
    elif region == 'tract':
        W2, tractkey = generate_geographical_SpatialLag()
        Yhat_map = retrieve_crime_count(year - 1, col=crime_t, region='tract')
        Yhat = np.array([Yhat_map[k] for k in tractkey]).reshape(len(Yhat_map), 1)
        Y_map = retrieve_crime_count(year, col=crime_t, region='tract')
        Y = np.array([Y_map[k] for k in tractkey]).reshape(len(Y_map), 1)

        C = generate_corina_features(region='tract')
        C_mtx = []
        cnt = 0
        for k in tractkey:
            if k in C[1]:
                C_mtx.append(C[1][k])
            else:
                cnt += 1
                C_mtx.append([0 for i in range(7)])
        C = (C[0], np.array(C_mtx))
        # at tract level we don't normalize by population, since a tract is
        # defined as a region with around 2000 population
        if useRate:
            pass

    i = retrieve_income_features()
    e = retrieve_education_features()
    r = retrieve_race_features()

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # add intercept
    columnName = ['intercept']
    f = np.ones(f2.shape)
    lrf = np.copy(f)

    if "all" in features:
        # NOTE: this branch references f1, which is only assigned in the
        # 'sociallag' branch below; it appears to be legacy code.
        f = np.concatenate((f, f1, i[1], e[1], r[1]), axis=1)
        f = pd.DataFrame(f, columns=['social lag'] + i[0] + e[0] + r[0])
    if "sociallag" in features:
        f1 = np.dot(W, Y)
        if 'sociallag' in logFeatures:
            f = np.concatenate((f, np.log(f1)), axis=1)
        else:
            f = np.concatenate((f, f1), axis=1)
        lrf = np.concatenate((f, f1), axis=1)
        columnName += ['social lag']
    if "income" in features:
        f = np.concatenate((f, i[1]), axis=1)
        lrf = np.concatenate((f, i[1]), axis=1)
        columnName += i[0]
    if "race" in features:
        f = np.concatenate((f, r[1]), axis=1)
        lrf = np.concatenate((f, r[1]), axis=1)
        columnName += r[0]
    if "education" in features:
        f = np.concatenate((f, e[1]), axis=1)
        lrf = np.concatenate((f, e[1]), axis=1)
        columnName += e[0]
    if 'corina' in features:
        f = np.concatenate((f, C[1]), axis=1)
        lrf = np.concatenate((f, C[1]), axis=1)
        columnName += C[0]
    if 'spatiallag' in features:
        if 'spatiallag' in logFeatures:
            f = np.concatenate((f, np.log(f2)), axis=1)
        else:
            f = np.concatenate((f, f2), axis=1)
        lrf = np.concatenate((f, f2), axis=1)
        columnName += ['spatial lag']
    if 'taxiflow' in features:
        if 'taxiflow' in logFeatures:
            f = np.concatenate((f, np.log(ftaxi)), axis=1)
        else:
            f = np.concatenate((f, ftaxi), axis=1)
        lrf = np.concatenate((f, ftaxi), axis=1)
        columnName += ['taxi flow']
    if 'POIdist' in features:
        f = np.concatenate((f, poi_dist), axis=1)
        lrf = np.concatenate((f, poi_dist), axis=1)
        columnName += ['POI food', 'POI residence', 'POI travel',
                       'POI arts entertainment', 'POI outdoors recreation',
                       'POI education', 'POI nightlife', 'POI professional',
                       'POI shops', 'POI event']
    if 'temporallag' in features:
        f = np.concatenate((f, np.log(Yhat)), axis=1)
        lrf = np.concatenate((f, Yhat), axis=1)
        columnName += ['temporal lag']

    nbres = NB_training_R(f, columnName, Y, region, verboseoutput)
    print NB_training_python(f, Y)
    mae2, var2, mre2 = LR_training_python(lrf, Y, verboseoutput)
    if verboseoutput:
        print "Linear Regression MAE", mae2, "std", var2, "MRE", mre2
    else:
        print nbres
        print mae2, var2, mre2
    return np.array([[float(ele) for ele in nbres.split(" ")],
                     [mae2, var2, mre2]])
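# Example invocation with the full feature set (feature names mirror the
# branches above; the return value stacks the NB errors over the LR
# [mae, std, mre] triple):
# leaveOneOut_evaluation_onChicagoCrimeData(
#     2010, features=['corina', 'spatiallag', 'taxiflow', 'POIdist'],
#     crime_t=['total'], region='ca')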
# Script-level plotting snippets. The first loop depends on hdge, Y, and
# similarityMatrix from the surrounding script context.
for h in range(24):
    Fn = similarityMatrix(hdge[h])
    x, y, xp, yp, lp = generate_point(Fn, Y)
    f = plt.figure()
    plt.scatter(x, y, color='red')
    plt.show()

demo = generate_corina_features()
y_cnt = retrieve_crime_count(2013)
population = demo[1][:, 0].reshape(demo[1].shape[0], 1)
Y = y_cnt / population * 10000

F = getTaxiFlow(normalization="none")
x, y, xp, yp, lp = generate_point(F, Y)

plt.rc("axes", linewidth=2)
f = plt.figure(figsize=(8, 6))
plt.scatter(x, y, s=16)
plt.plot([-100, -100, 3500, -100], [3000, -3000, 0, 3000],
         linewidth=2, color='blue')
plt.scatter(xp, yp, color='red', s=28)
plt.xlabel("Taxi flow from $r_i$ to $r_j$", fontsize=20)
plt.ylabel("Crime rate difference $y_i - y_j$", fontsize=20)
# for i in range(len(lp)):
#     a.axes.annotate(lp[i], xy=(xp[i], yp[i]))