def generateTaxiFlow(gridLevel='ca'):
    """
    Generate the taxi flow matrix and write it to a file.

    This is slow to run.
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    n = len(cas)
    TF = np.zeros((n, n))  # taxi flow matrix
    ordKey = sorted(cas.keys())

    cnt = 0
    # import os
    # fnames = os.listdir("../data/ChicagoTaxi/")
    fnames = ['201401-03.txt']
    for fname in fnames:
        print "Count taxi flow in {0}".format(fname)
        with open('../data/ChicagoTaxi/{0}'.format(fname), 'rU') as fin:
            reader = csv.reader(fin, delimiter='\t')
            header = reader.next()  # skip the header line
            for row in reader:
                # initialize start/end points of the trip
                start = Point(float(row[3]), float(row[4]))
                end = Point(float(row[5]), float(row[6]))

                sid = -1
                eid = -1
                for key, grid in cas.items():
                    """
                    Grid keys start from 1.
                    Map the start/end point of the trip into grids to get the flow.
                    """
                    if grid.polygon.contains(start):
                        sid = ordKey.index(key)
                    if grid.polygon.contains(end):
                        eid = ordKey.index(key)
                    if sid != -1 and eid != -1:
                        break
                if sid != -1 and eid != -1:
                    # only count trips whose both endpoints fall inside the study area
                    TF[sid, eid] += 1
                cnt += 1
                if cnt % 100000 == 0:
                    print "{0} trips have been added".format(cnt)
    if gridLevel == 'ca':
        np.savetxt(here + "/TF.csv", TF, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/TF_tract.csv", TF, delimiter=",")
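# Minimal usage sketch (not part of the original pipeline; it assumes the taxi
# trip file and the CA shapefiles are in place). The helper name is hypothetical.
def _taxi_flow_usage_sketch():
    generateTaxiFlow(gridLevel='ca')  # writes TF.csv next to this module
    TF = np.loadtxt(here + "/TF.csv", delimiter=",")
    print "trips counted inside the study area:", TF.sum()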
def generate_geographical_SpatialLag_ca(knearest=True, leaveOut=-1):
    """
    Generate the distance matrix for CA pairs.

    If knearest is True, keep only the 6 nearest neighboring CAs.
    Otherwise, return the (inverse) distance to all other CAs.

    leaveOut selects one CA and removes it; it takes values from 1 to 77.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    iset = range(1, 78)
    if leaveOut > 0:
        iset.remove(leaveOut)
    for i in iset:
        centers.append(cas[i].polygon.centroid)

    W = np.zeros((len(iset), len(iset)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)
        # keep only the n largest weights in this row (n=6)
        if knearest:
            threshold = heapq.nlargest(6, W[i, :])[-1]
            for j in range(len(W[i, :])):
                W[i][j] = 0 if W[i][j] < threshold else W[i][j]
    return W
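# Illustrative sketch (an assumption, not original code): row-normalize the
# 6-nearest-neighbor weight matrix so each row sums to 1, the usual form of a
# spatial-lag operator. The helper name is hypothetical.
def _row_normalized_spatial_lag_sketch():
    W = generate_geographical_SpatialLag_ca(knearest=True)
    rs = W.sum(axis=1, keepdims=True)
    rs[rs == 0] = 1  # guard against rows with no neighbors
    return W / rs    # each nonzero row now sums to 1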
def get_centroid_ca():
    """Return the [x, y] centroid of each of the 77 CAs, ordered by CA id."""
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        ctd = cas[i].polygon.centroid
        centers.append([ctd.x, ctd.y])
    return centers
def generate_geographical_SpatialLag():
    """
    Generate the spatial lag from the geographically adjacent regions.
    """
    ts = Tract.createAllTractObjects()
    ordkey = sorted(ts)
    centers = [ts[k].polygon.centroid for k in ordkey]

    W = np.zeros((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)
    return W, ordkey
def generate_GWR_weight(h=1):
    """
    Generate the GWR weighting matrix with an exponential (Gaussian) kernel.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        centers.append(cas[i].polygon.centroid)

    gamma = np.ones((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:
                gamma[i][j] = np.exp(-0.5 * src.distance(dst)**2 / h**2)
    return gamma
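# Illustrative check (hypothetical helper, not in the original code): the GWR
# weight gamma[i][j] = exp(-0.5 * d_ij**2 / h**2) decays with distance, and a
# larger bandwidth h makes the decay slower.
def _gwr_bandwidth_sketch():
    for h in (0.5, 1, 2):
        gamma = generate_GWR_weight(h)
        # the average off-diagonal weight grows toward 1 as the bandwidth increases
        print h, gamma[0, 1:].mean()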
def correlation_POIdist_crime():
    """
    Calculate the correlation between the POI distribution and crime within
    each community area (CA).

    Within each CA, the crime count is the number of crimes in each tract,
    and the POI count is the number of POIs in each tract.
    """
    tracts = Tract.createAllTractObjects()
    ordkey = sorted(tracts.keys())
    CAs = {}
    for key, val in tracts.items():
        if val.CA not in CAs:
            CAs[val.CA] = [key]
        else:
            CAs[val.CA].append(key)

    Y = retrieve_crime_count(2010, col=['total'], region='tract')
    poi_dist = getFourSquarePOIDistribution(gridLevel='tract')

    Pearson = {}
    for cakey, calist in CAs.items():
        crime = []
        pois = []
        for tractkey in calist:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        # calculate the correlation between each POI category and crime
        pois = np.array(pois)
        crime = np.array(crime)
        pearson = []
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pearson.append(np.corrcoef(r)[0, 1])
        Pearson[cakey] = np.nan_to_num(pearson)

    P = []
    for key in range(1, 78):
        P.append(Pearson[key])
    np.savetxt("../R/poi_correlation_ca.csv", P, delimiter=",")
    return np.array(P)
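# Side note (illustrative sketch, not original code): the per-CA coefficient
# computed above with np.corrcoef is the same Pearson r that scipy's pearsonr
# (used in correlation_POI_crime below) returns, which also gives a p-value.
# The helper name is hypothetical.
def _pearson_equivalence_sketch(x, y):
    r_np = np.corrcoef(np.vstack((x, y)))[0, 1]
    r_sp, p = pearsonr(x, y)
    return r_np, r_sp, p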
def visualize_prediction_error(self, er, Y, title):
    """
    Color each CA by its relative prediction error er / Y:
    red if above 0.4, blue if below -0.4, white otherwise.
    """
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        re = er[k - 1] / Y[k - 1]
        if re > 0.4:
            c = 'r'
        elif re < -0.4:
            c = 'b'
        else:
            c = 'w'
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=c))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    ax.set_title(title)
    fig.show()
def CA_clustering_with_embedding():
    """
    Cluster the 77 CAs with KMeans on the graph embedding features and plot
    the clusters on the CA map.
    """
    ge = get_graph_embedding_features("geo_all.txt")
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=6, max_iter=100).fit(ge)
    for idx, lab in enumerate(kmeans.labels_):
        print idx + 1, lab

    colorMaps = ['blue', 'red', 'g', 'c', 'y', 'm', 'k', 'w']
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=colorMaps[kmeans.labels_[k - 1]]))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    fig.show()
    return kmeans, cas
def correlation_POI_crime(gridLevel='tract', poiRatio=False):
    """
    Calculate the correlation between crime and each POI category.
    """
    Y = retrieve_crime_count(2010, col=['total'], region=gridLevel)
    h, D = generate_corina_features(region='ca')
    popul = D[:, 0].reshape(D.shape[0], 1)
    poi_dist = getFourSquarePOIDistribution(gridLevel=gridLevel, useRatio=poiRatio)
    cate_label = ['Food', 'Residence', 'Travel', 'Arts & Entertainment',
                  'Outdoors & Recreation', 'College & Education', 'Nightlife',
                  'Professional', 'Shops', 'Event']

    if gridLevel == 'tract':
        tracts = Tract.createAllTractObjects()
        ordkey = sorted(tracts.keys())
        crime = []
        pois = []
        for tractkey in ordkey:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        pois = np.array(pois)
        crime = np.array(crime)
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pcc = np.corrcoef(r)[0, 1]
            print pcc
    elif gridLevel == 'ca':
        # crime rate per 10,000 residents
        Y = np.divide(Y, popul) * 10000
        Y = Y.reshape((len(Y),))
        poi_dist = np.transpose(poi_dist)
        for i in range(poi_dist.shape[0]):
            poi = np.reshape(poi_dist[i, :], Y.shape)
            r, p = pearsonr(poi, Y)
            print cate_label[i], r, p
def generate_transition_SocialLag(year=2010, lehd_type=0, region='ca',
                                  leaveOut=-1, normalization='source'):
    """
    Generate the spatial lag matrix from the transition flow connecting CAs.

    lehd_type selects one LEHD job-count column:
        0 - #total jobs
        1 - #jobs age under 29
        2 - #jobs age from 30 to 54
        3 - #jobs above 55
        4 - #jobs earning under $1250/month
        5 - #jobs earning from $1251 to $3333/month
        6 - #jobs above $3333/month
        7 - #jobs in goods producing
        8 - #jobs in trade transportation
        9 - #jobs in other services
    """
    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())

    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0][5:])
        dstid = int(ls[1][5:])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)

    W = np.zeros((len(ts), len(ts)))
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ts)))

    # update diagonal as 0
    # if normalization != 'none':
    #     for i in range(len(W)):
    #         W[i, i] = 0  # first make all self-factors 0
    assert W.dtype == "float64"

    # normalization section
    if normalization == 'source':
        # 'source' means the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, ]) - 1) < 1e-10 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW

    # by default, the output is the workplace-to-residence count matrix
    return W
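# Usage sketch (an assumption: the LEHD OD csv files exist under ../data/).
# 'source' transposes W before row-normalizing, 'destination' row-normalizes
# directly, so in both cases each row with any flow sums to 1; 'pair' divides
# by the total symmetrized flow, so the whole matrix sums to 0.5.
# The helper name is hypothetical.
def _social_lag_usage_sketch():
    Ws = generate_transition_SocialLag(year=2010, normalization='source')
    print Ws.sum(axis=1)[:5]  # ~1 for each row with flow
    Wp = generate_transition_SocialLag(year=2010, normalization='pair')
    print Wp.sum()            # ~0.5 by construction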
def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them out to files.

    gridLevel could be "ca" or "tract".

    The POI categories are:
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment',
     'Outdoors & Recreation', 'College & Education', 'Nightlife',
     'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    ordKey = sorted(cas.keys())

    gcn = np.zeros((len(cas), 3))  # check-in count, user count, and POI count
    gcat = {}

    with open('../data/all_POIs_chicago', 'r') as fin:
        POIs = pickle.load(fin)

    with open('category_hierarchy.pickle', 'r') as f2:
        poi_cat = pickle.load(f2)

    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue
        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key), 0] += poi.checkin_count
                gcn[ordKey.index(key), 1] += poi.user_count
                gcn[ordKey.index(key), 2] += 1
                """
                Build a two-level dictionary: first indexed by region id,
                then indexed by category id; the value is the number of
                POIs under that category.
                """
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1
                # break the polygon loop
                cnt += 1
                break

    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c
    hi_catgy = list(set(hi_catgy))
    print hi_catgy

    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0

    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter=",")
        with open(here + "/POI_tract.pickle", 'w') as fout:
            pickle.dump(ordKey, fout)
            pickle.dump(gcat, fout)
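# Usage sketch (an assumption: the pickled POI data files exist): write out the
# CA-level POI features, then read the per-category distribution and counts back.
# The helper name is hypothetical.
def _poi_feature_usage_sketch():
    generatePOIfeature(gridLevel='ca')  # writes POI_dist.csv and POI_cnt.csv
    poi_dist = np.loadtxt(here + "/POI_dist.csv", delimiter=",")
    poi_cnt = np.loadtxt(here + "/POI_cnt.csv", delimiter=",")
    print poi_dist.shape, poi_cnt.shape  # (77, #categories), (77, 3)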