Example #1
def generateTaxiFlow(gridLevel='ca'):
    """
    Generate taxi flow and write it to a file
    
    This is slow to run
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    n = len(cas)
    TF = np.zeros((n, n))   # taxi flow matrix
    
    ordKey = sorted(cas.keys())
    
    cnt = 0
#    import os
#    fnames = os.listdir("../data/ChicagoTaxi/")
    fnames = ['201401-03.txt']
    
    for fname in fnames:
        print "Count taxi flow in {0}".format(fname)
        with open('../data/ChicagoTaxi/{0}'.format(fname), 'rU') as fin:
            reader = csv.reader(fin, delimiter='\t' )
            header = reader.next()
            for row in reader:
                # initialize points            
                start = Point(float(row[3]), float(row[4]))
                end = Point(float(row[5]), float(row[6]))
                
                sid = -1
                eid = -1
                for key, grid in cas.items():
                    # grid keys start from 1;
                    # map the trip's start/end points into grids to get the flow
                    if grid.polygon.contains(start):
                        sid = ordKey.index(key)
                    if grid.polygon.contains(end):
                        eid = ordKey.index(key)
                    if sid != -1 and eid != -1:
                        break
                
                # skip trips that fall outside every region; otherwise the
                # -1 indices would silently wrap to the last row/column
                if sid == -1 or eid == -1:
                    continue
                TF[sid, eid] += 1
                cnt += 1
                if (cnt % 100000 == 0):
                    print "{0} trips have been added".format(cnt)
    if gridLevel == 'ca':
        np.savetxt(here + "/TF.csv", TF, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/TF_tract.csv", TF, delimiter=",")
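The inner loop above tests every region polygon for every trip, which is the main reason this is slow. Below is a minimal sketch of a faster lookup built on a spatial index; it assumes Shapely 1.x, where STRtree.query returns candidate geometries (Shapely 2.x returns integer indices instead), and build_region_index is a hypothetical helper, not part of the original code.

from shapely.strtree import STRtree

def build_region_index(cas, ordKey):
    # hypothetical helper: returns a point -> region-index lookup function
    polys = [cas[k].polygon for k in ordKey]
    idx_of = dict((id(p), i) for i, p in enumerate(polys))
    tree = STRtree(polys)
    def lookup(point):
        # query() prunes by bounding box; contains() does the exact test
        for cand in tree.query(point):
            if cand.contains(point):
                return idx_of[id(cand)]
        return -1   # point falls outside all regions
    return lookup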
Example #2
def generate_geographical_SpatialLag_ca(knearest=True, leaveOut=-1):
    """
    Generate the distance matrix for CA pairs.
    
    If knearest is true, keep only the weights of the 6 nearest neighboring CAs.
    Else, return the inverse-distance weights to all other CAs.

    leaveOut selects one CA to exclude; it takes a value from 1 to 77.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    iset = range(1, 78)
    if leaveOut > 0:
        iset.remove(leaveOut)
    for i in iset:
        centers.append(cas[i].polygon.centroid)

    W = np.zeros((len(iset), len(iset)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)

        # find n-largest (n=6)
        if knearest:
            threshold = heapq.nlargest(6, W[i, :])[-1]
            for j in range(len(W[i, :])):
                W[i][j] = 0 if W[i][j] < threshold else W[i][j]
    return W
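A usage sketch: with knearest set, each row of W keeps inverse-distance weights for (roughly) the six nearest CAs and zeroes the rest. Spatial-lag models typically row-normalize such a matrix first; a minimal sketch, assuming no CA is left out:

import numpy as np

W = generate_geographical_SpatialLag_ca(knearest=True)
row_sums = W.sum(axis=1, keepdims=True)
Wn = np.divide(W, row_sums, out=np.zeros_like(W), where=row_sums > 0)
assert Wn.shape == (77, 77)   # 77 Chicago community areas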
Example #3
def get_centroid_ca():
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        ctd = cas[i].polygon.centroid
        centers.append([ctd.x, ctd.y])
    return centers
Example #4
def generate_geographical_SpatialLag_ca(knearest=True, leaveOut=-1):
    """
    Generate the distance matrix for CA pairs.
    
    If knearest is true, keep only the weights of the 6 nearest neighboring CAs.
    Else, return the inverse-distance weights to all other CAs.

    leaveOut selects one CA to exclude; it takes a value from 1 to 77.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    iset = range(1, 78)
    if leaveOut > 0:
        iset.remove(leaveOut)
    for i in iset:
        centers.append(cas[i].polygon.centroid)
    
    W = np.zeros((len(iset), len(iset)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)

        # find n-largest (n=6)
        if knearest:
            threshold = heapq.nlargest(6, W[i, :])[-1]
            for j in range(len(W[i, :])):
                W[i][j] = 0 if W[i][j] < threshold else W[i][j]
    return W
Example #5
def generate_geographical_SpatialLag():
    """
    Generate the spatial lag from the geographically adjacent regions.
    """
    ts = Tract.createAllTractObjects()
    ordkey = sorted(ts)
    centers = [ts[k].polygon.centroid for k in ordkey]
    
    W = np.zeros((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)
    return W, ordkey
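Given the weight matrix, the spatial lag of a tract-level variable is its weighted average over the other tracts, i.e. a matrix-vector product with the row-normalized W. A minimal sketch; y here is a hypothetical variable aligned with ordkey:

import numpy as np

W, ordkey = generate_geographical_SpatialLag()
Wn = W / W.sum(axis=1, keepdims=True)   # row-normalize the weights
y = np.random.rand(len(ordkey))         # hypothetical tract-level variable
lag = Wn.dot(y)                         # spatial lag, one value per tract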
Example #6
def generate_geographical_SpatialLag():
    """
    Generate the spatial lag from the geographically adjacent regions.
    """
    ts = Tract.createAllTractObjects()
    ordkey = sorted(ts)
    centers = [ts[k].polygon.centroid for k in ordkey]

    W = np.zeros((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if src != dst:
                W[i][j] = 1 / src.distance(dst)
    return W, ordkey
Example #7
def generate_GWR_weight(h=1):
    """
    Generate the GWR weighting matrix with exponential function.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        centers.append(cas[i].polygon.centroid)
    
    gamma = np.ones((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:
                gamma[i][j] = np.exp(-0.5 * src.distance(dst)**2 / h**2)
    return gamma
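The weight above is a Gaussian kernel, w_ij = exp(-d_ij**2 / (2 * h**2)), so the bandwidth h controls how quickly a CA's influence decays with centroid distance. A small illustration on hypothetical distances:

import numpy as np

d = np.array([0.0, 0.5, 1.0, 2.0])   # hypothetical centroid distances
for h in (0.5, 1.0, 2.0):
    # larger h -> slower decay, i.e. more weight on distant CAs
    print h, np.exp(-0.5 * (d / h) ** 2)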
Example #8
def generate_GWR_weight(h=1):
    """
    Generate the GWR weighting matrix with exponential function.
    """
    cas = Tract.createAllCAObjects()
    centers = []
    for i in range(1, 78):
        centers.append(cas[i].polygon.centroid)

    gamma = np.ones((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:
                gamma[i][j] = np.exp(-0.5 * src.distance(dst)**2 / h**2)
    return gamma
Example #9
def correlation_POIdist_crime():
    """
    Calculate the correlation between the POI distribution and crime for each
    community area (CA).
    Within each CA, the crime count is the number of crimes in each tract,
    and the POI count is the number of POIs in each tract.
    """
    tracts = Tract.createAllTractObjects()
    ordkey = sorted(tracts.keys())
    CAs = {}
    for key, val in tracts.items():
        if val.CA not in CAs:
            CAs[val.CA] = [key]
        else:
            CAs[val.CA].append(key)
    
    Y = retrieve_crime_count(2010, col=['total'], region='tract')
    poi_dist = getFourSquarePOIDistribution(gridLevel='tract')
    
    
    Pearson = {}
    for cakey, calist in CAs.items():
        crime = []
        pois = []
        for tractkey in calist:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        # calculate correlation
        pois = np.array(pois)
        crime = np.array(crime)
        pearson = []
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pearson.append(np.corrcoef(r)[0, 1])
        
        Pearson[cakey] = np.nan_to_num(pearson)

    P = []
    for key in range(1, 78):
        P.append(Pearson[key])
    
    np.savetxt("../R/poi_correlation_ca.csv", P, delimiter=",")
    return np.array(P)
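The returned P is a 77-by-10 matrix: one row per community area, one column per POI category. A quick sketch for reading off the category most correlated (in absolute value) with tract-level crime in each CA, assuming the columns follow the same category order as cate_label in correlation_POI_crime below:

import numpy as np

P = correlation_POIdist_crime()
cate_label = ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
              'Outdoors & Recreation', 'College & Education', 'Nightlife', 
              'Professional', 'Shops', 'Event']
for ca in range(P.shape[0]):
    print ca + 1, cate_label[np.argmax(np.abs(P[ca]))]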
Example #10
def visualize_prediction_error(self, er, Y, title):
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        re = er[k - 1] / Y[k - 1]
        if re > 0.4:
            c = 'r'
        elif re < -0.4:
            c = 'b'
        else:
            c = 'w'
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=c))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    ax.set_title(title)
    fig.show()
Example #11
def visualize_prediction_error(self, er, Y, title):
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        re = er[k - 1] / Y[k - 1]
        if re > 0.4:
            c = 'r'
        elif re < -0.4:
            c = 'b'
        else:
            c = 'w'
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=c))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    ax.set_title(title)
    fig.show()
Example #12
def CA_clustering_with_embedding():
    ge = get_graph_embedding_features("geo_all.txt")
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=6, max_iter=100).fit(ge)
    for idx, lab in enumerate(kmeans.labels_):
        print idx+1, lab
    
    colorMaps = ['blue', 'red', 'g', 'c', 'y', 'm', 'k', 'w']
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        cak = cas[k].polygon
        ax.add_patch(descartes.PolygonPatch(cak, fc=colorMaps[kmeans.labels_[k-1]]))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    fig.show()
    
    return kmeans, cas
Example #13
def CA_clustering_with_embedding():
    ge = get_graph_embedding_features("geo_all.txt")
    from sklearn.cluster import KMeans
    kmeans = KMeans(n_clusters=4, max_iter=100).fit(ge)
    for idx, lab in enumerate(kmeans.labels_):
        print idx + 1, lab

    colorMaps = ['blue', 'red', 'g', 'c', 'y', 'm', 'k', 'w']
    cas = Tract.createAllCAObjects()
    import matplotlib.pyplot as plt
    import descartes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for k in cas:
        cak = cas[k].polygon
        ax.add_patch(
            descartes.PolygonPatch(cak, fc=colorMaps[kmeans.labels_[k - 1]]))
        ax.annotate(str(k), [cak.centroid.x, cak.centroid.y])
    ax.axis('equal')
    fig.show()

    return kmeans, cas
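Examples #12 and #13 above hard-code different cluster counts (6 and 4). A hedged sketch of choosing k by silhouette score instead, assuming ge is the embedding matrix returned by get_graph_embedding_features:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

ge = get_graph_embedding_features("geo_all.txt")
scores = {}
for k in range(2, 10):
    labels = KMeans(n_clusters=k, max_iter=100).fit(ge).labels_
    scores[k] = silhouette_score(ge, labels)
best_k = max(scores, key=scores.get)   # k with the best cluster separation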
Example #14
def correlation_POI_crime(gridLevel='tract', poiRatio=False):
    """
    Calculate the correlation between crime and each POI category.
    """
    Y = retrieve_crime_count(2010, col=['total'], region=gridLevel)
    h, D = generate_corina_features(region='ca')
    popul = D[:, 0].reshape(D.shape[0], 1)   # population column; used in the 'ca' branch below
    poi_dist = getFourSquarePOIDistribution(gridLevel=gridLevel, useRatio=poiRatio)
    cate_label = ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
                'Outdoors & Recreation', 'College & Education', 'Nightlife', 
                'Professional', 'Shops', 'Event']
    
    if gridLevel == 'tract':
        tracts = Tract.createAllTractObjects()
        ordkey = sorted(tracts.keys())

        crime = []
        pois = []
        for tractkey in ordkey:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        
        pois = np.array(pois)
        crime = np.array(crime)
    
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pcc = np.corrcoef(r)[0, 1]
            print pcc
            
    elif gridLevel == 'ca':
        Y = np.divide(Y, popul) * 10000   # crime rate per 10,000 residents
        Y = Y.reshape((len(Y),))
        poi_dist = np.transpose(poi_dist)
        
        for i in range(poi_dist.shape[0]):
            poi = np.reshape(poi_dist[i, :], Y.shape)
            r, p = pearsonr(poi, Y)
            print cate_label[i], r, p
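pearsonr returns the correlation coefficient together with a two-sided p-value, which is why the 'ca' branch can report significance while the 'tract' branch (using np.corrcoef) cannot. A minimal sanity check of that API:

from scipy.stats import pearsonr
import numpy as np

x = np.arange(10, dtype=float)
r, p = pearsonr(x, 2 * x + 1)   # perfectly linear data
assert abs(r - 1.0) < 1e-9 and p < 1e-9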
Example #15
def generate_transition_SocialLag(year=2010,
                                  lehd_type=0,
                                  region='ca',
                                  leaveOut=-1,
                                  normalization='source'):
    """
    Generate the spatial lag matrix from the transition flow connected CAs.
    
    0 - #total jobs
    1 - #jobs age under 29,
    2 - #jobs age from 30 to 54, 
    3 - #jobs above 55, 
    4 - #jobs earning under $1250/month, 
    5 - #jobs earnings from $1251 to $3333/month, 
    6 - #jobs above $3333/month,
    7 - #jobs in goods producing, 
    8 - #jobs in trade transportation, 
    9 - #jobs in other services
    """

    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())

    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0][5:])
        dstid = int(ls[1][5:])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)

    W = np.zeros((len(ts), len(ts)))
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ts)))

    # update diagonal as 0
#    if normalization != 'none':
#        for i in range(len(W)):
#            W[i,i] = 0
    # first make all self-factor 0
    assert W.dtype == "float64"

    # normalization section
    if normalization == 'source':
        # 'source' means the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, ]) - 1) < 0.0000000001 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW

    # by default, the output is the workplace-to-residence count matrix
    return W
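Under 'source' normalization every row of the returned matrix should sum to one (the assert above only spot-checks row 1). A quick full check, assuming every region has at least one outgoing trip so that no row is all zeros:

import numpy as np

W = generate_transition_SocialLag(year=2010, normalization='source')
assert np.allclose(W.sum(axis=1), 1.0)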
Example #16
def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them to a file.
    
    gridLevel can be "ca" or "tract".
    
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
    'Outdoors & Recreation', 'College & Education', 'Nightlife', 
    'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()

    ordKey = sorted(cas.keys())

    gcn = np.zeros((len(cas), 3))  # check-in count, user count, and POI count
    gcat = {}

    with open('../data/all_POIs_chicago', 'r') as fin:
        POIs = pickle.load(fin)

    with open('category_hierarchy.pickle', 'r') as f2:
        poi_cat = pickle.load(f2)

    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue

        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key), 0] += poi.checkin_count
                gcn[ordKey.index(key), 1] += poi.user_count
                gcn[ordKey.index(key), 2] += 1
                """
                Build a two-level dictionary,
                first index by region id,
                then index by category id,
                finally, the value is number of POI under the category.
                """
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1

                # break the polygon loop
                cnt += 1
                break

    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c

    hi_catgy = list(set(hi_catgy))   # note: set order is arbitrary, so the column order can vary between runs
    print hi_catgy

    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0

    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter=",")
        with open(here + "/POI_tract.pickle", 'w') as fout:
            pickle.dump(ordKey, fout)
            pickle.dump(gcat, fout)
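A minimal sketch of reading the CA-level output back, assuming generatePOIfeature(gridLevel='ca') has already been run and here points at the same directory it wrote to:

import numpy as np

poi_dist = np.loadtxt(here + "/POI_dist.csv", delimiter=",")
poi_cnt = np.loadtxt(here + "/POI_cnt.csv", delimiter=",")
assert poi_dist.shape[0] == poi_cnt.shape[0] == 77   # one row per CA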
Example #17
def generate_transition_SocialLag(year=2010, lehd_type=0, region='ca', leaveOut=-1, normalization='source'):
    """
    Generate the spatial lag matrix from the transition flow connected CAs.
    
    0 - #total jobs
    1 - #jobs age under 29,
    2 - #jobs age from 30 to 54, 
    3 - #jobs above 55, 
    4 - #jobs earning under $1250/month, 
    5 - #jobs earnings from $1251 to $3333/month, 
    6 - #jobs above $3333/month,
    7 - #jobs in goods producing, 
    8 - #jobs in trade transportation, 
    9 - #jobs in other services
    """
    
    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())
    
    
    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0])
        dstid = int(ls[1])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val                            
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)
    
    W = np.zeros((len(ts), len(ts)))
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ts)))

    # update diagonal as 0
#    if normalization != 'none':
#        for i in range(len(W)):
#            W[i,i] = 0
    # first make all self-factor 0
    assert W.dtype == "float64"
        
    # normalization section
    if normalization == 'source':
        # 'source' means the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, ]) - 1) < 0.0000000001 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW
    
    # by default, the output is the workplace-to-residence count matrix
    return W
Example #18
def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them to a file.
    
    gridLevel can be "ca" or "tract".
    
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
    'Outdoors & Recreation', 'College & Education', 'Nightlife', 
    'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()

    ordKey = sorted(cas.keys())
    
    gcn = np.zeros((len(cas), 3))   # check-in count, user count, and POI count
    gcat = {}
    
    with open('../data/all_POIs_chicago', 'r') as fin:
        POIs = pickle.load(fin)
        
    with open('category_hierarchy.pickle', 'r') as f2:
        poi_cat = pickle.load(f2)
    
    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue
        
        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key),0] += poi.checkin_count
                gcn[ordKey.index(key),1] += poi.user_count
                gcn[ordKey.index(key),2] += 1
                """
                Build a two-level dictionary,
                first index by region id,
                then index by category id,
                finally, the value is number of POI under the category.
                """
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1
                    
                # break the polygon loop
                cnt += 1
                break
    
    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c
            
    hi_catgy = list(set(hi_catgy))   # note: set order is arbitrary, so the column order can vary between runs
    print hi_catgy
    
    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:            
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0
                
    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter=",")