Example #1
import csv
import numpy as np
from shapely.geometry import Point   # point-in-polygon tests
# Tract and here are project-local helpers.

def generateTaxiFlow(gridLevel='ca'):
    """
    Generate taxi flow and write it to a file
    
    This is slow to run
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()
    n = len(cas)
    TF = np.zeros((n, n))   # taxi flow matrix
    
    ordKey = sorted(cas.keys())
    
    cnt = 0
    # to process all months instead:
    # import os
    # fnames = os.listdir("../data/ChicagoTaxi/")
    fnames = ['201401-03.txt']
    
    for fname in fnames:
        print "Count taxi flow in {0}".format(fname)
        with open('../data/ChicagoTaxi/{0}'.format(fname), 'rU') as fin:
            reader = csv.reader(fin, delimiter='\t' )
            header = reader.next()
            for row in reader:
                # initialize points            
                start = Point(float(row[3]), float(row[4]))
                end = Point(float(row[5]), float(row[6]))
                
                sid = -1
                eid = -1
                for key, grid in cas.items():
                    # grid keys start from 1; map the start/end points of the
                    # trip into grids to get the flow
                    if grid.polygon.contains(start):
                        sid = ordKey.index(key)
                    if grid.polygon.contains(end):
                        eid = ordKey.index(key)
                    if sid != -1 and eid != -1:
                        break

                # skip trips whose endpoints fall outside every region;
                # otherwise index -1 would silently count into the last cell
                if sid == -1 or eid == -1:
                    continue
                TF[sid, eid] += 1
                cnt += 1
                if cnt % 100000 == 0:
                    print("{0} trips have been added".format(cnt))
    if gridLevel == 'ca':
        np.savetxt(here + "/TF.csv", TF, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/TF_tract.csv", TF, delimiter=",")
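The docstring warns that this is slow: ordKey.index(key) rescans the sorted key list for every containment hit. A minimal sketch (with stand-in region keys, not the real ids) of the dictionary lookup that could replace it:

# Build the key -> row-index map once; in generateTaxiFlow, `cas`
# would come from Tract.createAllCAObjects().
cas = {3: None, 1: None, 2: None}        # stand-in region keys
ordKey = sorted(cas.keys())
key2idx = {k: i for i, k in enumerate(ordKey)}
assert key2idx[2] == ordKey.index(2)     # same answer, O(1) per lookup

Inside the trip loop, sid = key2idx[key] would then replace sid = ordKey.index(key).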
Example #2
import numpy as np
# Tract is a project-local helper.

def generate_geographical_SpatialLag():
    """
    Generate the spatial lag matrix from inverse distances between all
    pairs of tract centroids.
    """
    ts = Tract.createAllTractObjects()
    ordkey = sorted(ts)
    centers = [ts[k].polygon.centroid for k in ordkey]

    W = np.zeros((len(centers), len(centers)))
    for i, src in enumerate(centers):
        for j, dst in enumerate(centers):
            if i != j:   # leave the diagonal at 0
                W[i, j] = 1 / src.distance(dst)
    return W, ordkey
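The returned W holds raw inverse distances. If a row-stochastic spatial-lag matrix is wanted (each row summing to 1), a small numpy sketch of the usual row normalization, using a toy matrix in place of the real W:

import numpy as np

W = np.array([[0.0, 0.5, 0.2],
              [0.5, 0.0, 1.0],
              [0.2, 1.0, 0.0]])           # toy inverse-distance weights
W_row = W / W.sum(axis=1, keepdims=True)  # divide each row by its sum
assert np.allclose(W_row.sum(axis=1), 1.0)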
Example #3
import numpy as np
# Tract, retrieve_crime_count, and getFourSquarePOIDistribution are
# project-local helpers.

def correlation_POIdist_crime():
    """
    Calculate the correlation between the POI distribution and crime
    within each community area (CA).
    Within each CA, the crime count is the number of crimes per tract,
    and the POI count is the number of POIs per tract.
    """
    tracts = Tract.createAllTractObjects()
    ordkey = sorted(tracts.keys())
    CAs = {}
    for key, val in tracts.items():
        if val.CA not in CAs:
            CAs[val.CA] = [key]
        else:
            CAs[val.CA].append(key)
    
    Y = retrieve_crime_count(2010, col=['total'], region='tract')
    poi_dist = getFourSquarePOIDistribution(gridLevel='tract')

    Pearson = {}
    for cakey, calist in CAs.items():
        crime = []
        pois = []
        for tractkey in calist:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        # calculate correlation
        pois = np.array(pois)
        crime = np.array(crime)
        pearson = []
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pearson.append(np.corrcoef(r)[0, 1])

        Pearson[cakey] = np.nan_to_num(pearson)

    P = []
    for key in range(1, 78):   # Chicago has 77 community areas, keyed 1..77
        P.append(Pearson[key])
    
    np.savetxt("../R/poi_correlation_ca.csv", P, delimiter=",")
    return np.array(P)
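For reference, np.corrcoef on a 2xN stack returns the 2x2 correlation matrix, so the [0, 1] entry taken above is Pearson's r between the two rows. A toy check:

import numpy as np

pois_i = np.array([1.0, 2.0, 3.0, 4.0])
crime = np.array([2.0, 4.0, 6.0, 8.0])        # perfectly correlated with pois_i
r = np.vstack((pois_i, crime))
assert np.isclose(np.corrcoef(r)[0, 1], 1.0)  # off-diagonal entry is Pearson's r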
Example #4
import numpy as np
from scipy.stats import pearsonr
# retrieve_crime_count, generate_corina_features, and
# getFourSquarePOIDistribution are project-local helpers.

def correlation_POI_crime(gridLevel='tract', poiRatio=False):
    """
    Calculate the correlation between crime and each POI category.
    """
    Y = retrieve_crime_count(2010, col=['total'], region=gridLevel)
    h, D = generate_corina_features(region='ca')
    popul = D[:, 0].reshape(D.shape[0], 1)
    poi_dist = getFourSquarePOIDistribution(gridLevel=gridLevel, useRatio=poiRatio)
    cate_label = ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
                'Outdoors & Recreation', 'College & Education', 'Nightlife', 
                'Professional', 'Shops', 'Event']
    
    if gridLevel == 'tract':
        tracts = Tract.createAllTractObjects()
        ordkey = sorted(tracts.keys())

        crime = []
        pois = []
        for tractkey in ordkey:
            crime.append(Y[tractkey])
            pois.append(poi_dist[ordkey.index(tractkey)])
        
        pois = np.array(pois)
        crime = np.array(crime)
    
        for i in range(pois.shape[1]):
            r = np.vstack((pois[:, i], crime))
            pcc = np.corrcoef(r)[0, 1]
            print(pcc)
            
    elif gridLevel == 'ca':
        Y = np.divide(Y, popul) * 10000   # crime rate per 10,000 residents
        Y = Y.reshape((len(Y),))
        poi_dist = np.transpose(poi_dist)
        
        for i in range(poi_dist.shape[0]):
            poi = np.reshape(poi_dist[i, :], Y.shape)
            r, p = pearsonr(poi, Y)
            print(cate_label[i], r, p)
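In the 'ca' branch, raw counts are turned into rates per 10,000 residents before correlating. A toy sketch of that normalization with made-up numbers:

import numpy as np

Y = np.array([[50.0], [120.0]])           # made-up crime counts per CA
popul = np.array([[10000.0], [30000.0]])  # made-up populations
rate = (np.divide(Y, popul) * 10000).reshape((len(Y),))
print(rate)                               # [50. 40.] crimes per 10,000 residents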
Example #5
import numpy as np
# Tract and here are project-local helpers.

def generate_transition_SocialLag(year=2010,
                                  lehd_type=0,
                                  region='ca',
                                  leaveOut=-1,
                                  normalization='source'):
    """
    Generate the spatial lag matrix from the transition flow connected CAs.
    
    0 - #total jobs
    1 - #jobs age under 29,
    2 - #jobs age from 30 to 54, 
    3 - #jobs above 55, 
    4 - #jobs earning under $1250/month, 
    5 - #jobs earnings from $1251 to $3333/month, 
    6 - #jobs above $3333/month,
    7 - #jobs in goods producing, 
    8 - #jobs in trade transportation, 
    9 - #jobs in other services
    """

    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())

    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0][5:])   # strip the 5-character prefix from region ids
        dstid = int(ls[1][5:])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)

    W = np.zeros((len(ts), len(ts)))
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ts)))

    # optionally zero out the diagonal (self-flow) before normalizing:
    # if normalization != 'none':
    #     for i in range(len(W)):
    #         W[i, i] = 0
    assert W.dtype == "float64"

    # normalization section
    if normalization == 'source':
        # 'source' means the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, :]) - 1) < 1e-10 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW

    # by default, the output is the workplace-to-residence count matrix
    return W
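To see what the 'source' branch does, a toy 2x2 flow matrix: the transpose makes rows index the residence side, and dividing by the row sums turns each row into a probability distribution over destinations:

import numpy as np

W = np.array([[2.0, 6.0],
              [1.0, 3.0]])                # toy workplace-to-residence counts
Ws = np.transpose(W)
Ws = Ws / np.sum(Ws, axis=1, keepdims=True)
assert np.allclose(Ws.sum(axis=1), 1.0)   # each residence row sums to 1

Note that an all-zero row after the transpose would divide by zero here; the function above does not guard against that.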
Example #6
import pickle
import numpy as np
from shapely.geometry import Point   # point-in-polygon tests
# Tract and here are project-local helpers.

def generatePOIfeature(gridLevel='ca'):
    """
    Generate POI features and write them out to a file.
    
    gridLevel can be "ca" or "tract".
    
    ['Food', 'Residence', 'Travel', 'Arts & Entertainment', 
    'Outdoors & Recreation', 'College & Education', 'Nightlife', 
    'Professional', 'Shops', 'Event']
    """
    if gridLevel == 'ca':
        cas = Tract.createAllCAObjects()
    elif gridLevel == 'tract':
        cas = Tract.createAllTractObjects()

    ordKey = sorted(cas.keys())

    gcn = np.zeros((len(cas), 3))  # check-in count, user count, and POI count
    gcat = {}

    with open('../data/all_POIs_chicago', 'rb') as fin:
        POIs = pickle.load(fin)

    with open('category_hierarchy.pickle', 'rb') as f2:
        poi_cat = pickle.load(f2)

    cnt = 0
    for poi in POIs.values():
        loc = Point(poi.location.lon, poi.location.lat)
        if poi.cat in poi_cat:
            cat = poi_cat[poi.cat]
        else:
            continue

        for key, grid in cas.items():
            if grid.polygon.contains(loc):
                gcn[ordKey.index(key), 0] += poi.checkin_count
                gcn[ordKey.index(key), 1] += poi.user_count
                gcn[ordKey.index(key), 2] += 1
                """
                Build a two-level dictionary,
                first index by region id,
                then index by category id,
                finally, the value is number of POI under the category.
                """
                if key in gcat:
                    if cat in gcat[key]:
                        gcat[key][cat] += 1
                    else:
                        gcat[key][cat] = 1
                else:
                    gcat[key] = {}
                    gcat[key][cat] = 1

                # break the polygon loop
                cnt += 1
                break

    s = 0
    hi_catgy = []
    for catdict in gcat.values():
        hi_catgy += catdict.keys()
        for c in catdict.values():
            s += c

    hi_catgy = list(set(hi_catgy))
    print(hi_catgy)

    gdist = np.zeros((len(cas), len(hi_catgy)))
    for key, distDict in gcat.items():
        for idx, cate in enumerate(hi_catgy):
            if cate in distDict:
                gdist[ordKey.index(key), idx] = distDict[cate]
            else:
                gdist[ordKey.index(key), idx] = 0

    if gridLevel == 'ca':
        np.savetxt(here + "/POI_dist.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt.csv", gcn, delimiter=",")
    elif gridLevel == 'tract':
        np.savetxt(here + "/POI_dist_tract.csv", gdist, delimiter=",")
        np.savetxt(here + "/POI_cnt_tract.csv", gcn, delimiter=",")
        with open(here + "/POI_tract.pickle", 'wb') as fout:
            pickle.dump(ordKey, fout)
            pickle.dump(gcat, fout)
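A minimal sketch of reading the tract-level pickle back (assuming the same project-local here path): the two objects were dumped sequentially, so they are loaded in the same order.

import pickle

with open(here + "/POI_tract.pickle", 'rb') as fin:
    ordKey = pickle.load(fin)   # sorted region keys
    gcat = pickle.load(fin)     # region id -> {category: POI count}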
Example #7
import numpy as np
# Tract and here are project-local helpers.

def generate_transition_SocialLag(year=2010, lehd_type=0, region='ca',
                                  leaveOut=-1, normalization='source'):
    """
    Generate the spatial lag matrix from the transition flows connecting CAs.

    The lehd_type column index selects:
    0 - #total jobs
    1 - #jobs age under 29,
    2 - #jobs age from 30 to 54, 
    3 - #jobs above 55, 
    4 - #jobs earning under $1250/month, 
    5 - #jobs earnings from $1251 to $3333/month, 
    6 - #jobs above $3333/month,
    7 - #jobs in goods producing, 
    8 - #jobs in trade transportation, 
    9 - #jobs in other services
    """
    
    if region == 'ca':
        ts = Tract.createAllCAObjects()
        fn = here + '/../data/chicago_ca_od_{0}.csv'.format(year)
    elif region == 'tract':
        ts = Tract.createAllTractObjects()
        fn = here + '/../data/chicago_od_tract_{0}.csv'.format(year)
    ordkey = sorted(ts.keys())

    listIdx = {}
    fin = open(fn)
    for line in fin:
        ls = line.split(",")
        srcid = int(ls[0])   # ids are plain integers here (no prefix, unlike Example #5)
        dstid = int(ls[1])
        val = int(ls[2 + lehd_type])
        if srcid in listIdx:
            listIdx[srcid][dstid] = val
        else:
            listIdx[srcid] = {}
            listIdx[srcid][dstid] = val
    fin.close()

    if leaveOut > 0:
        ordkey.remove(leaveOut)
    
    W = np.zeros((len(ts), len(ts)))
    for srcid in ordkey:
        if srcid in listIdx:
            sdict = listIdx[srcid]
            if leaveOut in sdict:
                del sdict[leaveOut]
            for dstid, val in sdict.items():
                W[ordkey.index(srcid)][ordkey.index(dstid)] = val
        else:
            W[ordkey.index(srcid)] = np.zeros((1, len(ts)))

    # optionally zero out the diagonal (self-flow) before normalizing:
    # if normalization != 'none':
    #     for i in range(len(W)):
    #         W[i, i] = 0
    assert W.dtype == "float64"
        
    # normalization section
    if normalization == 'source':
        # 'source' means the residence side
        W = np.transpose(W)
        sW = np.sum(W, axis=1, keepdims=True)
        W = W / sW
        assert abs(np.sum(W[1, :]) - 1) < 1e-10 and W.dtype == "float64"
    elif normalization == 'destination':
        # 'destination' means the workplace side
        sW = np.sum(W, axis=1)
        sW = sW.reshape((len(sW), 1))
        W = W / sW
    elif normalization == 'pair':
        sW = W + np.transpose(W)
        sW = np.sum(sW)
        W = W / sW
    
    # by default, the output is the workplace-to-residence count matrix
    return W
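A minimal usage sketch (parameter values are illustrative):

# Workplace-to-residence flows for 2010, total jobs (lehd_type=0),
# at the community-area level, normalized over the residence side.
W = generate_transition_SocialLag(year=2010, lehd_type=0,
                                  region='ca', normalization='source')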