Пример #1
0
def main_evaluate_different_years(year):
    """
    Run the hourly leave-one-out evaluation for one year, using the average
    house price as the label.

    The house price vector is replicated for each of the 24 hours. For every
    hour the hourly taxi flow matrix is loaded from
    "/taxi-CA-h<hour>.matrix" and handed to leaveOneOut_error together with
    the 'demo' and 'poi' features. The per-hour MAE and MRE lists are pickled
    (MAE first, then MRE) into "kdd16-eval-<year>.pickle".

    Alternative labels tried here previously: hourly crime rate and average
    income.

    Input:
    year - year passed to extract_raw_samples and used in the output file name
    """
    import pickle
    # The crime label and the raw taxi flow returned here are not used: the
    # label is replaced by house prices and the flow is reloaded per hour.
    _, D, P, _, Gd = extract_raw_samples(year, crime_t=['total'])

    # use average house price as label, one identical row per hour
    Yh = retrieve_averge_house_price()
    Yh = np.repeat(Yh[:, None], 24, axis=1)
    Yh = Yh.T

    assert Yh.shape == (24, N)
    MAE = []
    MRE = []
    for h in range(24):
        Tf = getTaxiFlow(filename="/taxi-CA-h{0}.matrix".format(h))
        mae, mre = leaveOneOut_error(Yh[h, :].reshape((N, 1)), D, P, Tf,
                                     Yh[h, :].reshape((N, 1)), Gd,
                                     Yh[h, :].reshape((N, 1)),
                                     features=['demo', 'poi'],
                                     taxi_norm="bydestination")
        print(" ".join(str(v) for v in (h, mae, mre)))
        MAE.append(mae)
        MRE.append(mre)
    # h is the last hour processed (23); it is kept in the summary line so the
    # printed output matches earlier runs.
    print(" ".join(str(v) for v in (year, h, np.mean(MAE), np.mean(MRE))))
    # pickle streams must be written in binary mode to be portable
    with open("kdd16-eval-{0}.pickle".format(year), "wb") as fout:
        pickle.dump(MAE, fout)
        pickle.dump(MRE, fout)
Пример #2
0
def NB_coefficients(year=2010):
    """
    Fit the negative binomial model through the R script and return its
    coefficients.

    The feature matrix is the column-wise concatenation of the demographic
    features, the spatial lag, the taxi flow lag and the POI distribution.
    It is min-max scaled in place, written to "f.csv" (labels go to "Y.csv"),
    and 'nbr_eval.R' is invoked with the arguments 'ca' and 'coefficient'.
    The script output is parsed as space-separated floats.

    Input:
    year - year of the crime counts used as label

    Output:
    coef   - list of floats parsed from the R script output
    header - column names of the feature matrix
    """
    poi = getFourSquarePOIDistribution(useRatio=False)
    taxi_flow = getTaxiFlow(normalization="bydestination")
    geo_weight = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    demo_names, D = C[0], C[1]

    # crime count per 10,000 residents (first demographic column is used as
    # the population)
    popul = D[:, 0].reshape(D.shape[0], 1)
    Y = np.divide(Y, popul) * 10000

    spatial_lag = np.dot(geo_weight, Y)
    taxi_lag = np.dot(taxi_flow, Y)

    f = np.concatenate((D, spatial_lag, taxi_lag, poi), axis=1)
    # copy=False: the scaler is asked to rescale f itself, which is why the
    # return value of transform is not kept
    scaler = MinMaxScaler(copy=False)
    scaler.fit(f)
    scaler.transform(f)

    poi_names = [
        'POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
        'POI outdoors recreation', 'POI education', 'POI nightlife',
        'POI professional', 'POI shops', 'POI event'
    ]
    header = demo_names + ['spatiallag', 'taxiflow'] + poi_names
    frame = pd.DataFrame(f, columns=header)

    # hand the data over to R through CSV files in the working directory
    np.savetxt("Y.csv", Y, delimiter=",")
    frame.to_csv("f.csv", sep=",", index=False)

    nbres = subprocess.check_output(
        ['Rscript', 'nbr_eval.R', 'ca', 'coefficient'])
    print(nbres)

    coef = list(map(float, nbres.strip().split(" ")))
    print(coef)
    return coef, header
Пример #3
0
def leaveOneOut_Input_v4(leaveOut):
    """
    Generate the pairwise observation matrix and target vector
    Y, F

    The observations are trimmed for the leave-one-out evaluation: leaveOut
    is the 1-based CA id to be left out (the docstring of the original
    states the range 1-77).

    For every ordered pair (i, j) with i != j one row is produced:
    the demographic features of CA i followed by three lag terms
    (geographic weight, poverty-weighted LEHD flow, taxi flow), each
    multiplied by the crime rate of CA j.

    Returns:
    Yd - column vector with the crime rate of CA i for every row of F
    F  - observation matrix, one row per ordered pair; it does NOT contain
         an intercept column
    """
    _, X = generate_corina_features('ca')
    X = np.delete(X, leaveOut - 1, 0)
    popul = X[:, 0].reshape(X.shape[0], 1)
    pvt = X[:, 2]  # poverty index of each CA

    # NOTE(review): poi_dist is computed but not part of the feature vector
    # at the moment; kept so the POI variant can be switched back on.
    poi_dist = getFourSquarePOIDistribution(leaveOut)
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca(leaveOut=leaveOut)
    F_flow = generate_transition_SocialLag(year=2010,
                                           lehd_type=0,
                                           region='ca',
                                           leaveOut=leaveOut)
    F_taxi = getTaxiFlow(leaveOut=leaveOut)

    # crime count per 10,000 residents
    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.delete(Y, leaveOut - 1, 0)
    Y = np.divide(Y, popul) * 10000

    F = []
    Yd = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            wij = np.array([
                F_dist[i, j],
                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                F_taxi[i, j]
            ])
            F.append(np.concatenate((X[i], wij * Y[j][0]), 0))
            Yd.append(Y[i])
    F = np.array(F)
    # The former `np.append(F, ones, axis=1)` call was removed: np.append
    # returns a new array and its result was discarded, so it only cost a
    # full copy of F. The returned shape is unchanged; callers that need an
    # intercept have to add it themselves.
    Yd = np.array(Yd)
    Yd.resize((Yd.size, 1))

    return Yd, F
Пример #4
0
def leaveOneOut_Input_v4( leaveOut ):
    """
    Generate the pairwise observation matrix and target vector
    Y, F

    The observations are trimmed for the leave-one-out evaluation: leaveOut
    is the 1-based CA id to be left out (the docstring of the original
    states the range 1-77).

    For every ordered pair (i, j) with i != j one row is produced:
    the demographic features of CA i followed by three lag terms
    (geographic weight, poverty-weighted LEHD flow, taxi flow), each
    multiplied by the crime rate of CA j.

    Returns:
    Yd - column vector with the crime rate of CA i for every row of F
    F  - observation matrix, one row per ordered pair; it does NOT contain
         an intercept column
    """
    _, X = generate_corina_features('ca')
    X = np.delete(X, leaveOut-1, 0)
    popul = X[:,0].reshape(X.shape[0],1)
    pvt = X[:,2]    # poverty index of each CA

    # NOTE(review): poi_dist is computed but not part of the feature vector
    # at the moment; kept so the POI variant can be switched back on.
    poi_dist = getFourSquarePOIDistribution(leaveOut)
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca( leaveOut=leaveOut )
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca', leaveOut=leaveOut)
    F_taxi = getTaxiFlow(leaveOut = leaveOut)

    # crime count per 10,000 residents
    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.delete(Y, leaveOut-1, 0)
    Y = np.divide(Y, popul) * 10000

    F = []
    Yd = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            wij = np.array( [F_dist[i,j],
                            actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i,j],
                            F_taxi[i,j] ])
            F.append(np.concatenate( (X[i], wij * Y[j][0]), 0))
            Yd.append(Y[i])
    F = np.array(F)
    # The former `np.append(F, ones, axis=1)` call was removed: np.append
    # returns a new array and its result was discarded, so it only cost a
    # full copy of F. The returned shape is unchanged; callers that need an
    # intercept have to add it themselves.
    Yd = np.array(Yd)
    Yd.resize( (Yd.size, 1) )

    return Yd, F
Пример #5
0
def correlation_taxiflow_crime(flowPercentage=True, crimeRate=True):
    """
    Print the correlation matrix between the taxi-flow feature and crime.

    flowPercentage - forwarded to getTaxiFlow as usePercentage
    crimeRate      - when true, the 2010 crime counts are converted to the
                     count per 10,000 residents before correlating
    """
    flow = getTaxiFlow(usePercentage=flowPercentage)
    Y = retrieve_crime_count(2010, region='ca')
    if crimeRate:
        _, demo = generate_corina_features(region='ca')
        residents = demo[:, 0].reshape(demo.shape[0], 1)
        Y = np.divide(Y, residents) * 10000

    # taxi-flow feature next to the label, one variable per row for corrcoef
    flow_feature = np.dot(flow, Y)
    stacked = np.transpose(np.hstack((flow_feature, Y)))
    pcc = np.corrcoef(stacked)
    print(pcc)
Пример #6
0
def generateInput_v4(fout=False):
    """
    Generate the complete pairwise observation matrix.

    For every ordered pair (i, j) of CAs with i != j one row is produced:
    the demographic features of CA i followed by three lag terms
    (geographic weight, poverty-weighted LEHD flow, taxi flow), each
    multiplied by the crime rate of CA j.

    Input:
    fout - when true, F is also written to '../matlab/F.csv'

    Returns:
    Y - crime count per 10,000 residents, one row per CA
    F - observation matrix, one row per ordered pair; it does NOT contain
        an intercept column
    """
    _, X = generate_corina_features('ca')
    pvt = X[:, 2]  # poverty index of each CA
    popul = X[:, 0].reshape(X.shape[0], 1)

    # NOTE(review): poi_dist is computed but not part of the feature vector
    # at the moment; kept so the POI variant can be switched back on.
    poi_dist = getFourSquarePOIDistribution()
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca()
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca')
    F_taxi = getTaxiFlow()

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            wij = np.array([
                F_dist[i, j],
                actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i, j],
                F_taxi[i, j]
            ])
            F.append(np.concatenate((X[i], wij * Y[j, 0]), 0))
    F = np.array(F)
    # The former `np.append(F, ones, axis=1)` call was removed: np.append
    # returns a new array and its result was discarded, so it only cost a
    # full copy of F. The returned (and exported) shape is unchanged.

    if fout:
        np.savetxt('../matlab/F.csv', F, delimiter=',')

    return Y, F
def extract_raw_samples(year=2010, crime_t=['total'], crime_rate=True):
    """
    Load the raw label and every raw feature block in one go.

    All five blocks are always loaded and returned. The function is meant
    to be called once so that the data is not re-read repeatedly.

    Input:
    year        - which year to study
    crime_t     - crime types of interest, e.g. 'total'
    crime_rate  - when true the label is the crime count per 10,000
                  residents, otherwise the raw count

    Output:
    Y - crime rate / count, asserted to have shape (77, 1)
    D - demo feature
    P - POI feature
    Tf - taxi flow matrix (loaded with normalization="none")
    Gd - geo weight matrix
    """
    # Label: raw count first, converted to a rate on request
    counts = retrieve_crime_count(year, col = crime_t)

    corina = generate_corina_features()
    D = corina[1]
    residents = D[:, 0].reshape(D.shape[0], 1)
    if crime_rate:
        Y = counts / residents * 10000
    else:
        Y = counts
    assert Y.shape == (77, 1)

    # Remaining feature blocks, loaded in the same order as before
    P = getFourSquarePOIDistribution(useRatio=False)
    Tf = getTaxiFlow(normalization="none")
    Gd = generate_geographical_SpatialLag_ca()

    return Y, D, P, Tf, Gd
Пример #8
0
def extract_raw_samples(year=2010, crime_t=['total'], crime_rate=True):
    """
    Load the raw label and every raw feature block in one go.

    All five blocks are always loaded and returned. The function is meant
    to be called once so that the data is not re-read repeatedly.

    Input:
    year        - which year to study
    crime_t     - crime types of interest, e.g. 'total'
    crime_rate  - when true the label is the crime count per 10,000
                  residents, otherwise the raw count

    Output:
    Y - crime rate / count, asserted to have shape (N, 1)
    D - demo feature
    P - POI feature
    Tf - taxi flow matrix (loaded with normalization="none")
    Gd - geo weight matrix
    """
    # Label: raw count first, converted to a rate on request
    counts = retrieve_crime_count(year, col = crime_t)

    corina = generate_corina_features()
    D = corina[1]
    residents = D[:, 0].reshape(D.shape[0], 1)
    if crime_rate:
        Y = counts / residents * 10000
    else:
        Y = counts
    assert Y.shape == (N, 1)

    # Remaining feature blocks, loaded in the same order as before
    P = getFourSquarePOIDistribution(useRatio=False)
    Tf = getTaxiFlow(normalization="none")
    Gd = generate_geographical_SpatialLag_ca()

    return Y, D, P, Tf, Gd
Пример #9
0
def prepare_features(features=["poi", "taxi", "demos", "spatiallag"],
                     leaveOneOut=-1):
    """
    Build the label vector, the feature matrix and the neighbour mask for
    the 2013 crime data.

    Input:
    features    - names of the feature groups to load. The feature matrix
                  below needs all of "poi", "taxi", "demos" and
                  "spatiallag"; a ValueError is raised when one is missing.
    leaveOneOut - 1-based row to drop from the label (and forwarded to the
                  feature loaders as leaveOut); values <= 0 keep every row

    Output:
    Y - crime count per 10,000 residents as a column vector
    f - feature matrix: intercept, selected demographics, spatial lag,
        taxi flow lag and POI counts
    S - 0/1 matrix that is 1 wherever the geographic weight is positive

    Raises:
    ValueError - if a required feature group is not listed in features
    """
    # Every group is consumed unconditionally further down, so fail early
    # with a clear message instead of an UnboundLocalError after the data
    # has already been loaded.
    required = ("poi", "taxi", "demos", "spatiallag")
    missing = [name for name in required if name not in features]
    if missing:
        raise ValueError(
            "prepare_features requires the feature groups {0}; missing: {1}"
            .format(list(required), missing))

    Y = retrieve_crime_count(year=2013)
    Y = Y.reshape((-1, 1))
    if leaveOneOut > 0:
        Y = np.delete(Y, leaveOneOut - 1, 0)

    poi_dist = getFourSquareCount(leaveOut=leaveOneOut)

    F_taxi = getTaxiFlow(leaveOut=leaveOneOut, normalization="bysource")

    C = generate_corina_features(leaveOut=leaveOneOut)
    demos = [
        'total population', 'population density', 'disadvantage index',
        'residential stability', 'ethnic diversity'
    ]
    demos_idx = [C[0].index(ele) for ele in demos]
    D = C[1][:, demos_idx]

    # crime count per 10,000 residents
    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000

    W2 = generate_geographical_SpatialLag_ca(leaveOut=leaveOneOut)

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # the leading column of ones is the intercept
    f = np.ones(f2.shape)
    f = np.concatenate((f, D, f2, ftaxi, poi_dist), axis=1)

    S = np.zeros(W2.shape)
    S[W2 > 0] = 1

    return Y, f, S
Пример #10
0
def generateInput_v4(fout=False):
    """
    Generate the complete pairwise observation matrix.

    For every ordered pair (i, j) of CAs with i != j one row is produced:
    the demographic features of CA i followed by three lag terms
    (geographic weight, poverty-weighted LEHD flow, taxi flow), each
    multiplied by the crime rate of CA j.

    Input:
    fout - when true, F is also written to '../matlab/F.csv'

    Returns:
    Y - crime count per 10,000 residents, one row per CA
    F - observation matrix, one row per ordered pair; it does NOT contain
        an intercept column
    """
    _, X = generate_corina_features('ca')
    pvt = X[:,2]    # poverty index of each CA
    popul = X[:,0].reshape(X.shape[0],1)

    # NOTE(review): poi_dist is computed but not part of the feature vector
    # at the moment; kept so the POI variant can be switched back on.
    poi_dist = getFourSquarePOIDistribution()
    poi_dist = np.divide(poi_dist, popul) * 10000

    F_dist = generate_geographical_SpatialLag_ca()
    F_flow = generate_transition_SocialLag(year=2010, lehd_type=0, region='ca')
    F_taxi = getTaxiFlow()

    Y = retrieve_crime_count(year=2010, col=['total'], region='ca')
    Y = np.divide(Y, popul) * 10000

    F = []
    n = Y.size
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            wij = np.array( [F_dist[i,j],
                            actualFlowInteraction(pvt[i], pvt[j]) * F_flow[i,j],
                            F_taxi[i,j] ] )
            F.append(np.concatenate( (X[i], wij * Y[j,0]) , 0))
    F = np.array(F)
    # The former `np.append(F, ones, axis=1)` call was removed: np.append
    # returns a new array and its result was discarded, so it only cost a
    # full copy of F. The returned (and exported) shape is unchanged.

    if fout:
        np.savetxt('../matlab/F.csv', F, delimiter=',')

    return Y, F
Пример #11
0
def prepare_features(features=["poi", "taxi", "demos", "spatiallag"], leaveOneOut=-1):
    """
    Build the label vector, the feature matrix and the neighbour mask for
    the 2013 crime data.

    Input:
    features    - names of the feature groups to load. The feature matrix
                  below needs all of "poi", "taxi", "demos" and
                  "spatiallag"; a ValueError is raised when one is missing.
    leaveOneOut - 1-based row to drop from the label (and forwarded to the
                  feature loaders as leaveOut); values <= 0 keep every row

    Output:
    Y - crime count per 10,000 residents as a column vector
    f - feature matrix: intercept, selected demographics, spatial lag,
        taxi flow lag and POI counts
    S - 0/1 matrix that is 1 wherever the geographic weight is positive

    Raises:
    ValueError - if a required feature group is not listed in features
    """
    # Every group is consumed unconditionally further down, so fail early
    # with a clear message instead of an UnboundLocalError after the data
    # has already been loaded.
    required = ("poi", "taxi", "demos", "spatiallag")
    missing = [name for name in required if name not in features]
    if missing:
        raise ValueError(
            "prepare_features requires the feature groups {0}; missing: {1}"
            .format(list(required), missing))

    Y = retrieve_crime_count(year=2013)
    Y = Y.reshape((-1,1))
    if leaveOneOut > 0:
        Y = np.delete(Y, leaveOneOut-1, 0)

    poi_dist = getFourSquareCount(leaveOut=leaveOneOut)

    F_taxi = getTaxiFlow(leaveOut=leaveOneOut, normalization="bysource")

    C = generate_corina_features(leaveOut=leaveOneOut)
    demos = ['total population', 'population density', 'disadvantage index',
             'residential stability', 'ethnic diversity']
    demos_idx = [C[0].index(ele) for ele in demos]
    D = C[1][:,demos_idx]

    # crime count per 10,000 residents
    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000

    W2 = generate_geographical_SpatialLag_ca(leaveOut=leaveOneOut)

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # the leading column of ones is the intercept
    f = np.ones(f2.shape)
    f = np.concatenate( (f, D, f2, ftaxi, poi_dist), axis=1 )

    S = np.zeros(W2.shape)
    S[W2 > 0] = 1

    return Y, f, S
Пример #12
0
def line_taxiflow_crime():
    """
    Scatter-plot the taxi-flow feature against the 2010 crime rate.

    The plot is saved as 'taxi-flow-percent.pdf'. The points at row
    indices 31 and 46 are drawn again with a red outline and the figure is
    annotated with the texts '#32' and '#47'.

    Returns the taxi-flow feature values (taxi flow matrix times crime rate).
    """
    flow = getTaxiFlow(normalization='bydestination')

    Y = retrieve_crime_count(2010, col=['total'], region='ca')
    _, demo = generate_corina_features(region='ca')
    residents = demo[:, 0].reshape(demo.shape[0], 1)
    # crime count per 10,000 residents
    Y = np.divide(Y, residents) * 10000

    f1 = np.dot(flow, Y)

    plt.figure()
    plt.scatter(f1, Y)
    plt.axis([0, 6000, 0, 6000])

    # emphasise the two annotated points
    highlighted = [31, 46]
    plt.scatter(f1[highlighted], Y[highlighted],
                edgecolors='red', s=50, linewidths=2)
    plt.figtext(0.33, 0.8, '#32', fontsize='large')
    plt.figtext(0.75, 0.34, '#47', fontsize='large')

    plt.xlabel('Hyperlink by taxi flow feature value', fontsize='x-large')
    plt.ylabel('Crime rate', fontsize='x-large')

    plt.savefig('taxi-flow-percent.pdf', format='pdf')
    return f1
Пример #13
0
def coefficients_pvalue(lagsFlag,
                        tempflag="templag",
                        selfflow="selfflow",
                        itersN="10",
                        exposure="exposure",
                        year=2010,
                        lehdType="total",
                        crimeType='total'):
    """Export the model inputs and launch the R permutation test that
    yields the p-values of the Negative Binomial model coefficients.

    The demographics, spatial lag matrix, social lag matrix (diagonal set
    to zero), crime counts and, optionally, the self flow and the temporal
    lag are written as CSV files next to the R scripts (here + "/../R").
    The working directory is then changed to that folder and subPworker is
    scheduled on a process pool.

    Keyword arguments:
    lagsFlag -- social lag, spatial lag, social lag disadv, spatial lag disadv
    tempflag -- "templag" also exports the mean crime count of up to three
                preceding years (not earlier than 2003)
    selfflow -- "selfflow" also exports the diagonal of the social lag matrix
    itersN -- forwarded to subPworker
    exposure -- exposure or noexposure
    year -- the year of the crime counts and of the LEHD flow
    lehdType -- the type of flow (default "total", alternatives "lowincome"
                and "taxi")
    crimeType -- the type of predicted crime (default "total", alternative
                 "violent")

    Raises:
    ValueError -- if lehdType or crimeType is not one of the values above
    """
    # Reject unknown selectors before any data is loaded or written;
    # previously they surfaced later as an unbound W2 / Y.
    if lehdType not in ("lowincome", "total", "taxi"):
        raise ValueError("unknown lehdType: {0!r}".format(lehdType))
    if crimeType not in ("total", "violent"):
        raise ValueError("unknown crimeType: {0!r}".format(crimeType))

    C = generate_corina_features('ca')
    demo = pd.DataFrame(data=C[1], columns=C[0], dtype="float")
    W1 = generate_geographical_SpatialLag_ca()

    # the LEHD type
    if lehdType == "taxi":
        W2 = getTaxiFlow(normalization="none")
    else:
        W2 = generate_transition_SocialLag(
            year=year,
            lehd_type=4 if lehdType == "lowincome" else 0,
            region='ca',
            normalization='none')

    if selfflow == 'selfflow':
        s = [W2[i, i] for i in range(W2.shape[0])]
        np.savetxt(here + "/../R/pvalue-selfflow.csv", s)

    # the self flow is exported separately (if at all), never as a lag
    for i in range(W2.shape[0]):
        W2[i, i] = 0

    # the predicted crime type
    violentCrime = [
        'HOMICIDE', 'CRIM SEXUAL ASSAULT', 'BATTERY', 'ROBBERY', 'ARSON',
        'DOMESTIC VIOLENCE', 'ASSAULT'
    ]
    crimeCols = ['total'] if crimeType == 'total' else violentCrime
    Y = retrieve_crime_count(year=year, col=crimeCols, region='ca')
    if tempflag == "templag":
        # temporal lag: mean over the (up to) three preceding years
        ystart = (year - 3) if year - 3 >= 2003 else 2003
        tlag = [
            retrieve_crime_count(year=ytmp, col=crimeCols, region='ca')
            for ytmp in range(ystart, year)
        ]
        yt = np.mean(tlag, axis=0)
        assert yt.shape == Y.shape
        np.savetxt(here + "/../R/pvalue-templag.csv", yt)

    demo.to_csv(here + "/../R/pvalue-demo.csv", index=False)
    np.savetxt(here + "/../R/pvalue-spatiallag.csv", W1, delimiter=",")
    np.savetxt(here + "/../R/pvalue-sociallag.csv", W2, delimiter=",")
    np.savetxt(here + "/../R/pvalue-crime.csv", Y)

    # use a multiprocess Pool to run subprocess in parallel
    socialNorm = ['bydestination', 'bysource', 'bypair']
    os.chdir(here + "/../R")
    from multiprocessing import Pool, cpu_count
    # at least one worker, and an integer even under true division
    subProcessPool = Pool(max(1, cpu_count() // 2))

    # the slices restrict the grid to 'bysource', "logpop", "logpopdensty"
    for sn in socialNorm[1:2]:
        for logpop in ["logpop", "pop"][0:1]:
            for logpopden in ["logpopdensty", "popdensty"][0:1]:
                subProcessPool.apply_async(
                    subPworker,
                    (lehdType, crimeType, sn, exposure, logpop, lagsFlag,
                     itersN, logpopden, tempflag, selfflow))

    subProcessPool.close()
    subProcessPool.join()
Пример #14
0
def permutationTest_accuracy(iters, permute='taxiflow'):
    """
    Permutation test of the crime-rate models.

    use full feature set:
        Corina, spatiallag, taxiflow, POIdist
    evaluate on 2013

    at CA level

    In every iteration the selected feature group is permuted, the data is
    written to "Y.csv" / "f.csv", the NB model is evaluated through
    'nbr_eval.R' and the linear regression through permutation_Test_LR.

    Input:
    iters   - number of permutation rounds (the original note says that
              1000 rounds for one feature take roughly 30-40 minutes)
    permute - which feature group to permute: 'corina', 'spatiallag',
              'taxiflow' or 'POIdist'; any other value permutes nothing

    Output:
    nb_mae, nb_mre, lr_mae, lr_mre - one entry per iteration
    """
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=2013)

    C = generate_corina_features()
    D = C[1]

    # crime count per 10,000 residents
    popul = C[1][:, 0].reshape(C[1].shape[0], 1)
    Y = np.divide(Y, popul) * 10000

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # the column names do not change between iterations
    header = ['intercept'] + C[0] + ['spatiallag', 'taxiflow'] + [
        'POI food', 'POI residence', 'POI travel', 'POI arts entertainment',
        'POI outdoors recreation', 'POI education', 'POI nightlife',
        'POI professional', 'POI shops', 'POI event'
    ]

    nb_mae = []
    nb_mre = []
    lr_mae = []
    lr_mre = []
    for i in range(iters):
        if permute == 'corina':
            D = np.random.permutation(D)
        elif permute == 'spatiallag':
            yhat = np.random.permutation(Y)
            f2 = np.dot(W2, yhat)
        elif permute == 'taxiflow':
            yhat = np.random.permutation(Y)
            # build the lag from the permuted label; using Y here left the
            # taxi-flow feature unpermuted
            ftaxi = np.dot(F_taxi, yhat)
        elif permute == 'POIdist':
            poi_dist = np.random.permutation(poi_dist)
        f = np.ones(f2.shape)
        f = np.concatenate((f, D, f2, ftaxi, poi_dist), axis=1)
        df = pd.DataFrame(f, columns=header)

        # hand the data over to R through CSV files in the working directory
        np.savetxt("Y.csv", Y, delimiter=",")
        df.to_csv("f.csv", sep=",", index=False)

        # NB permute: the script output is split on blanks; fields 0 and 2
        # are taken as MAE and MRE
        nbres = subprocess.check_output(['Rscript', 'nbr_eval.R', 'ca'])
        ls = nbres.split(' ')
        nb_mae.append(float(ls[0]))
        nb_mre.append(float(ls[2]))

        mae2, mre2 = permutation_Test_LR(Y, f)
        lr_mae.append(mae2)
        lr_mre.append(mre2)

        if i % 10 == 0:
            print(i)

    print('{0} iterations finished.'.format(iters))
    # NOTE(review): the constants are presumably the errors of the
    # unpermuted models -- confirm before reusing them for another setup.
    print(" ".join(str(v) for v in (
        pvalue(412.305, lr_mae), pvalue(0.363, lr_mre),
        pvalue(319.86, nb_mae), pvalue(0.281, nb_mre))))
    return nb_mae, nb_mre, lr_mae, lr_mre
Пример #15
0
def leaveOneOut_evaluation_onChicagoCrimeData(year=2010,
                                              features=["all"],
                                              crime_t=['total'],
                                              flow_type=0,
                                              verboseoutput=False,
                                              region='ca',
                                              weightSocialFlow=True,
                                              useRate=True,
                                              logFeatures=[]):
    """
    Leave-one-out evaluation of the NB and linear regression models on the
    Chicago crime data.

    Two design matrices are assembled side by side: one for the negative
    binomial model, where the features listed in logFeatures (and the
    temporal lag) enter log-transformed, and one for the linear regression,
    which always gets the untransformed values.

    Input:
    year             - year of the label; year - 1 supplies the temporal lag
    features         - feature groups to use: 'sociallag', 'income', 'race',
                       'education', 'corina', 'spatiallag', 'taxiflow',
                       'POIdist', 'temporallag'. "all" is shorthand for
                       sociallag + income + education + race.
    crime_t          - crime types of interest
    flow_type        - LEHD type forwarded to generate_transition_SocialLag
    verboseoutput    - forwarded to the training routines; also selects the
                       output format (see Output)
    region           - 'ca' or 'tract'
    weightSocialFlow - at CA level, damp the social lag between two CAs by
                       exp(-|poverty_i - poverty_j| / 28)
    useRate          - at CA level, use crime count per 10,000 residents
    logFeatures      - subset of 'sociallag', 'spatiallag', 'taxiflow' to
                       log-transform for the NB model

    Output:
    With verboseoutput the linear regression errors are printed and nothing
    is returned. Otherwise a 2-row array is returned: the numbers parsed
    from the NB result string and [MAE, std, MRE] of the linear regression.

    Raises:
    ValueError - if region is neither 'ca' nor 'tract'
    """
    warnings.warn("The leave one out in nbr_eval.R is unfair")
    if region not in ('ca', 'tract'):
        raise ValueError("unknown region: {0!r}".format(region))

    # "all" used to reference the social lag before it was computed and
    # always failed; expand it into the groups it stands for so that the
    # regular code paths below build the columns and their names.
    selected = list(features)
    if "all" in selected:
        for name in ("sociallag", "income", "education", "race"):
            if name not in selected:
                selected.append(name)

    if 'sociallag' in selected:
        W = generate_transition_SocialLag(year,
                                          lehd_type=flow_type,
                                          region=region,
                                          normalization='pair')

    # add POI distribution and taxi flow
    poi_dist = getFourSquarePOIDistribution(useRatio=False, gridLevel=region)
    F_taxi = getTaxiFlow(normalization="bydestination", gridLevel=region)

    if region == 'ca':
        W2 = generate_geographical_SpatialLag_ca()

        Yhat = retrieve_crime_count(year - 1, col=crime_t)
        Y = retrieve_crime_count(year, col=crime_t)
        C = generate_corina_features()
        popul = C[1][:, 0].reshape(C[1].shape[0], 1)

        if 'sociallag' in selected and weightSocialFlow:
            # use poverty demographics to weight social lag
            wC = 28  # constant parameter
            poverty = C[1][:, 2]
            for a in range(W.shape[0]):
                for b in range(W.shape[1]):
                    W[a][b] *= np.exp(-np.abs(poverty[a] - poverty[b]) / wC)

        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        if useRate:
            Y = np.divide(Y, popul) * 10000
            Yhat = np.divide(Yhat, popul) * 10000
    else:
        W2, tractkey = generate_geographical_SpatialLag()

        Yhat_map = retrieve_crime_count(year - 1, col=crime_t, region='tract')
        Yhat = np.array([Yhat_map[k]
                         for k in tractkey]).reshape(len(Yhat_map), 1)

        Y_map = retrieve_crime_count(year, col=crime_t, region='tract')
        Y = np.array([Y_map[k] for k in tractkey]).reshape(len(Y_map), 1)

        C = generate_corina_features(region='tract')
        # tracts without demographics get an all-zero row of 7 features
        C_mtx = [C[1][k] if k in C[1] else [0] * 7 for k in tractkey]
        C = (C[0], np.array(C_mtx))

        # at tract level we don't normalize by population, since the tract is
        # defined as region with around 2000 population (useRate is ignored)

    income = retrieve_income_features()
    education = retrieve_education_features()
    race = retrieve_race_features()

    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)

    # Both design matrices start with the intercept and grow in lockstep.
    # The linear regression matrix previously was rebuilt from the NB matrix
    # at every step, so it inherited the log-transformed columns and a
    # duplicate of the last feature.
    columnName = ['intercept']
    nb_blocks = [np.ones(f2.shape)]
    lr_blocks = [np.ones(f2.shape)]

    def add_feature(values, names, nb_values=None):
        # append one feature group to both matrices; nb_values overrides
        # what the NB model sees (used for log-transformed features)
        nb_blocks.append(values if nb_values is None else nb_values)
        lr_blocks.append(values)
        columnName.extend(names)

    if "sociallag" in selected:
        f1 = np.dot(W, Y)
        add_feature(f1, ['social lag'],
                    np.log(f1) if 'sociallag' in logFeatures else None)
    if "income" in selected:
        add_feature(income[1], income[0])
    if "race" in selected:
        add_feature(race[1], race[0])
    if "education" in selected:
        add_feature(education[1], education[0])
    if 'corina' in selected:
        add_feature(C[1], C[0])
    if 'spatiallag' in selected:
        add_feature(f2, ['spatial lag'],
                    np.log(f2) if 'spatiallag' in logFeatures else None)
    if 'taxiflow' in selected:
        add_feature(ftaxi, ['taxi flow'],
                    np.log(ftaxi) if 'taxiflow' in logFeatures else None)
    if 'POIdist' in selected:
        add_feature(poi_dist, [
            'POI food', 'POI residence', 'POI travel',
            'POI arts entertainment', 'POI outdoors recreation',
            'POI education', 'POI nightlife', 'POI professional', 'POI shops',
            'POI event'
        ])
    if 'temporallag' in selected:
        # the NB model always gets the log of last year's label
        add_feature(Yhat, ['temporal lag'], np.log(Yhat))

    f = np.concatenate(nb_blocks, axis=1)
    lrf = np.concatenate(lr_blocks, axis=1)

    nbres = NB_training_R(f, columnName, Y, region, verboseoutput)
    print(NB_training_python(f, Y))
    mae2, var2, mre2 = LR_training_python(lrf, Y, verboseoutput)

    if verboseoutput:
        print(" ".join(str(v) for v in (
            "Linear Regression MAE", mae2, "std", var2, "MRE", mre2)))
    else:
        print(nbres)
        print(" ".join(str(v) for v in (mae2, var2, mre2)))
        return np.array([[float(ele) for ele in nbres.split(" ")],
                         [mae2, var2, mre2]])
Пример #16
0
def coefficients_pvalue(lagsFlag, tempflag="templag", selfflow="selfflow", itersN="10", 
                        exposure="exposure", year=2010, lehdType="total", 
                        crimeType='total'):
    """Export the model inputs and launch the R permutation test that
    yields the p-values of the Negative Binomial model coefficients.

    The demographics, spatial lag matrix, social lag matrix (diagonal set
    to zero), crime counts and, optionally, the self flow and the temporal
    lag are written as CSV files next to the R scripts (here + "/../R").
    The working directory is then changed to that folder and subPworker is
    scheduled on a process pool.

    Keyword arguments:
    lagsFlag -- social lag, spatial lag, social lag disadv, spatial lag disadv
    tempflag -- "templag" also exports the mean crime count of up to three
                preceding years (not earlier than 2003)
    selfflow -- "selfflow" also exports the diagonal of the social lag matrix
    itersN -- forwarded to subPworker
    exposure -- exposure or noexposure
    year -- the year of the crime counts and of the LEHD flow
    lehdType -- the type of flow (default "total", alternatives "lowincome"
                and "taxi")
    crimeType -- the type of predicted crime (default "total", alternative
                 "violent")

    Raises:
    ValueError -- if lehdType or crimeType is not one of the values above
    """
    # Reject unknown selectors before any data is loaded or written;
    # previously they surfaced later as an unbound W2 / Y.
    if lehdType not in ("lowincome", "total", "taxi"):
        raise ValueError("unknown lehdType: {0!r}".format(lehdType))
    if crimeType not in ("total", "violent"):
        raise ValueError("unknown crimeType: {0!r}".format(crimeType))

    C = generate_corina_features('ca')
    demo = pd.DataFrame(data=C[1], columns=C[0], dtype="float")
    W1 = generate_geographical_SpatialLag_ca()

    # the LEHD type
    if lehdType == "taxi":
        W2 = getTaxiFlow(normalization="none")
    else:
        W2 = generate_transition_SocialLag(year=year,
                                           lehd_type=4 if lehdType == "lowincome" else 0,
                                           region='ca', normalization='none')

    if selfflow == 'selfflow':
        s = [W2[i,i] for i in range(W2.shape[0])]
        np.savetxt(here + "/../R/pvalue-selfflow.csv", s)

    # the self flow is exported separately (if at all), never as a lag
    for i in range(W2.shape[0]):
        W2[i,i] = 0

    # the predicted crime type
    violentCrime = ['HOMICIDE', 'CRIM SEXUAL ASSAULT', 'BATTERY', 'ROBBERY', 
                'ARSON', 'DOMESTIC VIOLENCE', 'ASSAULT']
    crimeCols = ['total'] if crimeType == 'total' else violentCrime
    Y = retrieve_crime_count(year=year, col=crimeCols, region='ca')
    if tempflag == "templag":
        # temporal lag: mean over the (up to) three preceding years
        ystart = (year-3) if year - 3 >= 2003 else 2003
        tlag = [retrieve_crime_count(year=ytmp, col=crimeCols, region='ca')
                for ytmp in range(ystart, year)]
        yt = np.mean(tlag, axis=0)
        assert yt.shape == Y.shape
        np.savetxt(here + "/../R/pvalue-templag.csv", yt)

    demo.to_csv(here + "/../R/pvalue-demo.csv", index=False)
    np.savetxt(here + "/../R/pvalue-spatiallag.csv", W1, delimiter=",")
    np.savetxt(here + "/../R/pvalue-sociallag.csv", W2, delimiter=",")
    np.savetxt(here + "/../R/pvalue-crime.csv", Y)

    # use a multiprocess Pool to run subprocess in parallel
    socialNorm = ['bydestination', 'bysource', 'bypair']
    os.chdir(here + "/../R")
    from multiprocessing import Pool, cpu_count
    # at least one worker, and an integer even under true division
    subProcessPool = Pool(max(1, cpu_count() // 2))

    # the slices restrict the grid to 'bysource', "logpop", "logpopdensty"
    for sn in socialNorm[1:2]:
        for logpop in ["logpop", "pop"][0:1]:
            for logpopden in ["logpopdensty", "popdensty"][0:1]:
                subProcessPool.apply_async(subPworker, (lehdType, crimeType, sn, exposure, logpop, lagsFlag, itersN, logpopden, tempflag, selfflow))
    subProcessPool.close()
    subProcessPool.join()
Пример #17
0
def permutationTest_accuracy(iters, permute='taxiflow'):
    """
    Evaluate crime rate
    
    use full feature set:
        Corina, spaitallag, taxiflow, POIdist
    evaluate on 2013
    
    at CA level
    
    leave one out
    
    permutation
        permute one feature 1000 times takes roughly 30-40 minutes.
        The results are dumped as "permute-{feature}.pickle"
    """
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=2013)
    
    
    C = generate_corina_features()
    D = C[1]
    
    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000
    
     
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    
    nb_mae = []
    nb_mre = []
    lr_mae = []
    lr_mre = []
    for i in range(iters):
        if permute == 'corina':
            D = np.random.permutation(D)
        elif permute == 'spatiallag':
            yhat = np.random.permutation(Y)
            f2 = np.dot(W2, yhat)
        elif permute == 'taxiflow':            
            yhat = np.random.permutation(Y)
            ftaxi = np.dot(F_taxi, Y)
        elif permute == 'POIdist':
            poi_dist = np.random.permutation(poi_dist)
        f = np.ones(f2.shape)
        f = np.concatenate( (f, D, f2, ftaxi, poi_dist), axis=1 )
        header = ['intercept'] + C[0] + [ 'spatiallag', 'taxiflow'] + \
            ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                           'POI outdoors recreation', 'POI education', 'POI nightlife', 
                           'POI professional', 'POI shops', 'POI event']
        df = pd.DataFrame(f, columns=header)
        
        np.savetxt("Y.csv", Y, delimiter=",")
        df.to_csv("f.csv", sep=",", index=False)
        
        # NB permute
        nbres = subprocess.check_output( ['Rscript', 'nbr_eval.R', 'ca'] )
        ls = nbres.split(' ')
        nb_mae.append( float(ls[0]) )
        nb_mre.append( float(ls[2]) )

        mae2, mre2 = permutation_Test_LR(Y, f)
        lr_mae.append(mae2)
        lr_mre.append(mre2)
        
        if i % 10 == 0:
            print i
        
    print '{0} iterations finished.'.format(iters)
    print pvalue(412.305, lr_mae), pvalue(0.363, lr_mre), \
        pvalue(319.86, nb_mae), pvalue(0.281, nb_mre)
    return nb_mae, nb_mre, lr_mae, lr_mre
Пример #18
0
def leaveOneOut_evaluation_onChicagoCrimeData(year=2010, features= ["all"], 
                                              crime_t=['total'], flow_type=0, 
                                              verboseoutput=False, region='ca',
                                              weightSocialFlow=True, 
                                              useRate=True, logFeatures = []):
    """
    Generate the social lag from previous year
    use income/race/education of current year
    """
    warnings.warn("The leave one out in nbr_eval.R is unfair")
    if 'sociallag' in features:
        W = generate_transition_SocialLag(year, lehd_type=flow_type, region=region,
                                          normalization='pair')
    
    
    # add POI distribution and taxi flow
    poi_dist = getFourSquarePOIDistribution(useRatio=False, gridLevel=region)
    F_taxi = getTaxiFlow(normalization="bydestination", gridLevel=region)
        
        
    if region == 'ca':
        W2 = generate_geographical_SpatialLag_ca()
        
        Yhat = retrieve_crime_count(year-1, col = crime_t)
#        h = retrieve_health_data()
#        Y = h[0].reshape((77,1))
        Y = retrieve_crime_count(year, col = crime_t)
        C = generate_corina_features()
        popul = C[1][:,0].reshape(C[1].shape[0],1)
        
        
        if 'sociallag' in features:
            """ use poverty demographics to weight social lag """
            wC = 28 # 130.0 if useRate else 32.0     # constant parameter
            if weightSocialFlow:
                poverty = C[1][:,2]        
                for i in range(W.shape[0]):
                    for j in range (W.shape[1]):
                        W[i][j] *= np.exp( - np.abs(poverty[i] - poverty[j]) / wC )
        
        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        if useRate:
            Y = np.divide(Y, popul) * 10000
            Yhat = np.divide(Yhat, popul) * 10000
    elif region == 'tract':
        W2, tractkey = generate_geographical_SpatialLag()
    
        Yhat_map = retrieve_crime_count(year-1, col = crime_t, region='tract')
        Yhat = np.array( [Yhat_map[k] for k in tractkey] ).reshape( len(Yhat_map), 1)
        
        Y_map = retrieve_crime_count(year, col = crime_t, region='tract')
        Y = np.array( [Y_map[k] for k in tractkey] ).reshape( len(Y_map), 1 )
        
        C = generate_corina_features(region='tract')
        C_mtx = []
        cnt = 0
        
        for k in tractkey:
            if k in C[1]:
                C_mtx.append(C[1][k])
            else:
                cnt += 1
                C_mtx.append( [0 for i in range(7)] )
        
        C = ( C[0], np.array( C_mtx ) )
        
        
        # at tract level we don't normalize by population, since the tract is
        # defined as region with around 2000 population
        if useRate:
            pass
    
    
    
    i = retrieve_income_features()
    e = retrieve_education_features()
    r = retrieve_race_features()
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    
    # add intercept
    columnName = ['intercept']
    f = np.ones(f2.shape)
    lrf = np.copy(f)

    if "all" in features:
        f = np.concatenate( (f, f1, i[1], e[1], r[1]), axis=1)
        f = pd.DataFrame(f, columns=['social lag'] + i[0] + e[0] + r[0])
    if "sociallag" in features:        
        f1 = np.dot(W, Y)
        if 'sociallag' in logFeatures:
            f = np.concatenate( (f, np.log(f1)), axis=1 )
        else:
            f = np.concatenate( (f, f1), axis=1)
        lrf = np.concatenate( (f, f1), axis=1)
        columnName += ['social lag']
    if  "income" in features:
        f = np.concatenate( (f, i[1]), axis=1)
        lrf = np.concatenate( (f, i[1]), axis=1)
        columnName += i[0]
    if "race" in features:
        f = np.concatenate( (f, r[1]), axis=1)
        lrf = np.concatenate( (f, r[1]), axis=1)
        columnName += r[0]
    if "education" in features :
        f = np.concatenate( (f, e[1]), axis=1)
        lrf = np.concatenate( (f, e[1]), axis=1)
        columnName += e[0]
    if 'corina' in features :
        f = np.concatenate( (f, C[1]), axis=1)
        lrf = np.concatenate( (f, C[1]), axis=1)
        columnName += C[0]
    if 'spatiallag' in features:
        if 'spatiallag' in logFeatures:
            f = np.concatenate( (f, np.log(f2)), axis=1)
        else:
            f = np.concatenate( (f, f2), axis=1)
        lrf = np.concatenate( (f, f2), axis=1)
        columnName += ['spatial lag']
    if 'taxiflow' in features:
        if 'taxiflow' in logFeatures:
            f = np.concatenate( (f, np.log(ftaxi)), axis=1 )
        else:
            f = np.concatenate( (f, ftaxi), axis=1 )
        lrf = np.concatenate( (f, ftaxi), axis=1 )
        columnName += ['taxi flow']
    if 'POIdist' in features:
        f = np.concatenate( (f, poi_dist), axis=1 )
        lrf = np.concatenate( (f, poi_dist), axis=1 )
        columnName += ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']

    
    if 'temporallag' in features:
        f = np.concatenate( (f, np.log(Yhat)), axis=1)
        lrf = np.concatenate( (f, Yhat), axis=1)
        columnName += ['temporal lag']
        
    nbres = NB_training_R(f, columnName, Y, region, verboseoutput)
    print NB_training_python(f, Y)
    mae2, var2, mre2 = LR_training_python(lrf, Y, verboseoutput)
    
    if verboseoutput:
        print "Linear Regression MAE", mae2, "std", var2, "MRE", mre2
    else:
        print nbres
        print mae2, var2, mre2
        return np.array([[float(ele) for ele in nbres.split(" ")], [mae2, var2, mre2]])
    for h in range(24):
        Fn = similarityMatrix(hdge[h])
        x, y, xp, yp, lp = generate_point(Fn, Y)
        f = plt.figure()
        plt.scatter(x, y, color='red')
        plt.show()


# Script: scatter plot of taxi flow against crime rate difference, built
# from the 2013 crime counts.
demo = generate_corina_features()
y_cnt = retrieve_crime_count(2013)

# Column 0 of the demographic matrix is used as the population; it is
# reshaped into a single column before dividing the counts.
population = demo[1][:, 0].reshape(demo[1].shape[0], 1)
# crime count per 10 thousand residents
Y = y_cnt / population * 10000

# taxi flow requested with normalization="none"
F = getTaxiFlow(normalization="none")

# NOTE(review): generate_point is defined elsewhere. Judging by the plotting
# below, (x, y) are the regular points and (xp, yp) a highlighted subset with
# labels lp -- confirm against its definition.
x, y, xp, yp, lp = generate_point(F, Y)

plt.rc("axes", linewidth=2)
f = plt.figure(figsize=(8, 6))
plt.scatter(x, y, s=16)

# closed blue outline through (-100, 3000), (-100, -3000), (3500, 0)
plt.plot([-100, -100, 3500, -100], [3000, -3000, 0, 3000],
         linewidth=2,
         color='blue')
# highlighted points drawn larger and in red on top of the scatter
plt.scatter(xp, yp, color='red', s=28)
plt.xlabel("Taxi flow from $r_i$ to $r_j$", fontsize=20)
plt.ylabel("Crime rate difference $y_i - y_j$", fontsize=20)
# disabled: annotate each highlighted point with its label
# (`a` is not defined in this script)
#for i in range(len(lp)):
#    a.axes.annotate(lp[i], xy=(xp[i], yp[i]))
Пример #20
-1
def NB_coefficients(year=2010):
    poi_dist = getFourSquarePOIDistribution(useRatio=False)
    F_taxi = getTaxiFlow(normalization="bydestination")
    W2 = generate_geographical_SpatialLag_ca()
    Y = retrieve_crime_count(year=year)
    C = generate_corina_features()
    D = C[1]

    popul = C[1][:,0].reshape(C[1].shape[0],1)
    Y = np.divide(Y, popul) * 10000
    
    f2 = np.dot(W2, Y)
    ftaxi = np.dot(F_taxi, Y)
    
    f = np.concatenate( (D, f2, ftaxi, poi_dist), axis=1 )
    mms = MinMaxScaler(copy=False)
    mms.fit(f)
    mms.transform(f)
    header = C[0] + [ 'spatiallag', 'taxiflow'] + \
        ['POI food', 'POI residence', 'POI travel', 'POI arts entertainment', 
                       'POI outdoors recreation', 'POI education', 'POI nightlife', 
                       'POI professional', 'POI shops', 'POI event']
    df = pd.DataFrame(f, columns=header)
    
    np.savetxt("Y.csv", Y, delimiter=",")
    df.to_csv("f.csv", sep=",", index=False)
    
    # NB permute
    nbres = subprocess.check_output( ['Rscript', 'nbr_eval.R', 'ca', 'coefficient'] )
    print nbres
    
    ls = nbres.strip().split(" ")
    coef = [float(e) for e in ls]
    print coef
    return coef, header