Пример #1
0
def permutation_Test_LR(Y, f):
    """Score a linear regression on (f, Y) with leave-one-out CV.

    Flattens Y to 1-D, fits `linearRegression` on every n-1 subset and
    predicts the single held-out sample.

    Returns:
        (mae, mre): mean absolute error across folds, and that error
        divided by the mean of Y (relative error).
    """
    Y = Y.reshape((len(Y), ))
    abs_errors = []
    for tr_idx, te_idx in cross_validation.LeaveOneOut(len(Y)):
        model = linearRegression(f[tr_idx], Y[tr_idx])
        prediction = model.predict(f[te_idx])
        abs_errors.append(np.abs(Y[te_idx] - prediction))

    mae = np.mean(abs_errors)
    return mae, mae / Y.mean()
Пример #2
0
def permutation_Test_LR(Y, f):
    """Leave-one-out evaluation of a linear regression.

    Each fold trains on all samples but one and predicts the left-out
    target; absolute residuals are accumulated across folds.

    Returns:
        (mae, mre): mean absolute error, and the same error expressed
        relative to the overall mean of Y.
    """
    Y = Y.reshape((len(Y),))
    loo_splits = cross_validation.LeaveOneOut(len(Y))

    residuals = []
    for train_idx, test_idx in loo_splits:
        fitted = linearRegression(f[train_idx], Y[train_idx])
        predicted = fitted.predict(f[test_idx])
        residuals.append(np.abs(Y[test_idx] - predicted))

    mean_abs_err = np.mean(residuals)
    rel_err = mean_abs_err / Y.mean()
    return mean_abs_err, rel_err
Пример #3
0
def LR_training_python(lrf, Y, verboseoutput):
    Y = Y.reshape((len(Y), ))
    loo = cross_validation.LeaveOneOut(len(Y))
    mae2 = 0
    errors2 = []
    for train_idx, test_idx in loo:
        f_train, f_test = lrf[train_idx], lrf[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        if not np.any(np.isnan(f_train)) and np.all(np.isfinite(f_train)):
            r2 = linearRegression(f_train, Y_train)
            y2 = r2.predict(f_test)
            errors2.append(np.abs(Y_test - y2))
            if verboseoutput:
                print Y_test[0], y2[0]
        else:
            print 'nan or infinite'
            pass

    mae2 = np.mean(errors2)
    var2 = np.sqrt(np.var(errors2))
    mre2 = mae2 / Y.mean()
    return mae2, var2, mre2
Пример #4
0
def LR_training_python(lrf, Y, verboseoutput):    
    Y = Y.reshape((len(Y),))
    loo = cross_validation.LeaveOneOut(len(Y))
    mae2 = 0
    errors2 = []
    for train_idx, test_idx in loo:
        f_train, f_test = lrf[train_idx], lrf[test_idx]
        Y_train, Y_test = Y[train_idx], Y[test_idx]
        if not np.any(np.isnan(f_train)) and np.all(np.isfinite(f_train)):
            r2 = linearRegression(f_train, Y_train)
            y2 = r2.predict(f_test)
            errors2.append( np.abs( Y_test - y2 ) )
            if verboseoutput:
                print Y_test[0], y2[0]
        else:
            print 'nan or infinite'
            pass

    mae2 = np.mean(errors2)
    var2 = np.sqrt( np.var(errors2) )
    mre2 = mae2 / Y.mean()
    return mae2, var2, mre2
Пример #5
0
def tenFoldCV_onChicagoCrimeData(features=['corina'],
                                 CVmethod='10Fold',
                                 P=10,
                                 NUM_ITER=20,
                                 SHUFFLE=True):
    """
    Use different years data to train the NB model
    """
    YEARS = range(2003, 2014)

    Y = []
    C = []
    FL = []
    GL = []
    T = []
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year - 1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        popul = c[1][:, 0].reshape((77, 1))

        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000

        W2 = generate_geographical_SpatialLag_ca()

        f1 = np.dot(W, Yhat)
        f2 = np.dot(W2, Yhat)

        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])

    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate((f, C), axis=1)
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate((f, FL), axis=1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']

    if SHUFFLE:
        f, Y = shuffle(f, Y)

    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p=P)

    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0

    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []

    for train_idx, test_idx in splt:
        cnt += 1
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]

        # write file for invoking NB regression in R
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")
        pd.DataFrame(f_train, columns=columnName).to_csv("f_train.csv",
                                                         sep=",",
                                                         index=False)
        pd.DataFrame(f_test, columns=columnName).to_csv("f_test.csv",
                                                        sep=",",
                                                        index=False)

        # NB regression
        nbres = subprocess.check_output(['Rscript',
                                         'nbr_eval_kfold.R']).split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs(Y_test - y1)

        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()

        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append(np.mean(ae))
        sd_mae2.append(np.std(ae))
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append(np.mean(re))
        sd_mre2.append(np.std(re))
        med_mre2 += re.tolist()

        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)

    if CVmethod == 'leaveOneOut':
        print np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
        print np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2)
        return y_gnd, y_lr
    else:
        print np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1), np.mean(
            mre1), np.mean(sd_mre1), np.median(med_mre1),
        print np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2), np.mean(
            mre2), np.mean(sd_mre2), np.median(med_mre2)

    return mae1, mae2
Пример #6
0
def tenFoldCV_onChicagoCrimeData(features=['corina'], CVmethod='10Fold', P = 10, NUM_ITER=20, SHUFFLE=True):
    """
    Use different years data to train the NB model.

    Builds per-year feature matrices for 2003-2013 and cross-validates
    two models on the pooled data: a negative-binomial regression run
    through an external R script and an in-process linear regression.

    Args:
        features: list drawn from {'corina', 'sociallag', 'spatiallag',
            'temporallag'}.
            NOTE(review): mutable default argument -- safe only because
            this function never mutates `features`.
        CVmethod: '10Fold', 'leaveOneOut' or 'leavePOut'; any other value
            leads to a NameError on `splt` below.
        P: p for LeavePOut.
        NUM_ITER: maximum number of CV iterations evaluated.
        SHUFFLE: shuffle samples before splitting.

    Returns:
        (y_gnd, y_lr) when CVmethod == 'leaveOneOut'; otherwise
        (mae1, mae2), the per-fold MAE lists for the NB and LR models.

    Side effects: writes Y_train.csv/Y_test.csv/f_train.csv/f_test.csv,
    invokes `Rscript nbr_eval_kfold.R`, and prints summary statistics.
    """
    YEARS = range(2003, 2014)
    
    Y = []
    C = []
    FL = []
    GL = []
    T = []
    for year in YEARS:
        W = generate_transition_SocialLag(year, lehd_type=0)
        Yhat = retrieve_crime_count(year-1, ['total'])
        y = retrieve_crime_count(year, ['total'])
        c = generate_corina_features()
        # first demographic column is population; 77 community areas
        popul = c[1][:,0].reshape((77,1))
        
        # crime count is normalized by the total population as crime rate
        # here we use the crime count per 10 thousand residents
        y = np.divide(y, popul) * 10000
        Yhat = np.divide(Yhat, popul) * 10000
        
        W2 = generate_geographical_SpatialLag_ca()
        
        # social-lag and spatial-lag features from last year's rates
        f1 = np.dot(W, Yhat)
        f2 = np.dot(W2, Yhat)
        
        FL.append(f1)
        GL.append(f2)
        Y.append(y)
        T.append(Yhat)
        C.append(c[1])
    
    
    Y = np.concatenate(Y, axis=0)
    columnName = ['intercept']
    f = np.ones(Y.shape)
    if 'corina' in features:
        C = np.concatenate(C, axis=0)
        f = np.concatenate( (f, C), axis=1 )
        # `c` leaks from the year loop; feature names are identical each year
        columnName += c[0]
    if 'sociallag' in features:
        FL = np.concatenate(FL, axis=0)
        f = np.concatenate( (f, FL), axis = 1)
        columnName += ['sociallag']
    if 'spatiallag' in features:
        GL = np.concatenate(GL, axis=0)
        f = np.concatenate((f, GL), axis=1)
        columnName += ['spatiallag']
    if 'temporallag' in features:
        T = np.concatenate(T, axis=0)
        f = np.concatenate((f, T), axis=1)
        columnName += ['temporallag']
    
    
    
    if SHUFFLE:
        f, Y = shuffle(f, Y)
    
    if CVmethod == '10Fold':
        splt = cross_validation.KFold(n=f.shape[0], n_folds=10, shuffle=True)
    elif CVmethod == 'leaveOneOut':
        splt = cross_validation.LeaveOneOut(n=f.shape[0])
    elif CVmethod == 'leavePOut':
        splt = cross_validation.LeavePOut(n=f.shape[0], p = P)
    
    # per-fold statistics: *1 = NB model, *2 = linear regression
    mae1 = []
    mae2 = []
    mre1 = []
    mre2 = []
    sd_mae1 = []
    sd_mae2 = []
    sd_mre1 = []
    sd_mre2 = []
    med_mae1 = []
    med_mae2 = []
    med_mre1 = []
    med_mre2 = []
    cnt = 0
    
    if CVmethod == 'leaveOneOut':
        y_gnd = []
        y_lr = []


    for train_idx, test_idx in splt:
        cnt += 1
        # cap the number of evaluated folds at NUM_ITER
        if cnt > NUM_ITER:
            break
        f_train, f_test = f[train_idx, :], f[test_idx, :]
        Y_train, Y_test = Y[train_idx, :], Y[test_idx, :]
        

        # write file for invoking NB regression in R        
        np.savetxt("Y_train.csv", Y_train, delimiter=",")
        np.savetxt("Y_test.csv", Y_test, delimiter=",")        
        pd.DataFrame(f_train, columns = columnName).to_csv("f_train.csv", sep=",", index=False)
        pd.DataFrame(f_test, columns = columnName).to_csv("f_test.csv", sep=",", index=False)
        
        # NB regression: the R script prints space-separated predictions
        nbres = subprocess.check_output( ['Rscript', 'nbr_eval_kfold.R'] ).split(" ")
        y1 = np.array([float(e) for e in nbres])
        y1 = y1.reshape((y1.shape[0], 1))
        a = np.abs( Y_test - y1 )
        
        mae1.append(np.mean(a))
        sd_mae1.append(np.std(a))
        med_mae1 += a.tolist()
        # relative error per test sample
        r = a / Y_test
        mre1.append(np.mean(r))
        sd_mre1.append(np.std(r))
        med_mre1 += r.tolist()
        
        # Linear regression
        r2 = linearRegression(f_train, Y_train)
        y2 = r2.predict(f_test)
        y2 = y2.reshape((y2.shape[0], 1))
        ae = np.abs(Y_test - y2)
        mae2.append( np.mean(ae) )
        sd_mae2.append( np.std(ae) )
        med_mae2 += ae.tolist()
        re = ae / Y_test
        mre2.append( np.mean(re))
        sd_mre2.append( np.std(re) )
        med_mre2 += re.tolist()
        
        if CVmethod == 'leaveOneOut':
            y_gnd.append(Y_test)
            y_lr.append(y2)
    
    
    if CVmethod == 'leaveOneOut':
        print np.mean(mae1), np.median(mae1), np.mean(mre1), np.median(mre1),
        print np.mean(mae2), np.median(mae2), np.mean(mre2), np.median(mre2)
        return y_gnd, y_lr
    else:
        print np.mean(mae1), np.mean(sd_mae1), np.median(med_mae1), np.mean(mre1), np.mean(sd_mre1), np.median(med_mre1),
        print np.mean(mae2), np.mean(sd_mae2), np.median(med_mae2), np.mean(mre2), np.mean(sd_mre2), np.median(med_mre2)
        
    return mae1, mae2