Пример #1
0
def extracredit1():
    reader = csv.reader(open("data-classification-prob1.csv", 'rU'),
                        delimiter=',')
    data = []
    for row in reader:
        data.append([float(i) for i in row])
    d = np.array(data)
    learner = KNNLearner(27)
    learner = train(d[0:1000, :], learner)
    d = []
    step = 0.01
    for x1 in np.arange(-1, 1, step):
        for x2 in np.arange(-1, 1, step):
            d.append([x1, x2])
    d = np.array(d)
    sample = []
    for j, i in enumerate(d):
        if (j % 1000 == 0):
            print j
        sample.append(learner.query(i))
    fig = p.figure()
    ax = p3.Axes3D(fig)
    ax.scatter(d[:, 0], d[:, 1], sample, c='r', marker='o')
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    ax.set_zlabel('Y')
    pp = PdfPages('3d_million_class_actual.pdf')
    pp.savefig()
    pp.close()
    p.show()
Пример #2
0
def test(filename):
    """Compare KNN and Random Forest learners on *filename*.

    Reads a train/test split from the CSV via readCsvData(), then for
    k = 1..100 trains a KNNLearner and a RandomForestLearner on the
    training set and records the out-of-sample RMS error and correlation
    coefficient of each.  Writes two comparison plots:
    'RMSComparison.pdf' and 'CorrComparison.pdf'.
    """
    Xtrain, Ytrain, Xtest, Ytest = readCsvData(filename)
    Y = Ytest[:, 0]  # actual out-of-sample targets

    # One slot per k value (k = 1..100).  Unused timing/in-sample arrays
    # from the original draft have been removed.
    knnCorrelation = np.zeros([100])
    knnRmsError = np.zeros([100])
    kArray = np.zeros([100])

    rfCorrelation = np.zeros([100])
    rfRmsError = np.zeros([100])

    # KNN Learner and RF Learner, k vary from 1 to 100
    for k in range(1, 101):
        kArray[k - 1] = k

        # KNN
        learner = KNNLearner(k)
        learner.addEvidence(Xtrain, Ytrain)
        knnTest = learner.query(Xtest)
        knnY = knnTest[:, -1]

        # RMS Error (out-of-sample) and correlation coefficient
        knnRmsError[k - 1] = calRMS(knnY, Y)
        knnCorrelation[k - 1] = calCorrcoef(knnY, Y)

        # RF
        learner = RandomForestLearner(k)
        learner.addEvidence(Xtrain, Ytrain)
        rfTest = learner.query(Xtest)
        rfY = rfTest[:, -1]

        # RMS Error (out-of-sample) and correlation coefficient
        rfRmsError[k - 1] = calRMS(rfY, Y)
        rfCorrelation[k - 1] = calCorrcoef(rfY, Y)

    linename = ['KNN Learner', 'Random Forest Learner']
    createComparisonPlot('K value', 'RMS Error', kArray, knnRmsError,
                         rfRmsError, 'RMSComparison.pdf', linename)
    createComparisonPlot('K value', 'Correlation', kArray, knnCorrelation,
                         rfCorrelation, 'CorrComparison.pdf', linename)
Пример #3
0
def main():
    """Load the two sample data sets, split them into features/targets,
    and run a single 3-NN query against the first classification row."""
    # read the data
    class_data = read_data("data-classification-prob.csv")
    ripple_data = read_data("data-ripple-prob.csv")

    # convert to numpy arrays
    class_arr = np.array(class_data, dtype=np.float)
    ripple_arr = np.array(ripple_data, dtype=np.float)

    # first two columns are features (x), third column is the target (y)
    class_x, class_y = class_arr[:, :2], class_arr[:, 2]
    ripple_x, ripple_y = ripple_arr[:, :2], ripple_arr[:, 2]

    # create the knn learner, train it, and query the first sample
    learner = KNNLearner(k=3)
    learner.addEvidence(class_x, class_y)
    Y = learner.query(class_x[0, :])
Пример #4
0
def wrap_up(symbol, start_date, end_date, out=False):
    """Train a 3-NN learner on *symbol*'s data and attach predictions.

    Parameters
    ----------
    symbol : ticker passed to get_data()
    start_date, end_date : training date range
    out : when False (default), predict in-sample on the training frame;
          when True, predict out-of-sample on the fixed 2010 test range.

    Returns the training (or test) feature DataFrame with a 'predY'
    column holding the learner's predictions.
    """
    dates = pd.date_range(start_date, end_date)
    data = get_data(symbol, dates, addSPY=False)
    data = data.dropna()
    vector_n = (5, 5, 5)  # window sizes used by create_train_data
    df_data = create_train_data(data, vector_n)
    trainX = np.array(df_data[['X_1', 'X_2', 'X_3']])
    trainY = np.array(df_data[['Y']])

    # Fixed out-of-sample period: calendar year 2010.
    dates_test = pd.date_range('2010-01-01', '2010-12-31')
    test_data = get_data(symbol, dates_test, addSPY=False)
    test_data = test_data.dropna()
    df_test_data = create_train_data(test_data, vector_n)
    testX = np.array(df_test_data[['X_1', 'X_2', 'X_3']])

    learner = KNNLearner(3)
    #    learner = LinRegLearner()
    learner.addEvidence(trainX, trainY)  # train it

    # evaluate in sample or out of sample
    if not out:
        predY = learner.query(trainX)  # get the predictions
        df_data['predY'] = predY
        return df_data
    else:
        predY = learner.query(testX)
        df_test_data['predY'] = predY
        return df_test_data
Пример #5
0
def extracredit2():
    """Train a 27-NN learner on the ripple data set (600 train / 400
    test rows), report in- and out-of-sample RMS error and correlation,
    and plot actual vs. predicted values in 2-D and 3-D.

    Saves the 3-D comparison scatter to '3d_ripple.pdf'.
    """
    reader = csv.reader(open("data-ripple-prob.csv", 'rU'), delimiter=',')
    data = []
    for row in reader:
        data.append([float(cell) for cell in row])
    d = np.array(data)

    learner = KNNLearner(27)
    learner = train(d[0:600, :], learner)
    insample = test(d[0:600, :], learner)
    outsample = test(d[600:1000, :], learner)

    print "RMS Error for KNN"
    print(rmserror(outsample, d[600:1000, 2]))
    # NOTE(review): this label says RKNN but the value is the in-sample
    # error of the same KNN learner -- confirm which figure was intended.
    print "RMS Error for RKNN"
    print(rmserror(insample, d[0:600, 2]))

    # (a stray debug `print i` that dumped the leaked CSV-parsing loop
    # variable was removed here)
    print "Error"
    print(rmserror(insample, d[0:600, 2]))
    print(rmserror(outsample, d[600:1000, 2]))
    print(np.corrcoef(outsample, d[600:1000, 2])[0][1])

    # 2-D scatter: actual vs. predicted out-of-sample values.
    plt.scatter(
        d[600:1000, 2],
        outsample,
    )

    # 3-D scatter: actual (red) vs. predicted (blue) over the test inputs.
    fig = p.figure()
    ax = p3.Axes3D(fig)
    ax.scatter(d[600:1000, 0],
               d[600:1000, 1],
               d[600:1000, 2],
               c='r',
               marker='o',
               label="Actual")
    ax.scatter(d[600:1000, 0],
               d[600:1000, 1],
               outsample,
               c='b',
               marker='o',
               label="Predicted")
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    ax.set_zlabel('Y')
    red_patch = mpatches.Patch(color='red', label='Actual')
    blue_patch = mpatches.Patch(color='blue', label='Predicted')
    plt.legend(handles=[red_patch, blue_patch])
    pp = PdfPages('3d_ripple.pdf')
    pp.savefig()
    pp.close()
    p.show()
Пример #6
0
def knnlearner_test(filenames):
    """Sweep k = 1..100 for a KNNLearner on each CSV in *filenames*.

    Each file is split into 600 training rows and the remainder for
    testing; the test-set RMSE and correlation are collected per k.

    NOTE(review): the result lists are re-created for every filename and
    the return statement sits outside the loop, so only the LAST file's
    series are returned -- confirm this is intended.
    """
    for filename in filenames:
        rmse_series=[]
        covariance_series=[]
        for i in xrange(1,101):
            knnlearner=KNNLearner(k=i)
            get_set = knnlearner.getflatcsv(filename)
            # First 600 rows train / rest test; first 2 columns are X.
            get_set_60pr,get_set_40pr = numpy.split(get_set,[600])
            (X,Y) = numpy.split(get_set,[2],axis=1)
            (XTrain,XTest) = numpy.split(X,[600])
            (Ytrain,YTest) = numpy.split(Y,[600])
            knnlearner.build_hash(get_set_60pr)
            knnlearner.addEvidence(XTrain,Ytrain)
            query_X = numpy.array(XTest)
            (XY_return,Y_return) = knnlearner.query(XTest)
            Y_Test = np.squeeze(np.asarray(YTest))
            Y_Return = numpy.array(Y_return)
            rmse_series.append(get_rmse(Y_Test,Y_Return))
            covariance_series.append(get_correlation(Y_Test,Y_Return))
    return (rmse_series,covariance_series)
def main():

    isBagging = True

    file1 = "data-classification-prob.csv"
    file2 = "data-ripple-prob.csv"
    knn_rms1 = np.zeros((101, 1))
    knn_corrcoef1 = np.zeros((101, 1))

    knn_rms2 = np.zeros((101, 1))
    knn_corrcoef2 = np.zeros((101, 1))

    randomForest_rms1 = np.zeros((101, 1))
    randomForest_corrcoef1 = np.zeros((101, 1))

    randomForest_rms2 = np.zeros((101, 1))
    randomForest_corrcoef2 = np.zeros((101, 1))

    randomForestBagging_corrcoef1 = np.zeros((101, 1))
    randomForestBagging_corrcoef2 = np.zeros((101, 1))

    randomForestBagging_rms1 = np.zeros((101, 1))
    randomForestBagging_rms2 = np.zeros((101, 1))

    k = np.arange(1, 101)

    for i in range(1, 3):
        if i == 1:
            print 'Starting with dataset 1....'
            file = file1
        else:
            print 'Starting with dataset 2....'
            file = file2

        data = getflatcsv(file)

        XTrain = data[:(len(data) * 0.6), :(len(data[0]) - 1)]
        XTest = data[(len(data) * 0.6):, :(len(data[0]) - 1)]

        YTrain = data[:(len(data) * 0.6), -1]
        YTest = data[(len(data) * 0.6):, -1]
        if i == 1:
            YTest1 = YTest
        else:
            YTest2 = YTest

        for j in range(1, 3):
            if j == 1:
                print 'Calling KNNLearner for dataset %d...' % i
                for count in range(1, 101):
                    knnLearner = KNNLearner(k=count)
                    train_t = knnLearner.addEvidence(XTrain, YTrain)
                    Y, test_t = knnLearner.query(XTest)
                    if i == 1:
                        knn_rms1[count,
                                 0], knn_corrcoef1[count,
                                                   0] = getstats(Y, YTest)
                    else:
                        knn_rms2[count,
                                 0], knn_corrcoef2[count,
                                                   0] = getstats(Y, YTest)
            elif j == 2:
                print 'Calling RandomForestLearner for dataset %d...' % i
                for count in range(1, 101):
                    if isBagging:
                        randomForestLearner = RandomForestLearner(
                            k=count, isBagging=True)
                        randomForestLearner.addEvidence(XTrain, YTrain)
                        Y = randomForestLearner.query(XTest)
                        if i == 1:
                            randomForestBagging_rms1[
                                count, 0], randomForestBagging_corrcoef1[
                                    count, 0] = getstats(Y, YTest)
                            print count, randomForestBagging_corrcoef1[count,
                                                                       0]
                        else:
                            randomForestBagging_rms2[
                                count, 0], randomForestBagging_corrcoef2[
                                    count, 0] = getstats(Y, YTest)
                            print count, randomForestBagging_corrcoef2[count,
                                                                       0]

                    randomForestLearner = RandomForestLearner(k=count,
                                                              isBagging=False)
                    randomForestLearner.addEvidence(XTrain, YTrain)
                    Y = randomForestLearner.query(XTest)
                    if i == 1:
                        randomForest_rms1[count, 0], randomForest_corrcoef1[
                            count, 0] = getstats(Y, YTest)
                        print count, randomForest_corrcoef1[count, 0]
                    else:
                        randomForest_rms2[count, 0], randomForest_corrcoef2[
                            count, 0] = getstats(Y, YTest)
                        print count, randomForest_corrcoef2[count, 0]

    if isBagging:
        plt.ylabel('Random Forest:Corelation Coefficient - dataset 1')
        plt.xlabel('K')
        plt.legend(['Without Bagging', 'With Bagging'])
        plt.plot(k, randomForest_corrcoef1[1:], k,
                 randomForestBagging_corrcoef1[1:])
        plt.savefig('bagging_corr1.png')
        plt.close()

        plt.ylabel('Random Forest:Corelation Coefficient - dataset 2')
        plt.xlabel('K')
        plt.legend(['Without Bagging', 'With Bagging'])
        plt.plot(k, randomForest_corrcoef2[1:], k,
                 randomForestBagging_corrcoef2[1:])
        plt.savefig('bagging_corr2.png')
        plt.close()

    plt.ylabel('Corelation Coefficient - dataset 1')
    plt.xlabel('K')
    plt.legend(['KNN', 'Random Forest'])
    plt.plot(k, knn_corrcoef1[1:], k, randomForest_corrcoef1[1:])
    plt.savefig('corr1.png')
    plt.close()

    plt.ylabel('Corelation Coefficient - dataset 2')
    plt.xlabel('K')
    plt.legend(['KNN', 'Random Forest'])
    plt.plot(k, knn_corrcoef2[1:], k, randomForest_corrcoef2[1:])
    plt.savefig('corr2.png')
    plt.close()

    plt.ylabel('RMS - dataset 1')
    plt.xlabel('K')
    plt.legend(['KNN', 'Random Forest'])
    plt.plot(k, knn_rms1[1:], k, randomForest_rms1[1:])
    plt.savefig('Compare_RMS1.png')
    plt.close()

    plt.ylabel('RMS - dataset 2')
    plt.xlabel('K')
    plt.legend(['KNN', 'Random Forest'])
    plt.plot(k, knn_rms2[1:], k, randomForest_rms2[1:])
    plt.savefig('Compare_RMS2.png')
    plt.close()
Пример #8
0
# Sweep k = 1..100: at each k train a RandomForest and a KNN learner and
# record the out-of-sample RMS error of both, then plot the two curves.
# NOTE(review): Xtrain/Ytrain/Xtest/Ytest, rmsArr_test, rmsArr_k, d and
# the train/test/rmserror helpers are defined earlier in the original
# file (not visible in this chunk).
number = 101
kArr = [i for i in range(1, number)]
for k in kArr:
    print "Training a new learner"
    learner = RandomForest(k)
    learner.addEvidence(Xtrain, Ytrain)
    Y_ltest = []
    Y_ltrain = []
    # Query the forest one test point at a time.
    for x in Xtest:
        Y_ltest.append(learner.query(x))
    # for x in Xtrain:
    # 	Y_ltrain.append(learner.query(x))
    #rmsArr_train.append(math.sqrt(np.mean((np.array(Y_ltrain) - np.array(Ytrain)) ** 2)))
    rmsArr_test.append(
        math.sqrt(np.mean((np.array(Y_ltest) - np.array(Ytest))**2)))
    # KNN on the raw matrix d: rows 0..599 train, 600..999 test.
    # NOTE(review): d[600:1000:] selects the same whole rows as
    # d[600:1000] -- confirm test() expects full rows here.
    learner = KNNLearner(k)
    learner = train(d[0:600, :], learner)
    outsample = test(d[600:1000:], learner)
    rmsArr_k.append(rmserror(outsample, d[600:1000, 2]))

# Plot both RMS-vs-k curves and save them to a single PDF page.
pp = PdfPages('Data-Classification-K-RandomForest.pdf')
plt.clf()
plt.ylabel('RMS')
plt.xlabel('k')
isE = plt.plot(range(1, number), rmsArr_k, 'g-', label='KNN')
osE = plt.plot(range(1, number), rmsArr_test, 'r-', label='Random Forest')
red_patch = mpatches.Patch(color='red', label='Random Forest')
green_patch = mpatches.Patch(color='green', label='KNN')
plt.legend(handles=[red_patch, green_patch])
plt.title("Data-Classification-K-RandomForest")
pp.savefig()
Пример #9
0
def test(filename):
    """Evaluate a KNNLearner (k = 1..50) and a LinRegLearner on *filename*.

    Per k, records the average train/query time per instance, the
    out-of-sample and in-sample RMS errors, and the correlation
    coefficient, then writes line/comparison/scatter plots as PDFs.
    Finally trains a linear-regression learner on the same split and
    prints its timing, RMS error and correlation.
    """
    Xtrain, Ytrain, Xtest, Ytest = readCsvData(filename)
    Y = Ytest[:, 0]  # actual out-of-sample targets
    sampleY = Ytrain[:, 0]  # actual in-sample targets
    bestY = np.zeros([Ytest.shape[0]])

    # One slot per k value (k = 1..50).
    trainTime = np.zeros([50])
    queryTime = np.zeros([50])
    correlation = np.zeros([50])
    rmsError = np.zeros([50])
    kArray = np.zeros([50])
    inSampleRmsErr = np.zeros([50])

    #KNN Learner, k vary from 1 to 50
    for k in range(1, 51):
        kArray[k - 1] = k

        learner = KNNLearner(k)

        knnTrainStime = time.time()
        learner.addEvidence(Xtrain, Ytrain)
        knnTrainEtime = time.time()

        knnQueryStime = time.time()
        knnTest = learner.query(Xtest)
        knnQueryEtime = time.time()
        knnY = knnTest[:, -1]

        #Avg Train Time per Instance
        avgKnnTrainTime = (knnTrainEtime - knnTrainStime) / Xtrain.shape[0]
        #Avg Query Time per Instance
        avgKnnQueryTime = (knnQueryEtime - knnQueryStime) / Xtest.shape[0]

        #RMS Error(out-of-sample)
        knnRMS = calRMS(knnY, Y)

        #In-sample RMS Error
        inSampleTest = learner.query(Xtrain)
        inSampleY = inSampleTest[:, -1]
        insampleRMS = calRMS(inSampleY, sampleY)

        #Correlation Coefficient
        knnCorr = calCorrcoef(knnY, Y)

        trainTime[k - 1] = avgKnnTrainTime
        queryTime[k - 1] = avgKnnQueryTime
        correlation[k - 1] = knnCorr
        rmsError[k - 1] = knnRMS
        inSampleRmsErr[k - 1] = insampleRMS

        # Keep the predictions for the empirically best k of each data
        # set (27 and 3) for the best-k scatter plot below.
        # NOTE(review): these magic k values presumably came from a prior
        # sweep -- confirm against the accompanying report.
        if ((filename == 'data-classification-prob.csv') and (k == 27)):
            print k
            bestY = knnY
        elif ((filename == 'data-ripple-prob.csv') and (k == 3)):
            print k
            bestY = knnY

    createLinePlot('K value', 'Avg Train Time/Instance', kArray, trainTime,
                   'traintime.pdf', 'Average Train Time')
    createLinePlot('K value', 'Avg Query Time/Instance', kArray, queryTime,
                   'querytime.pdf', 'Average Query Time')
    createLinePlot('K value', 'Correlation', kArray, correlation,
                   'correlation.pdf',
                   'Correlation Coefficient of Predicted Y versus Actual Y')
    createLinePlot('K value', 'RMS Error', kArray, rmsError, 'rms.pdf',
                   'RMS Error between Predicted Y versus Actual Y')

    linename = ['Out-of-Sample Data', 'In-Sample Data']
    createComparisonPlot('K value', 'RMS Error', kArray, rmsError,
                         inSampleRmsErr, 'comparison.pdf', linename)

    createScatterPlot('Predicted Y', 'Actual Y', bestY, Y, 'bestK.pdf')

    #Linear Regression Learner
    learner = LinRegLearner()

    linTrainStime = time.time()
    learner.addEvidence(Xtrain, Ytrain)
    linTrainEtime = time.time()

    linQueryStime = time.time()
    linTest = learner.query(Xtest)
    linQueryEtime = time.time()
    linY = linTest[:, -1]

    #Avg Train Time per Instance
    avgLinTrainTime = (linTrainEtime - linTrainStime) / Xtrain.shape[0]
    #Avg Query Time per Instance
    avgLinQueryTime = (linQueryEtime - linQueryStime) / Xtest.shape[0]
    print avgLinTrainTime, avgLinQueryTime

    #RMS Error
    linRMS = calRMS(linY, Y)
    print linRMS

    #Correlation Coefficient
    linCorr = calCorrcoef(linY, Y)
    print linCorr
Пример #10
0
def extracredit3():
    print "Entering function"
    outsampleError = []
    outsampleError_R = []
    insampleError = []
    insampleError_R = []
    reader = csv.reader(open("data-classification-prob1.csv", 'rU'),
                        delimiter=',')
    data = []
    for row in reader:
        data.append([float(i) for i in row])
    d = np.array(data)
    number = 51
    for i in range(1, number):
        learner = KNNLearner(i)
        learner = train(d[0:600, :], learner)
        insample = test(d[0:600, :], learner)
        outsample = test(d[600:1000, :], learner)
        insampleError.append(rmserror(insample, d[0:600, 2]))
        outsampleError.append(rmserror(outsample, d[600:1000, 2]))

        learner = RKNNLearner(i)
        learner = train(d[0:600, :], learner)
        insample = test(d[0:600, :], learner)
        outsample = test(d[600:1000, :], learner)
        insampleError_R.append(rmserror(insample, d[0:600, 2]))
        outsampleError_R.append(rmserror(outsample, d[600:1000, 2]))
        k = 0
        for x, y in zip(outsample, d[600:1000, 2]):
            if (x == y):
                k += 1
        accuracy_plot.append(k / 400.0)
        #print i ,"has accuracy", k/400.0

    plt.plot(range(1, number), accuracy_plot)
    plt.show()
    print len(insampleError)
    pp = PdfPages('RKNN_vs_CKNN.pdf')
    isE = plt.plot(range(1, number),
                   insampleError,
                   'g-',
                   label='Insample error for KNN')
    osE = plt.plot(range(1, number),
                   outsampleError,
                   'r-',
                   label='Outsample Error for KNN')
    red_patch = mpatches.Patch(color='red', label='outsampleError for KNN')
    green_patch = mpatches.Patch(color='green', label='insampleError for KNN')
    isE_R = plt.plot(range(1, number),
                     insampleError_R,
                     'b-',
                     label='Insample error for classsifier KNN ')
    osE_R = plt.plot(range(1, number),
                     outsampleError_R,
                     'y-',
                     label='Outsample Error for classsifier KNN')
    blue_patch = mpatches.Patch(color='blue',
                                label='insampleError for classsifier KNN')
    yellow_patch = mpatches.Patch(color='yellow',
                                  label='outsampleError for classsifier KNN')

    plt.legend(handles=[red_patch, green_patch, blue_patch, yellow_patch])
    plt.ylabel("Error")
    plt.xlabel("K")
    plt.title("Vanilla KNN vs Classifier KNN")
    #plt.show()
    pp.savefig()
    pp.close()
    Best = min(enumerate(outsampleError_R), key=itemgetter(1))[0]
    print "Best ", Best
    print np.min(outsampleError_R)
    plt.show()
Пример #11
0
def test(filename):
    """Evaluate a KNNLearner (k = 1..50) and a LinRegLearner on *filename*.

    Per k, records the average train/query time per instance, the
    out-of-sample and in-sample RMS errors, and the correlation
    coefficient, then writes line/comparison/scatter plots as PDFs.
    Finally trains a linear-regression learner on the same split and
    prints its timing, RMS error and correlation.
    """
    Xtrain, Ytrain, Xtest, Ytest = readCsvData(filename)
    Y = Ytest[:,0]  # actual out-of-sample targets
    sampleY = Ytrain[:,0]  # actual in-sample targets
    bestY = np.zeros([Ytest.shape[0]])

    # One slot per k value (k = 1..50).
    trainTime = np.zeros([50])
    queryTime = np.zeros([50])
    correlation = np.zeros([50])
    rmsError = np.zeros([50])
    kArray = np.zeros([50])
    inSampleRmsErr = np.zeros([50])

    #KNN Learner, k vary from 1 to 50
    for k in range(1, 51):
        kArray[k-1] = k

        learner = KNNLearner(k)

        knnTrainStime = time.time()
        learner.addEvidence(Xtrain, Ytrain)
        knnTrainEtime = time.time()

        knnQueryStime = time.time()
        knnTest = learner.query(Xtest)
        knnQueryEtime = time.time()
        knnY = knnTest[:,-1]

        #Avg Train Time per Instance
        avgKnnTrainTime = (knnTrainEtime - knnTrainStime)/Xtrain.shape[0]
        #Avg Query Time per Instance
        avgKnnQueryTime = (knnQueryEtime - knnQueryStime)/Xtest.shape[0]

        #RMS Error(out-of-sample)
        knnRMS = calRMS(knnY, Y)

        #In-sample RMS Error
        inSampleTest = learner.query(Xtrain)
        inSampleY = inSampleTest[:,-1]
        insampleRMS = calRMS(inSampleY, sampleY)

        #Correlation Coefficient
        knnCorr = calCorrcoef(knnY, Y)

        trainTime[k-1] = avgKnnTrainTime
        queryTime[k-1] = avgKnnQueryTime
        correlation[k-1] = knnCorr
        rmsError[k-1] = knnRMS
        inSampleRmsErr[k-1] = insampleRMS

        # Keep the predictions for the empirically best k of each data
        # set (27 and 3) for the best-k scatter plot below.
        # NOTE(review): these magic k values presumably came from a prior
        # sweep -- confirm against the accompanying report.
        if((filename == 'data-classification-prob.csv') and (k == 27)):
            print k
            bestY = knnY
        elif((filename == 'data-ripple-prob.csv') and (k == 3)):
            print k
            bestY = knnY

    createLinePlot('K value', 'Avg Train Time/Instance', kArray, trainTime, 'traintime.pdf', 'Average Train Time')
    createLinePlot('K value', 'Avg Query Time/Instance', kArray, queryTime, 'querytime.pdf', 'Average Query Time')
    createLinePlot('K value', 'Correlation', kArray, correlation, 'correlation.pdf', 'Correlation Coefficient of Predicted Y versus Actual Y')
    createLinePlot('K value', 'RMS Error', kArray, rmsError, 'rms.pdf', 'RMS Error between Predicted Y versus Actual Y')

    linename = ['Out-of-Sample Data', 'In-Sample Data']
    createComparisonPlot('K value', 'RMS Error', kArray, rmsError, inSampleRmsErr, 'comparison.pdf', linename)

    createScatterPlot('Predicted Y', 'Actual Y', bestY, Y, 'bestK.pdf')

    #Linear Regression Learner
    learner = LinRegLearner()

    linTrainStime = time.time()
    learner.addEvidence(Xtrain, Ytrain)
    linTrainEtime = time.time()

    linQueryStime = time.time()
    linTest = learner.query(Xtest)
    linQueryEtime = time.time()
    linY = linTest[:,-1]

    #Avg Train Time per Instance
    avgLinTrainTime = (linTrainEtime - linTrainStime)/Xtrain.shape[0]
    #Avg Query Time per Instance
    avgLinQueryTime = (linQueryEtime - linQueryStime)/Xtest.shape[0]
    print avgLinTrainTime, avgLinQueryTime

    #RMS Error
    linRMS = calRMS(linY, Y)
    print linRMS

    #Correlation Coefficient
    linCorr = calCorrcoef(linY, Y)
    print linCorr
Пример #12
0
def main():
    
    isBagging = True
    
    file1 = "data-classification-prob.csv"
    file2 = "data-ripple-prob.csv"
    knn_rms1 = np.zeros((101,1))
    knn_corrcoef1 = np.zeros((101,1))
    
    knn_rms2 = np.zeros((101,1))
    knn_corrcoef2 = np.zeros((101,1))
    
    randomForest_rms1 = np.zeros((101,1))
    randomForest_corrcoef1 = np.zeros((101,1))
    
    randomForest_rms2 = np.zeros((101,1))
    randomForest_corrcoef2 = np.zeros((101,1))
    
    randomForestBagging_corrcoef1 = np.zeros((101,1))
    randomForestBagging_corrcoef2 = np.zeros((101,1))
    
    randomForestBagging_rms1 = np.zeros((101,1))
    randomForestBagging_rms2 = np.zeros((101,1))
    
    k = np.arange(1,101)
    
    for i in range(1,3):
        if i == 1:
            print 'Starting with dataset 1....'
            file = file1
        else:
            print 'Starting with dataset 2....'
            file = file2
    
        data = getflatcsv(file)

        XTrain = data[:(len(data)*0.6),:(len(data[0])-1)]
        XTest = data[(len(data)*0.6):,:(len(data[0])-1)]
    
        YTrain = data[:(len(data)*0.6),-1]
        YTest = data[(len(data)*0.6):,-1]
        if i == 1:
            YTest1 = YTest
        else:
            YTest2 = YTest
            
        for j in range(1,3):
            if j == 1:
                print 'Calling KNNLearner for dataset %d...' % i
                for count in range(1,101):
                    knnLearner = KNNLearner(k=count)
                    train_t = knnLearner.addEvidence(XTrain, YTrain)
                    Y, test_t = knnLearner.query(XTest)
                    if i == 1:
                        knn_rms1[count,0], knn_corrcoef1[count,0] = getstats(Y, YTest)
                    else:
                        knn_rms2[count,0], knn_corrcoef2[count,0] = getstats(Y, YTest)
            elif j == 2:
                print 'Calling RandomForestLearner for dataset %d...' % i
                for count in range(1,101):
                    if isBagging:
                        randomForestLearner = RandomForestLearner(k=count, isBagging = True)
                        randomForestLearner.addEvidence(XTrain, YTrain)
                        Y = randomForestLearner.query(XTest)
                        if i == 1:
                            randomForestBagging_rms1[count,0], randomForestBagging_corrcoef1[count,0] = getstats(Y, YTest)
                            print count, randomForestBagging_corrcoef1[count,0]
                        else:
                            randomForestBagging_rms2[count,0], randomForestBagging_corrcoef2[count,0] = getstats(Y, YTest)
                            print count, randomForestBagging_corrcoef2[count,0]
                
                    randomForestLearner = RandomForestLearner(k=count, isBagging = False)
                    randomForestLearner.addEvidence(XTrain, YTrain)
                    Y = randomForestLearner.query(XTest)
                    if i == 1:
                        randomForest_rms1[count,0], randomForest_corrcoef1[count,0] = getstats(Y, YTest)
                        print count, randomForest_corrcoef1[count,0]
                    else:
                        randomForest_rms2[count,0], randomForest_corrcoef2[count,0] = getstats(Y, YTest)
                        print count, randomForest_corrcoef2[count,0]
                
    if isBagging:
        plt.ylabel('Random Forest:Corelation Coefficient - dataset 1')
        plt.xlabel('K')
        plt.legend(['Without Bagging','With Bagging'])
        plt.plot(k, randomForest_corrcoef1[1:], k, randomForestBagging_corrcoef1[1:]);
        plt.savefig('bagging_corr1.png')
        plt.close()
                                
        plt.ylabel('Random Forest:Corelation Coefficient - dataset 2')
        plt.xlabel('K')
        plt.legend(['Without Bagging','With Bagging'])
        plt.plot(k, randomForest_corrcoef2[1:], k, randomForestBagging_corrcoef2[1:]);
        plt.savefig('bagging_corr2.png')
        plt.close()

    plt.ylabel('Corelation Coefficient - dataset 1')
    plt.xlabel('K')
    plt.legend(['KNN','Random Forest'])
    plt.plot(k, knn_corrcoef1[1:], k, randomForest_corrcoef1[1:]);
    plt.savefig('corr1.png')
    plt.close()

    plt.ylabel('Corelation Coefficient - dataset 2')
    plt.xlabel('K')
    plt.legend(['KNN','Random Forest'])
    plt.plot(k, knn_corrcoef2[1:], k, randomForest_corrcoef2[1:]);
    plt.savefig('corr2.png')
    plt.close()

    plt.ylabel('RMS - dataset 1')
    plt.xlabel('K')
    plt.legend(['KNN','Random Forest'])
    plt.plot(k, knn_rms1[1:], k, randomForest_rms1[1:])
    plt.savefig('Compare_RMS1.png')
    plt.close()

    plt.ylabel('RMS - dataset 2')
    plt.xlabel('K')
    plt.legend(['KNN','Random Forest'])
    plt.plot(k, knn_rms2[1:], k, randomForest_rms2[1:])
    plt.savefig('Compare_RMS2.png')
    plt.close()
Пример #13
0
def knnlearner_test(filenames):
    """Sweep k = 1..50 for a KNNLearner on each CSV in *filenames*.

    Each file is split into 600 training rows and the remainder for
    testing.  Per k the train/query times, out-of-sample RMSE, in-sample
    RMSE and correlation are collected.  The best k (lowest test RMSE)
    is re-run to produce a predicted-vs-actual scatter plot, and all
    series are written out as PDF graphs named after the input file.
    """
    for filename in filenames:
        train_time =[]
        query_time =[]
        rmse_series=[]
        rmse_series_insample=[]
        covariance_series=[]
        for i in xrange(1,51):
            knnlearner=KNNLearner(k=i)
            get_set = knnlearner.getflatcsv(filename)
            # First 600 rows train / rest test; first 2 columns are X.
            get_set_60pr,get_set_40pr = numpy.split(get_set,[600])
            (X,Y) = numpy.split(get_set,[2],axis=1)
            (XTrain,XTest) = numpy.split(X,[600])
            (Ytrain,YTest) = numpy.split(Y,[600])
            knnlearner.build_hash(get_set_60pr)
            # Time training and querying separately via the Timer
            # context manager (defined elsewhere in the original file).
            with Timer() as t:
                knnlearner.addEvidence(XTrain,Ytrain)
            train_time.append(t.interval)
            query_X = numpy.array(XTest)
            with Timer() as t:
                (XY_return,Y_return) = knnlearner.query(XTest)
            query_time.append(t.interval)
            Y_Test = np.squeeze(np.asarray(YTest))
            Y_Return = numpy.array(Y_return)
            rmse_series.append(get_rmse(Y_Test,Y_Return))
            # In-sample error: query the learner on its own training set.
            (XY_return_insample,Y_return_insample) = knnlearner.query(XTrain)
            Y_Train = np.squeeze(np.asarray(Ytrain))
            Y_return_insample = numpy.array(Y_return_insample)
            rmse_series_insample.append(get_rmse(Y_Train,Y_return_insample))
            covariance_series.append(get_correlation(Y_Test,Y_Return))
        # Re-train at the k with the lowest test RMSE for the scatter plot.
        min_rmse = min(float(i) for i in rmse_series)
        k_index = rmse_series.index(min_rmse)
        print "best k = ",k_index+1," for ",filename
        knnlearner_scatter = KNNLearner(k=k_index+1)
        get_set = knnlearner_scatter.getflatcsv(filename)
        get_set_60pr,get_set_40pr = numpy.split(get_set,[600])
        (X,Y) = numpy.split(get_set,[2],axis=1)
        (XTrain,XTest) = numpy.split(X,[600])
        (Ytrain,YTest) = numpy.split(Y,[600])
        knnlearner_scatter.build_hash(get_set_60pr)
        knnlearner_scatter.addEvidence(XTrain,Ytrain)
        (XY_return,Y_return) = knnlearner_scatter.query(XTest)
        Y_Test = np.squeeze(np.asarray(YTest))
        Y_Return = numpy.array(Y_return)
        scatter(Y_Return,Y_Test,"scatterplot("+filename+")(for bestk).pdf")
        get_graph(numpy.arange(1,51),train_time,"K","Train time in seconds","KNN_Train_time("+filename+").pdf",4)
        get_graph(numpy.arange(1,51),query_time,"K","Query time in seconds","KNN_Query_time("+filename+").pdf",4)
        get_graph(numpy.arange(1,51),rmse_series,"K","RMSE Error","RMSEvsk("+filename+").pdf")
        get_graph(numpy.arange(1,51),covariance_series,"K","Covariance Coefficeint","Covariance Coeff vs K("+filename+").pdf")
        get_graph_two_plots(numpy.arange(1,51),rmse_series_insample,rmse_series,"K","RMSE","insample_error_vs_outsample_error("+filename+").pdf")