def test_knn():
    """Build KNN order files for k = 10, 20, 30, 40 and 50.

    Two RL data sets are built first (default range, then 2010-2011); their
    results are not used by the calls below. build_orders is then invoked
    once per k with the filename 'knn_<k>.csv'.
    """
    train_data = build_RL_data()
    test_data = build_RL_data(sd=dt.datetime(2010, 1, 1),
                              ed=dt.datetime(2011, 12, 31))

    for neighbours in (10, 20, 30, 40, 50):
        build_orders(learner=knn.KNNLearner(k=neighbours),
                     filename='knn_%d.csv' % neighbours)
def chartOne():
    """Chart ML4T-399 prices against training and in-sample predicted Y values.

    A 3-nearest-neighbour learner is trained on the 2007-12-31..2009-12-31
    data and queried on the same X values. Both the training Y ('YVal')
    and the predictions are applied to the price as price * (1 + y)
    (presumably fractional returns -- confirm against getAllData) and the
    three series are handed to plot_chart.
    """
    symbols = ['ML4T-399']
    dates = pd.date_range('2007-12-31', '2009-12-31')
    prices = get_data(symbols, dates)
    df = getAllData(symbols, dates)

    # Training inputs and targets
    xTrain = getXTrain(df)
    yTrain = getYTrain(df)

    # Fit, then query on the very same rows (in-sample)
    learner = knn.KNNLearner(k=3)
    learner.addEvidence(xTrain.values, yTrain.values)
    in_sample = pd.DataFrame(data=learner.query(xTrain.values),
                             index=prices.index,
                             columns=['PRED'])

    base_price = prices['ML4T-399']
    training_series = base_price * (1 + df['YVal'])
    predicted_series = base_price * (1 + in_sample)

    chart = pd.DataFrame(index=prices.index,
                         columns=['PRICES', 'TRAINING_Y', 'PREDICTED_Y'])
    chart['PRICES'] = base_price
    chart['TRAINING_Y'] = training_series
    chart['PREDICTED_Y'] = predicted_series
    plot_chart(chart, "PRICES, PREDICTED, AND TRAINING DATA FOR ML4T-399")
Exemplo n.º 3
0
def applyStrategy():
    """Train a 3-NN learner on 2008-2009 IBM features and trade its 2010 predictions.

    The learner is fitted on the "BB", "Momentum" and "Volatility" columns
    against "Training Y", then queried on the 2010 features. Starting at
    row 19, each day's prediction drives a one-position state machine
    (flat / long / short); every order is dated on the following row and
    marked on the chart with a vertical line:

        flat  + positive prediction -> BUY  100 (green, enter long)
        long  + negative prediction -> SELL 100 (black, exit long)
        flat  + negative prediction -> SELL 100 (red, enter short)
        short + positive prediction -> BUY  100 (black, exit short)

    The orders are written as CSV (Date, Symbol, Order, Shares) to a fixed
    path and the stock price is plotted. Returns nothing.
    """
    stock = "IBM"
    feature_cols = ["BB", "Momentum", "Volatility"]

    train_features = get_Features('2008-1-1', '2009-12-31', stock)
    learner = knn.KNNLearner(3)
    learner.addEvidence(train_features[feature_cols].values,
                        train_features["Training Y"])

    test_features = get_Features('2010-1-1', '2010-12-31', stock)
    test_features["Predicted Y"] = learner.query(
        test_features[feature_cols].values)

    test_features[["Volatility"]].plot()
    plt.show()

    # Read predictions positionally from a plain array: this is what the
    # old `.ix[i, ...]` lookups did, without relying on the removed indexer.
    predicted = test_features["Predicted Y"].values
    trade_dates = test_features.index

    orders = []

    def place(day, side, color):
        """Record one 100-share order and mark its date on the chart."""
        plt.axvline(day, color=color)
        orders.append([day, stock, side, 100])

    longbuy = False
    shortbuy = False
    # The last row is skipped because each order is dated one row ahead.
    for i in range(19, len(test_features) - 1):
        signal = predicted[i]
        next_day = trade_dates[i + 1]
        if signal > 0:
            if shortbuy:
                place(next_day, "BUY", 'black')
                shortbuy = False
            elif not longbuy:
                place(next_day, "BUY", 'g')
                longbuy = True
        elif signal < 0:
            if longbuy:
                place(next_day, "SELL", 'black')
                longbuy = False
            elif not shortbuy:
                place(next_day, "SELL", 'r')
                shortbuy = True

    # Build the frame from the list directly so that an empty order list
    # yields a header-only CSV instead of an IndexError from 2-D slicing
    # of an empty array.
    ordersDataFrame = pd.DataFrame(
        orders, columns=['Date', 'Symbol', 'Order', 'Shares']).set_index('Date')

    # Raw string: the same bytes as before, but backslashes such as \U and
    # \N are never parsed as escape sequences.
    ordersDataFrame.to_csv(r"C:\Users\Nilav\Documents\ml_mc3_p2_IBM_test.csv")
    test_features[stock].plot(color='c')
    plt.show()
def inSampleIbmTest():
    """Run myStrategy on in-sample KNN predictions for IBM.

    A 3-nearest-neighbour learner is trained on the 2007-12-31..2009-12-31
    IBM data and queried on the same X values; the predictions, indexed
    like the price data, are passed to myStrategy together with the full
    data frame.
    """
    symbols = ['IBM']
    dates = pd.date_range('2007-12-31', '2009-12-31')
    prices = get_data(symbols, dates)
    df = getAllData(symbols, dates)

    # Training inputs and targets
    xTrain = getXTrain(df)
    yTrain = getYTrain(df)

    # Fit the learner, then query it ON THE IN SAMPLE DATA
    learner = knn.KNNLearner(k=3)
    learner.addEvidence(xTrain.values, yTrain.values)
    predictedDf = pd.DataFrame(data=learner.query(xTrain.values),
                               index=prices.index,
                               columns=['PRED'])

    # myStrategy receives: the data frame (used for the plot), the
    # predictions (used for the strategy), the symbol, the output csv name
    # and a title.
    myStrategy(df, predictedDf, "IBM", "inSampleIbm.csv", "IN SAMPLE IBM")
Exemplo n.º 5
0
def predict_for(input_file, num_to_predict=60):
    """Predict the next positions after the end of input_file and save them.

    Every line of input_file is parsed with convert_line. Starting from the
    last parsed point (with distance and angle taken from the last two
    points), the KNN learner is asked num_to_predict times for the next
    (x, y, angle, dist); each result feeds the next request.

    The predicted positions are written to 'prediction.txt' as one
    "x,y" line per step, with both values truncated to int.

    Args:
        input_file: path of the text file holding the observed points.
        num_to_predict: number of future steps to generate.
    """
    leaner = knn.KNNLearner(verbose=False)

    # Context manager so the input handle is closed as soon as it is read.
    with open(input_file, 'r') as source:
        data = [convert_line(line) for line in source]

    x = data[-1][0]
    y = data[-1][1]
    dist = get_dist(data[-2], data[-1])
    angle = get_angle(data[-2], data[-1])

    k = 5
    preds = []
    for _ in range(num_to_predict):
        # Each prediction becomes the input of the next step.
        x, y, angle, dist = leaner.predict(k, x, y, angle, dist)
        preds.append((x, y))

    # The with-block closes the file; no explicit close() is needed.
    with open("prediction.txt", "w") as f:
        for px, py in preds:
            f.write("%d,%d\n" % (int(px), int(py)))
def outOfSampleIbmTest():
    """Run myStrategy on out-of-sample KNN predictions for IBM.

    A 3-nearest-neighbour learner is trained on the 2007-12-31..2009-12-31
    data and queried on the X values built from the 2009-12-31..2010-12-31
    data. The predictions are indexed like the price data of the test range
    and passed to myStrategy with the test-range data frame.
    """
    symbols = ['IBM']

    # Training range (2008-2009). The price frame itself is not used below,
    # but it is still loaded exactly as before.
    train_dates = pd.date_range('2007-12-31', '2009-12-31')
    prices = get_data(symbols, train_dates)
    train_df = getAllData(symbols, train_dates)

    # Test range (2010): the prices supply the index for the predictions,
    # the full frame supplies the query features and the plot data.
    test_dates = pd.date_range('2009-12-31', '2010-12-31')
    test_prices = get_data(symbols, test_dates)
    df2010 = getAllData(symbols, test_dates)
    test_index = test_prices.index

    # Training inputs and targets from the 2008-2009 frame
    xTrain = getXTrain(train_df)
    yTrain = getYTrain(train_df)

    learner = knn.KNNLearner(k=3)
    learner.addEvidence(xTrain.values, yTrain.values)

    # Query with the 2010 features (built by the same getXTrain helper)
    xTest = getXTrain(df2010)
    predictedDf = pd.DataFrame(data=learner.query(xTest.values),
                               index=test_index,
                               columns=['PRED'])

    myStrategy(df2010, predictedDf, "IBM", "outOfSampleIBM.csv",
               "OUT OF SAMPLE IBM")
Exemplo n.º 7
0
def run_test(file_num):
    """Roll the KNN predictor forward from the start of a test file and plot it.

    Reads 'Inputs/test<NN>.txt' (NN = file_num, zero padded to two digits),
    parses every line with convert_line and seeds the predictor with the
    first point plus the distance and angle between the first two points.
    It then predicts 120 steps, each one fed by the previous result,
    printing the inputs and outputs of every step.

    Finally the first 120 parsed points (green) and the predictions (red)
    are shown in a scatter plot.

    Args:
        file_num: integer used to build the input file name.
    """
    leaner = knn.KNNLearner(verbose=False)

    input_file = 'Inputs/test%02d.txt' % file_num
    # Context manager so the input handle is closed as soon as it is read.
    with open(input_file, 'r') as source:
        data = [convert_line(line) for line in source]

    num_test = 120

    x = data[0][0]
    y = data[0][1]
    dist = get_dist(data[0], data[1])
    angle = get_angle(data[0], data[1])

    k = 7
    # Single-argument print(...) produces the same text as the print
    # statement on Python 2 and also runs on Python 3.
    print("starting loc: %s , %s" % (x, y))
    preds = []

    for _ in range(num_test):
        print("Input: %s %s" % (x, y))
        x, y, angle, dist = leaner.predict(k, x, y, angle, dist)

        print("X: %s Y: %s Angle: %s dist: %s" % (x, y, angle, dist))
        preds.append((x, y))

    acts = np.array(data[0:num_test])
    targets = np.array(preds)

    plt.scatter(acts[:, 0], acts[:, 1], color='green')
    plt.scatter(targets[:, 0], targets[:, 1], color='red')
    plt.show()
Exemplo n.º 8
0
def predict_outsamp(X_trn, y_trn, X_tst, y_tst, symbol, start_date, end_date, k):
    """Fit three learners on the training set and report out-of-sample error.

    A linear regression learner, a KNN learner (with the given k) and a
    100-bag bagged KNN learner are each trained on (X_trn, y_trn) and
    queried on X_tst. RMSE, correlation and mean absolute error against
    y_tst are printed, together with the RMSE of the plain average of the
    three predictions.

    Returns:
        (ytst_lr, ytst_knn, ytst_bag): the three prediction arrays.
    """
    def fit_and_query(model):
        # Train on the in-sample data, then predict the out-of-sample rows.
        model.addEvidence(X_trn, y_trn)
        return model.query(X_tst)

    ytst_lr = fit_and_query(lrl.LinRegLearner())
    ytst_knn = fit_and_query(knn.KNNLearner(k))
    ytst_bag = fit_and_query(
        bl.BagLearner(learner=knn.KNNLearner, kwargs={"k": k}, bags=100, boost=False))

    # Equal-weight blend of the three models
    combined = (ytst_lr + ytst_knn + ytst_bag) / 3

    print("")
    print("Out of sample predictions for %s data from %s to %s" % (
        symbol[0], start_date, end_date))
    print("KNN RMSE %0.4f; LinReg RMSE %0.4f; BagReg RMSE %0.4f; Combined RMSE %0.4f" % (
        rmse(y_tst, ytst_knn), rmse(y_tst, ytst_lr),
        rmse(y_tst, ytst_bag), rmse(y_tst, combined)))
    print("KNN corr %0.4f; LinReg corr %0.4f; BagReg corr %0.4f" % (
        np.corrcoef(y_tst, ytst_knn)[0, 1],
        np.corrcoef(y_tst, ytst_lr)[0, 1],
        np.corrcoef(y_tst, ytst_bag)[0, 1]))
    print("KNN mean %0.4f; LinReg mean %0.4f; BagReg mean %0.4f" % (
        abs(y_tst - ytst_knn).mean(),
        abs(y_tst - ytst_lr).mean(),
        abs(y_tst - ytst_bag).mean()))
    print("Actual mean 5 day change %0.4f" % (abs(y_tst).mean(),))
    print("")
    return ytst_lr, ytst_knn, ytst_bag
Exemplo n.º 9
0
def test():
    """Compare LinRegLearner and KNNLearner (k=3) on 'Data/best4KNN.csv'.

    The file is parsed as comma-separated floats; the last column is Y and
    the others are X. The first 60% of the rows are used for training and
    the remainder for testing. For each learner the in-sample and
    out-of-sample RMSE and correlation are printed.
    """
    # Context manager closes the file; the nested comprehension yields real
    # lists of floats on both Python 2 and 3 (map() is lazy on Python 3).
    with open('Data/best4KNN.csv') as inf:
        data = np.array(
            [[float(tok) for tok in s.strip().split(',')] for s in inf])

    # int() rather than math.floor(): floor returns a float on Python 2,
    # and a float is not a valid slice bound.
    train_rows = int(0.6 * data.shape[0])

    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    def report(headings, actual, predicted):
        """Print a blank line, the headings, then RMSE and correlation."""
        rmse = math.sqrt(((actual - predicted)**2).sum() / actual.shape[0])
        print("")
        for heading in headings:
            print(heading)
        print("RMSE:  %s" % (rmse,))
        c = np.corrcoef(predicted, y=actual)
        print("corr:  %s" % (c[0, 1],))

    # Linear regression: train once, evaluate in and out of sample
    learner = lrl.LinRegLearner()
    learner.addEvidence(trainX, trainY)
    report(["Linear Regression Learner", "In sample results"],
           trainY, learner.query(trainX))
    report(["Out of sample results"], testY, learner.query(testX))

    # KNN with k=3: same protocol
    learner = knn.KNNLearner(k=3)
    learner.addEvidence(trainX, trainY)
    report(["KNN Learner", "In sample results"],
           trainY, learner.query(trainX))
    report(["Out of sample results"], testY, learner.query(testX))
Exemplo n.º 10
0
    def t(self):
        """Check that KNNLearner agrees with scikit-learn's KNeighborsRegressor.

        Both models are fitted on self.xtrain / self.ytrain and queried on
        self.xtest; the RMS difference of their predictions must be below
        1e-7. `k` is not a local name -- it is resolved from an enclosing
        scope that is not visible in this method.
        """
        reference = neighbors.KNeighborsRegressor(k)
        candidate = knn.KNNLearner(k, False)  # the learner under test
        candidate.addEvidence(self.xtrain, self.ytrain)

        expected = reference.fit(self.xtrain, self.ytrain).predict(self.xtest)
        actual = candidate.query(self.xtest)

        rms_gap = np.linalg.norm(expected - actual) / np.sqrt(len(expected))
        self.assertLess(rms_gap, 1e-7)
Exemplo n.º 11
0
def SendtoModel(train_df, train_price, test_df, test_price, model='knn', symbol='IBM', k=3, bags=0, verbose=False):
    """
    Train the selected model on train_df and predict for both data frames.

    The last column of each frame is used as Y and all preceding columns
    as X. Only model == 'knn' is handled in this function; for any other
    value nothing is trained and None is returned implicitly.

    Returns (predY_train, predY_test) for the 'knn' model.
    """
    def split_xy(frame):
        # All columns but the last are features; the last one is the target.
        return np.array(frame.iloc[:, :-1]), np.array(frame.iloc[:, -1])

    trainX, trainY = split_xy(train_df)
    testX, testY = split_xy(test_df)

    print("shape testX %s" % (testX.shape,))
    print("shape testY %s" % (testY.shape,))

    if model == 'knn':
        # The learner is always created verbose, independently of `verbose`
        learner = knn.KNNLearner(k=k, verbose=True)
        learner.addEvidence(trainX, trainY)

        # in-sample predictions and error
        predY_train = learner.query(trainX)
        rmse_train = math.sqrt(((trainY - predY_train) ** 2).sum() / trainY.shape[0])

        # out-of-sample predictions and error
        predY_test = learner.query(testX)
        rmse_test = math.sqrt(((testY - predY_test) ** 2).sum() / testY.shape[0])

        #output graphs - normalized lines
        # plot_lines_data(price_norm=train_price, actualY=train_df.iloc[0:,-1], predY=pd.Series(predY_train, index=train_df.index),
        #           name='%s_in_sample_%s' % (symbol[0], model))
        # plot_lines_data(price_norm=test_price, actualY=test_df.iloc[0:,-1], predY=pd.Series(predY_test, index=test_df.index),
        #           name='%s_out_sample_%s' % (symbol[0], model))

        if verbose:
            banner = "%s with arguments k=%s, bags=%s" % (model, k, bags)

            #(a) in sample results
            print(banner)
            print("In sample results")
            print("RMSE:  %s" % (rmse_train,))
            c = np.corrcoef(predY_train, y=trainY)
            print("corr:  %s" % (c[0, 1],))

            #(b) out of sample results
            print("")
            print(banner)
            print("Out of sample results")
            print("RMSE:  %s" % (rmse_test,))
            c = np.corrcoef(predY_test, y=testY)
            print("corr:  %s" % (c[0, 1],))
            print("length of predicted values:  %s" % (len(predY_test),))

        return predY_train, predY_test
Exemplo n.º 12
0
def test(learner=None,
         symb='IBM',
         train_sd=dt.datetime(2007, 12, 31),
         train_ed=dt.datetime(2009, 12, 31),
         test_sd=dt.datetime(2007, 12, 31),
         test_ed=dt.datetime(2009, 12, 31)):
    """Train a learner on one date range, predict another and back-test it.

    Steps: build the training set with getData and fit the learner; build
    the testing set and predict it; plot and print the correlation between
    actual and predicted Y; plot current, actual-future and predicted-future
    prices; derive trades with operations() and plot them; compute the
    portfolio values from "orders.csv" and plot them against normalised
    SPY; print the portfolio statistics.

    Args:
        learner: object with addEvidence(X, Y) and query(X). When omitted,
            a fresh knn.KNNLearner(3) is created for this call.
        symb: symbol to trade; also the price column name in the data frame.
        train_sd, train_ed: training date range.
        test_sd, test_ed: testing date range.
    """
    # A learner instance used as a default argument would be built once at
    # definition time and shared (and retrained) by every call; create it
    # per call instead.
    if learner is None:
        learner = knn.KNNLearner(3)

    #generate training dataset
    df, trainX, trainY = getData(train_sd, train_ed, symb)
    #add evidence
    learner.addEvidence(trainX, trainY)
    #generate testing dataset
    df2, testX, testY = getData(test_sd, test_ed, symb)
    #use learner to predict value
    predY = learner.query(testX)

    #plot correlations between test and predict data, and calculate their correlation
    plot_corr(testY, predY)
    c = np.corrcoef(predY, y=testY)
    print("corr:  %s" % (c[0, 1],))

    #generate plot for current price, train price and predict price
    df2['pred'] = predY
    df2['fv'] = df2[symb] * (1 + df2['fr'])
    df2['pv'] = df2[symb] * (1 + df2['pred'])
    plot_compare(df2[symb], df2['fv'], df2['pv'])

    #use predict data to execute orders and plot long, short, exit as vertical lines
    le, se, s = operations(df2, symb)
    plot_trade(df2[symb], symb, le, se, s)

    #compute portvals and plot backtest results
    portvals = compute_portvals("orders.csv", start_val=10000)
    norm_SPY = df2['SPY'] / df2['SPY'][0] * 10000
    plot_bt(portvals, symb, norm_SPY)

    #analysis of portfolio
    cr, adr, sddr, sr = analysis(portvals)
    print("%s %s %s %s %s" % (cr, adr, sddr, sr, portvals[-1]))
    # NOTE(review): everything from here on uses names that are not defined
    # in the visible part of this function (priceC, get_trainX, get_trainY,
    # get_testX, get_testY). It looks like the body of a different example
    # whose header was lost -- confirm before relying on it.

    # Column-wise mean and standard deviation of the price frame
    price_mean = priceC.mean().values
    price_std = priceC.std().values
    # The first temp_train value is discarded by the next assignment, which
    # stores the z-score normalised prices: (price - mean) / std.
    temp_train = priceC.rename(columns={'ML4T-240': 'price_of_stock'})
    temp_train = (priceC.rename(columns={'ML4T-240': 'price_of_stock_norm'}) -
                  price_mean) / price_std
    #position 4
    # temp_test keeps the raw (non-normalised) prices
    temp_test = priceC.rename(columns={'ML4T-240': 'price_of_stock'})
    #temp_test = (priceC.rename(columns={'ML4T-240':'price_of_stock_norm'})-price_mean)/ price_std

    # X drops its last 5 rows, Y drops its first 20 rows
    # (presumably to align features with targets -- TODO confirm)
    train_dataX = get_trainX()[:-5]
    train_dataY = get_trainY()[20:]
    test_dataX = get_testX()[:-5]
    test_dataY = get_testY()[20:]

    #create learner
    learner = knn.KNNLearner(k=3, verbose=False)
    # NOTE(review): training receives the objects as returned by the getters,
    # while the query below passes `.values` -- confirm this is intended.
    learner.addEvidence(train_dataX, train_dataY)
    '''train data'''
    #get df predY_train
    predY_train = learner.query(train_dataX.values)
    # Copy of the training Y, reused as a container with the same index
    predY_train_df = train_dataY.copy(deep=True)

    # Overwrite the copied values with the in-sample predictions
    predY_train_df[:] = predY_train

    predY_train_df = predY_train_df.to_frame()
    predY_train_df = predY_train_df.rename(columns={'ML4T-240': 'PredY_train'})

    #get df Y_train
    train_dataY_df = pd.DataFrame(train_dataY)
    train_dataY_df = train_dataY_df.rename(columns={'ML4T-240': 'Y_train'})
Exemplo n.º 14
0
def assess_portfolio(start_date, end_date, symbols):
    """Fit a 3-NN learner on derived price features and report its fit.

    Prices for `symbols` over start_date..end_date are loaded with
    get_data, turned into a feature frame by calculate() and shiftY(), and
    split by fixed date labels: rows '2007-12-31'..'2008-12-31' for
    training and '2009-12-31'..'2010-12-31' for testing. In both slices
    columns 1 up to (not including) the second-to-last are X and the
    second-to-last column is Y.

    For each slice the predictions are passed to generateOrders and the
    RMSE and correlation against Y are printed. Returns nothing.

    Args:
        start_date, end_date: bounds of the date range to load.
        symbols: list of portfolio symbols.
    """
    pd.set_option('display.expand_frame_repr', False)
    dates = pd.date_range(start_date, end_date)
    prices_all = get_data(symbols, dates)  # automatically adds SPY
    prices = prices_all[symbols]  # only portfolio symbols

    features = calculate(prices_all, symbols, prices)
    features = shiftY(False, features, symbols)

    # .loc is the label-based replacement for the .ix indexer, which has
    # been removed from pandas; slicing by date label behaves the same.
    prices_all_test = features.loc['2009-12-31':'2010-12-31']
    prices_all_temp = features.loc['2007-12-31':'2008-12-31']

    # separate out training and testing data
    trainX = prices_all_temp.values[:, 1:-2]
    trainY = prices_all_temp.values[:, -2]
    testX = prices_all_test.values[:, 1:-2]
    testY = prices_all_test.values[:, -2]

    learner = knn.KNNLearner(3)
    learner.addEvidence(trainX, trainY)  # train it

    def evaluate(frame, dataX, dataY, headings):
        """Predict dataX, generate orders for frame and print the fit."""
        predY = learner.query(dataX)
        generateOrders(frame, predY, symbols)
        rmse = math.sqrt(((dataY - predY) ** 2).sum() / dataY.shape[0])
        for heading in headings:
            print(heading)
        print("RMSE:  %s" % (rmse,))
        c = np.corrcoef(predY, y=dataY)
        print("corr:  %s" % (c[0, 1],))

    evaluate(prices_all_temp, trainX, trainY,
             ["---------------- ----- ---------------", "In sample results"])
    evaluate(prices_all_test, testX, testY, ["Out of sample results"])
Exemplo n.º 15
0
    # NOTE(review): this is the interior of a definition whose header is not
    # visible; `data` is expected to be a 2-D array defined above it, with
    # the target in the last column (see the slicing below).

    # compute how much of the data is training and testing
    # NOTE(review): math.floor returns a float on Python 2, so train_rows is
    # a float used as a slice bound below -- confirm the NumPy version in
    # use still accepts that.
    train_rows = math.floor(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows

    # separate out training and testing data
    # (first 60% of the rows train, the rest test; last column is Y)
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    #start my test
    import KNNLearner as knn
    knnktest = []
    # Sweep k from 1 to 100, measuring in-sample and out-of-sample error
    for r in range(1, 101):
        learner = knn.KNNLearner(k=r)  # constructor
        learner.addEvidence(trainX, trainY)  # training step
        predYKNNTrain = learner.query(trainX)  # query
        # in-sample root-mean-square error
        rmseTrain = math.sqrt(
            ((trainY - predYKNNTrain)**2).sum() / trainY.shape[0])
        print
        print "KNN Learner"
        print "In sample results"
        print "RMSE: ", rmseTrain
        cTrain = np.corrcoef(predYKNNTrain, y=trainY)
        print "corr: ", cTrain[0, 1]

        # A second learner with the same k is built and trained on the same
        # data before the out-of-sample query.
        learner = knn.KNNLearner(k=r)  # constructor
        learner.addEvidence(trainX, trainY)  # training step
        predYKNN = learner.query(testX)  # query
        # out-of-sample root-mean-square error
        rmse = math.sqrt(((testY - predYKNN)**2).sum() / testY.shape[0])
Exemplo n.º 16
0
    """

    file_num = 1
    df = pd.read_csv('data/data%02d.txt' % file_num, index_col=0)
    return df


if __name__ == "__main__":
    # Script entry point: ask the KNN learner for one prediction and print
    # the first three components of the result.

    # Earlier pipeline steps, left disabled:
    #process_training_data()
    #normalize_training_data()
    #df = load_test_data()

    #print df

    learner = knn.KNNLearner()

    # Sample rows of the data, for reference:
    # ,x,y,angle,dist,drift,xp,yp,x_drift,y_drift,dest_x,dest_y
    # 2,278,64,0.343023940421,14.8660687473,7.21110255093,272,60,-6.0,-4.0,282,60
    # 3,282,60,-0.785398163397,5.65685424949,13.4536240471,292,69,10.0,9.0,296,67
    # 4,296,67,0.463647609001,15.6524758425,14.8660687473,286,56,-10.0,-11.0,306,59
    # 5,306,59,-0.674740942224,12.8062484749,15.5241746963,310,74,4.0,15.0,306,62
    # 6,306,62,1.57079632679,3.0,14.8660687473,316,51,10.0,-11.0,315,61
    # 7,315,61,-0.110657221174,9.05538513814,9.8488578018,306,65,-9.0,4.0,321,64

    # Query with k=3 and values close to the x, y, angle, dist of sample row 2
    result = learner.predict(3, 278, 64, 0.3430239, 14.866068)

    print("Dest X: %s" % (result[0],))
    print("Dest Y: %s" % (result[1],))
    print("Angle : %s" % (result[2],))
Exemplo n.º 17
0
def testlearner():
    '''
    Benchmark the KNN learner (k = 1..50) and the linear regression learner
    on "data-classification-prob.csv" and "data-ripple-prob.csv".

    The first 60% of the rows of each file are used for training and the
    rest for querying. Per-instance timings, RMS error and correlation are
    printed, and the charts are saved as PDF files.
    '''
    train_fraction = 0.6  # data for training is 60% of total data

    def load_split(filename):
        # Returns (x_train, y_train, x_test, y_test, n_train, n_test).
        # Per the original note, the arrays from _csv_read may hold strings;
        # Y is copied into float column vectors, X is sliced as-is.
        features, targets = _csv_read(filename)
        total = features.shape[0]
        n_train = int(total * train_fraction)
        n_test = total - n_train
        y_train = np.zeros([n_train, 1])
        y_train[:, 0] = targets[0:n_train]
        y_test = np.zeros([n_test, 1])
        y_test[:, 0] = targets[n_train:total]
        return (features[0:n_train, :], y_train,
                features[n_train:total, :], y_test, n_train, n_test)

    dcp = load_split("data-classification-prob.csv")
    drp = load_split("data-ripple-prob.csv")

    # ---- KNN learner -----------------------------------------------------
    # Rows of each result table: k, training time, query time, total time,
    # out-of-sample RMS error, correlation coefficient, in-sample RMS error.
    KNN_dcp_result = np.zeros([7, 50])  # data-classification-prob.csv
    KNN_drp_result = np.zeros([7, 50])  # data-ripple-prob.csv

    def knn_column(model, parts, table, col):
        # Train and query `model` on one data set, fill column `col` of
        # `table` and return the out-of-sample predictions.
        x_train, y_train, x_test, y_test, n_train, n_test = parts

        began = time.time()
        model.addEvidence(x_train, y_train)
        ended = time.time()
        table[1][col] = (ended - began) / n_train  # training time cost

        began = time.time()
        learned = model.query(x_test)
        ended = time.time()
        table[2][col] = (ended - began) / n_test  # query time cost

        table[3][col] = table[1][col] + table[2][col]  # total time cost
        table[4][col] = RMSE(y_test, learned)  # root-mean-square error
        table[5][col] = np.corrcoef(learned.T, y_test.T)[0][1]  # correlation
        table[6][col] = RMSE(y_train, model.query(x_train))  # in-sample error
        return learned

    def scatter_pdf(title, actual, predicted, filename):
        # Predicted-versus-actual scatter chart saved as PDF
        plt.clf()
        fig = plt.figure()
        fig.suptitle(title)
        plt.plot(actual, predicted, 'o', markersize=5)
        plt.xlabel('Actual Y')
        plt.ylabel('Predicted Y')
        fig.savefig(filename, format='pdf')

    for k in range(1, 51):
        col = k - 1
        # The same learner instance is trained on both data sets in turn
        KNN_lner = KNNLearner(k)
        KNN_dcp_result[0][col] = k
        KNN_drp_result[0][col] = k

        Ydcp_learn = knn_column(KNN_lner, dcp, KNN_dcp_result, col)
        Ydrp_learn = knn_column(KNN_lner, drp, KNN_drp_result, col)

        # classification scatter is drawn at k == 27, ripple at k == 3
        if k == 27:
            scatter_pdf('Y of classification data', dcp[3], Ydcp_learn,
                        'classification_Y.pdf')
        if k == 3:
            scatter_pdf('Y of ripple data', drp[3], Ydrp_learn,
                        'ripple_Y.pdf')

    # For each data set: the k=3 column, then the column with the highest
    # correlation coefficient
    for table in (KNN_dcp_result, KNN_drp_result):
        print(table[:, 2])
        best_pos = np.argmax(table[5, :])
        print(table[:, best_pos])

    def compare_pdf(row, ylabel, filename, **legend_opts):
        # One line per data set for the given result row, plotted against k
        plt.clf()
        fig = plt.figure()
        plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[row, :], 'r',
                 label='Classification')
        plt.plot(KNN_drp_result[0, :], KNN_drp_result[row, :], 'b',
                 label='Ripple')
        plt.legend(**legend_opts)
        plt.xlabel('K')
        plt.ylabel(ylabel)
        fig.savefig(filename, format='pdf')

    def rmse_pdf(table, title, filename):
        # Out-of-sample versus in-sample RMS error against k
        plt.clf()
        fig = plt.figure()
        fig.suptitle(title)
        plt.plot(table[0, :], table[4, :], 'or', label='out of sample')
        plt.plot(table[0, :], table[6, :], 'ob', label='in sample')
        plt.legend(loc=4)
        plt.xlabel('K')
        plt.ylabel('RMS Error')
        fig.savefig(filename, format='pdf')

    compare_pdf(5, 'Correlation Coefficient', 'Correlation_KNN.pdf')
    rmse_pdf(KNN_dcp_result, 'RMS error of classification data',
             'classification-RMSE.pdf')
    rmse_pdf(KNN_drp_result, 'RMS error of ripple data', 'ripple-RMSE.pdf')
    compare_pdf(1, 'train time / s', 'traintime.pdf', loc=1)
    compare_pdf(2, 'query time / s', 'querytime.pdf', loc=4)

    # ---- Linear regression -----------------------------------------------
    LR_lner = LinRegLearner()

    def linreg_summary(parts):
        # Returns [train time, query time, total time, RMS error, correlation]
        x_train, y_train, x_test, y_test, n_train, n_test = parts
        summary = np.zeros(5)

        began = time.time()
        coefficients = LR_lner.addEvidence(x_train, y_train)
        ended = time.time()
        summary[0] = (ended - began) / n_train  # train time cost

        began = time.time()
        fitted = LR_lner.query(x_test, coefficients)
        ended = time.time()
        summary[1] = (ended - began) / n_test  # query time cost

        summary[2] = summary[0] + summary[1]  # total time cost
        summary[3] = RMSE(y_test, fitted)  # root-mean-square error
        summary[4] = np.corrcoef(y_test.T, fitted.T)[0][1]  # correlation
        return summary

    for parts in (dcp, drp):
        print(linreg_summary(parts))
Exemplo n.º 18
0
def main():
    """Time KNNLearner against kdtknn on the ripple data and plot the results.

    The first 60% of the rows of 'data-ripple-prob.csv' are used for
    training and the rest for querying (last column is Y). For each method
    ('mean', 'median') and each k in 1..30 both learners are trained and
    queried, and the elapsed time per instance is recorded. Four PDF charts
    (training and query time for each learner) compare the two methods.
    """
    trainpercent = 60
    methods = ['mean', 'median']

    # `samples` instead of `input`, which would shadow the builtin.
    samples = np.loadtxt('data-ripple-prob.csv', delimiter=',')
    # Integer floor division keeps the split point an int; math.floor
    # returns a float on Python 2, which is not a valid slice bound.
    trainsize = samples.shape[0] * trainpercent // 100

    # split data into train and test sets
    Xtrain = samples[:trainsize, :-1]
    Ytrain = samples[:trainsize, -1]
    Xtest = samples[trainsize:, :-1]

    MAXK = 30
    NUMCOLS = 5  # k, my train, my query, kdt train, kdt query

    def seconds_per_row(action, rows):
        """Run action() and return its elapsed seconds divided by rows."""
        began = dt.datetime.now()
        action()
        finished = dt.datetime.now()
        return gettotalseconds(began, finished) / rows

    # Plain `float` is what the removed `np.float` alias stood for.
    meanstats = np.zeros((MAXK, NUMCOLS), dtype=float)
    medianstats = np.zeros((MAXK, NUMCOLS), dtype=float)

    for method in methods:
        stats = np.zeros((MAXK, NUMCOLS), dtype=float)

        for k in range(1, MAXK + 1):
            row = stats[k - 1]
            row[0] = k

            learner = KNNLearner(k, method)
            row[1] = seconds_per_row(
                lambda: learner.addEvidence(Xtrain, Ytrain), Xtrain.shape[0])
            row[2] = seconds_per_row(
                lambda: learner.query(Xtest), Xtest.shape[0])

            kdtlearner = kdtknn(k, method)
            row[3] = seconds_per_row(
                lambda: kdtlearner.addEvidence(Xtrain, Ytrain), Xtrain.shape[0])
            row[4] = seconds_per_row(
                lambda: kdtlearner.query(Xtest), Xtest.shape[0])

        if method == 'median':
            medianstats = stats.copy()
        else:
            meanstats = stats.copy()

    # One chart per timing column: time/instance versus k for both methods
    timedelta = 0.001
    outputfilenames = ['mytraining.pdf', 'myquery.pdf', 'kdtknntraining.pdf', 'kdtknnquery.pdf']
    titles = ['mytrainingtime/instance', 'myquerytime/instance', 'kdtknntrainingtime/instance', 'kdtknnquerytime/instance']
    for index in range(1, NUMCOLS):
        mean_col = meanstats[:, index]
        median_col = medianstats[:, index]

        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], mean_col, color='r')
        plt.plot(medianstats[:, 0], median_col, color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel(titles[index - 1])
        plt.xlabel('k')
        # Pad the y-range slightly so the curves do not touch the frame
        plt.ylim(min(min(mean_col), min(median_col)) - timedelta,
                 max(max(mean_col), max(median_col)) + timedelta)
        plt.savefig(outputfilenames[index - 1], format='pdf')
def main():
    """Sweep k from 1 to MAXK for the mean and median KNN learners.

    For each of the two CSV data sets, the first 60% of the rows (in file
    order, no shuffling) are used for training and the rest for testing.
    The last column is the target Y; all other columns are features.

    For every method and every k the learner is trained and queried, and
    one row of statistics is stored:

        column 0: k
        column 1: correlation between actual and predicted test Y
        column 2: training time per training instance
        column 3: query time per test instance

    One summary line is printed per (file, method), and a plot of
    correlation versus k (mean in red, median in blue) is saved to
    plot1.pdf / plot2.pdf.
    """
    trainpercent = 60

    filenames = ['data-classification-prob.csv', 'data-ripple-prob.csv']
    outputfilenames = ['plot1.pdf', 'plot2.pdf']
    methods = ['mean', 'median']

    MAXK = 300
    NUMCOLS = 4

    for filename, outputfilename in zip(filenames, outputfilenames):
        #read data from data file
        data = np.loadtxt(filename, delimiter=',')
        # Integer floor division keeps the slice bound an int. math.floor()
        # returns a float on Python 2, which NumPy rejects as an index.
        trainsize = data.shape[0] * trainpercent // 100

        #split data into train and test sets
        Xtrain = data[:trainsize, :-1]
        Ytrain = data[:trainsize, -1]
        Xtest = data[trainsize:, :-1]
        Ytest = data[trainsize:, -1]

        # Only set for k == 3 with method 'mean'; since 'mean' runs first,
        # the same values are also reported on the 'median' summary line.
        avgtraintime = -1
        avgtesttime = -1

        statsbymethod = {}

        for method in methods:
            # builtin float: the np.float alias was removed from NumPy
            stats = np.zeros((MAXK, NUMCOLS), dtype=float)
            bestcorr = -1000
            bestK = -1

            for k in range(1, MAXK + 1):
                #instantiate learner and test
                learner = KNNLearner(k, method)

                #time the training step
                trainstarttime = dt.datetime.now()
                learner.addEvidence(Xtrain, Ytrain)
                trainendtime = dt.datetime.now()

                #time the query step
                teststarttime = dt.datetime.now()
                Y = learner.query(Xtest)
                testendtime = dt.datetime.now()

                #correlation between actual and predicted test values
                corr = np.corrcoef(Ytest.T, Y.T)[0, 1]
                if corr > bestcorr:
                    bestcorr = corr
                    bestK = k

                stats[k - 1, 0] = k
                stats[k - 1, 1] = corr
                # gettotalseconds is used instead of timedelta.total_seconds(),
                # which only exists in python >= 2.7
                stats[k - 1, 2] = gettotalseconds(
                    trainstarttime, trainendtime) / Xtrain.shape[0]
                stats[k - 1, 3] = gettotalseconds(
                    teststarttime, testendtime) / Xtest.shape[0]

                if k == 3 and method == 'mean':
                    avgtraintime = stats[k - 1, 2]
                    avgtesttime = stats[k - 1, 3]

            # Single parenthesised argument: prints the same text on
            # Python 2 and Python 3.
            print('File:%s Method:%s BestCorrelation:%f K corresponding to best correlation:%f AvgTrainTimeForK3Mean :%f seconds AvgTestTimeForK3Mean:%f seconds' % (
                filename, method, bestcorr, bestK, avgtraintime,
                avgtesttime))

            statsbymethod[method] = stats

        meanstats = statsbymethod['mean']
        medianstats = statsbymethod['median']

        #Graph for k versus corrcoef
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, 1], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, 1], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel('Correlation Coefficient')
        plt.xlabel('k')
        plt.savefig(outputfilename, format='pdf')
Exemplo n.º 20
0
# Script fragment: evaluates a KNN learner for k = 1..n and records the RMS
# error and the correlation coefficient of its predictions on the test set.
# n, rms_knn_out, Xtrain, Ytrain, Xtest, Ytest, X and Y are defined before
# this fragment.
corr_coef_knn_out = np.zeros((n))

# Placeholders for linear-regression statistics; not written again in the
# part of the script shown here.
rms_lr_in = 0
corr_coef_lr_in = 0
rms_lr_out = 0
corr_coef_lr_out = 0
p = 1
# Show the raw training features before running the sweep.
plt.clf()
plt.plot(Xtrain)
plt.show()

K = np.zeros((n))  # K[i] holds the k value used for result slot i
Y_best_knn = []
for k in range(1, n + 1):
    K[k - 1] = k
    learner = KNNLearner.KNNLearner(k)
    learner.addEvidence(Xtrain, Ytrain)
    Y_out_knn = learner.query(Xtest)

    # Root-mean-square error of the predictions against Ytest.
    # NOTE(review): `sum` shadows the builtin of the same name.
    sum = 0
    for i in range(len(Y_out_knn)):
        sum += math.pow((Y_out_knn[i] - Ytest[i]), 2)
    rms_knn_out[k - 1] = math.sqrt(sum / len(Y_out_knn))
    corr_coef_knn_out[k - 1] = np.corrcoef(Y_out_knn, Ytest)[0, 1]

    # Second pass: feed (X, Y) to the same learner and query Xtest again.
    # NOTE(review): the variable name suggests an in-sample measurement, yet
    # the query and the comparison still use Xtest / Ytest - confirm intent.
    # NOTE(review): whether addEvidence replaces or extends the data given
    # in the first call is not visible here - verify against KNNLearner.
    learner.addEvidence(X, Y)
    Y_in_knn = learner.query(Xtest)

    sum = 0
    for i in range(len(Y_in_knn)):
        sum += math.pow((Y_in_knn[i] - Ytest[i]), 2)
    print "corr: ", c[0,1]

    """

    print "---------------- KNN ---------------"

    inSampleError = []
    outOfSampleError = []
    kArr = []
    for i in range(3, 4):
        #learner = bl.BagLearner(learner = knn.KNNLearner, kwargs = {"k":3}, bags = 20, boost = False)
        #learner.addEvidence(trainX, trainY)

        print "BAGS:"
        print i
        learner = knn.KNNLearner(k=3)
        learner.addEvidence(trainX, trainY)  # tra

        kArr.append(i)
        #learner = knn.KNNLearner(i)
        #learner.addEvidence(trainX, trainY) # train it
        predY = learner.query(trainX)  # get the predictions
        rmse = math.sqrt(((trainY - predY)**2).sum() / trainY.shape[0])
        print "In sample results"
        print "RMSE: ", rmse
        inSampleError.append(rmse)
        c = np.corrcoef(predY, y=trainY)
        print "corr: ", c[0, 1]
        predY = learner.query(testX)  # get the predictions
        rmse = math.sqrt(((testY - predY)**2).sum() / testY.shape[0])
        print "Out of sample results"
    # separate out training and testing data
    #    trainX = data[:train_rows,0:-1]
    #    trainY = data[:train_rows,-1]
    #    testX = data[train_rows:,0:-1]
    #    testY = data[train_rows:,-1]
    trainX = dataTrain[:, 0:-1]
    trainY = dataTrain[:, -1]
    testX = dataTest[:, 0:-1]
    testY = dataTest[:, -1]

    #    print(testX.shape)
    #    print(testY.shape)

    # create a learner and train it
    start = time.time()
    learner = knn.KNNLearner(k=3, verbose=True)  # create a KNNLearner
    learner.add_evidence(trainX, trainY)  # train it

    # evaluate in sample
    Y = learner.query(trainX)  # get the predictions
    rmse = math.sqrt(((trainY - Y)**2).sum() / trainY.shape[0])
    #    print(learner.model_coefs)
    print("In sample results")
    print("RMSE: ", rmse)
    corr = np.corrcoef(Y, y=trainY)
    print("corr: ", corr[0, 1])
    #
    # evaluate out of sample
    Y = learner.query(testX)  # get the predictions
    rmse = math.sqrt(((testY - Y)**2).sum() / testY.shape[0])
    print
Exemplo n.º 23
0
def testlearner():
	'''
	Benchmark the KNN learner (k = 1..50) and the linear regression learner.

	Both CSV data sets are split in file order: the first 60% of the rows
	train the learner and the remaining rows are used for queries.

	For every k a row-per-metric table is filled for each data set:
		row 0: k
		row 1: training time per training row
		row 2: query time per test row
		row 3: row 1 + row 2
		row 4: RMSE of the predictions on the test rows
		row 5: correlation coefficient of predicted vs actual test Y
		row 6: RMSE of the predictions on the training rows

	Scatter plots, correlation, RMSE and timing plots are written as PDF
	files in the working directory, and result columns are printed.
	'''
	train_fraction = 0.6
	max_k = 50

	def _split(features, targets):
		# Returns (Xtrain, Ytrain, Xtest, Ytest, n_train, n_test); the Y
		# parts are copied into single-column float arrays.
		n_rows = features.shape[0]
		n_train = int(n_rows * train_fraction)
		n_test = n_rows - n_train
		y_train = np.zeros([n_train, 1])
		y_train[:, 0] = targets[0:n_train]
		y_test = np.zeros([n_test, 1])
		y_test[:, 0] = targets[n_train:n_rows]
		return (features[0:n_train, :], y_train,
				features[n_train:n_rows, :], y_test, n_train, n_test)

	def _measure_knn(learner, parts, table, col):
		# Trains and queries `learner` on one data set, fills rows 1-6 of
		# column `col` and returns the predictions for the test rows.
		x_train, y_train, x_test, y_test, n_train, n_test = parts

		began = time.time()
		learner.addEvidence(x_train, y_train)
		ended = time.time()
		table[1][col] = (ended - began) / n_train

		began = time.time()
		predicted = learner.query(x_test)
		ended = time.time()
		table[2][col] = (ended - began) / n_test

		table[3][col] = table[1][col] + table[2][col]
		table[4][col] = RMSE(y_test, predicted)
		table[5][col] = np.corrcoef(predicted.T, y_test.T)[0][1]

		# error on the rows the learner was trained with
		table[6][col] = RMSE(y_train, learner.query(x_train))
		return predicted

	def _measure_linreg(learner, parts):
		# Returns [train time/row, query time/row, their sum, RMSE, corr].
		x_train, y_train, x_test, y_test, n_train, n_test = parts
		summary = np.zeros(5)

		began = time.time()
		coefs = learner.addEvidence(x_train, y_train)
		ended = time.time()
		summary[0] = (ended - began) / n_train

		began = time.time()
		predicted = learner.query(x_test, coefs)
		ended = time.time()
		summary[1] = (ended - began) / n_test

		summary[2] = summary[0] + summary[1]
		summary[3] = RMSE(y_test, predicted)
		summary[4] = np.corrcoef(y_test.T, predicted.T)[0][1]
		return summary

	def _scatter_plot(title, actual, predicted, filename):
		# Predicted-versus-actual scatter plot saved as PDF.
		plt.clf()
		fig = plt.figure()
		fig.suptitle(title)
		plt.plot(actual, predicted, 'o', markersize = 5)
		plt.xlabel('Actual Y')
		plt.ylabel('Predicted Y')
		fig.savefig(filename, format = 'pdf')

	def _both_sets_plot(row, ylabel, filename, **legend_args):
		# One metric row of both data sets plotted against k.
		plt.clf()
		fig = plt.figure()
		plt.plot(KNN_dcp_result[0, :], KNN_dcp_result[row, :], 'r', label = 'Classification')
		plt.plot(KNN_drp_result[0, :], KNN_drp_result[row, :], 'b', label = 'Ripple')
		plt.legend(**legend_args)
		plt.xlabel('K')
		plt.ylabel(ylabel)
		fig.savefig(filename, format = 'pdf')

	def _rmse_plot(title, table, filename):
		# Test-row RMSE (row 4) and training-row RMSE (row 6) against k.
		plt.clf()
		fig = plt.figure()
		fig.suptitle(title)
		plt.plot(table[0, :], table[4, :], 'or', label = 'out of sample')
		plt.plot(table[0, :], table[6, :], 'ob', label = 'in sample')
		plt.legend(loc = 4)
		plt.xlabel('K')
		plt.ylabel('RMS Error')
		fig.savefig(filename, format = 'pdf')

	Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
	Xdrp, Ydrp = _csv_read("data-ripple-prob.csv")
	dcp = _split(Xdcp, Ydcp)
	drp = _split(Xdrp, Ydrp)

	# KNN learner: one column per k, one row per metric (see docstring)
	KNN_dcp_result = np.zeros([7, max_k])
	KNN_drp_result = np.zeros([7, max_k])

	for k in range(1, max_k + 1):
		col = k - 1
		KNN_lner = KNNLearner(k)
		KNN_dcp_result[0][col] = k
		KNN_drp_result[0][col] = k

		# The same learner object is trained on the classification data
		# first and on the ripple data afterwards.
		Ydcp_learn = _measure_knn(KNN_lner, dcp, KNN_dcp_result, col)
		Ydrp_learn = _measure_knn(KNN_lner, drp, KNN_drp_result, col)

		# NOTE(review): the classification scatter is drawn at k == 27 and
		# the ripple scatter at k == 3 - confirm both values are intended.
		if k == 27:
			_scatter_plot('Y of classification data', dcp[3], Ydcp_learn,
					'classification_Y.pdf')

		if k == 3:
			_scatter_plot('Y of ripple data', drp[3], Ydrp_learn,
					'ripple_Y.pdf')

	# per data set: the k = 3 column, then the column with the best correlation
	for table in (KNN_dcp_result, KNN_drp_result):
		print(table[:, 2])
		best_col = np.argmax(table[5, :])
		print(table[:, best_col])

	_both_sets_plot(5, 'Correlation Coefficient', 'Correlation_KNN.pdf')
	_rmse_plot('RMS error of classification data', KNN_dcp_result,
			'classification-RMSE.pdf')
	_rmse_plot('RMS error of ripple data', KNN_drp_result, 'ripple-RMSE.pdf')
	_both_sets_plot(1, 'train time / s', 'traintime.pdf', loc = 1)
	_both_sets_plot(2, 'query time / s', 'querytime.pdf', loc = 4)

	# Linear regression: one learner, measured on each data set in turn
	LR_lner = LinRegLearner()

	LR_dcp_result = _measure_linreg(LR_lner, dcp)
	print(LR_dcp_result)

	LR_drp_result = _measure_linreg(LR_lner, drp)
	print(LR_drp_result)
Exemplo n.º 24
0
def test_KNN(X_whole, y_whole, X, y,
             fig_dir='/Users/ajinkya.bagde/Desktop/AS1_Figs/KNN'):
    """Run the KNN experiment: learning curves, parameter sweeps, final score.

    Steps:
      1. Hold out 20% of (X, y) as the final test set.
      2. Split the remaining training data again (80/20) into a training
         part and a validation part used for model selection.
      3. Save learning curves for 2, 4, 6, 8 and 10 neighbors.
      4. Sweep one parameter family at a time through knnlearner.train /
         knnlearner.test (flags 0..4: neighbors, weights, algorithm,
         metric, p).
      5. Tune hyperparameters, refit with the best parameters, save the
         final learning curve and score the held-out test set.

    Args:
        X_whole: unused; kept so existing callers keep working.
        y_whole: unused; kept so existing callers keep working.
        X: feature matrix.
        y: labels for X.
        fig_dir: directory the learning-curve PNG files are written to.
            Defaults to the path that was previously hard-coded.
    """
    import os  # local import: only needed to build the figure paths

    # Split the initial data
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, test_size=0.2, random_state=42)

    start = datetime.now()

    ### NNLearner Implementation ###
    knnlearner = knn.KNNLearner(n_folds=3, verbose=True)

    # Create a validation set from the TRAINING data only. Splitting (X, y)
    # again with the same random_state would reproduce the split above, so
    # the final test rows would also be the rows used for model selection.
    xtrain_val, xtest_val, ytrain_val, ytest_val = train_test_split(
        xtrain, ytrain, test_size=0.2, random_state=42)

    def _save_learning_curve(estimator, title, filename):
        """Plot a learning curve for estimator and save it under fig_dir."""
        # Kept from the original flow: a 3x1 figure is opened before every
        # curve. NOTE(review): its axes are never passed on - confirm that
        # plot_learning_curve relies on it before removing this call.
        plt.subplots(3, 1, figsize=(10, 15))
        # Cross validation with 100 iterations to get smoother mean test
        # and train score curves, each time with 20% data randomly selected
        # as a validation set.
        cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
        lc = plot_learning_curve(estimator, title, xtrain_val, ytrain_val,
                                 cv=cv, n_jobs=-1)
        lc.savefig(os.path.join(fig_dir, filename))

    def _sweep(flag, swept, **fixed):
        """Train the candidates for one flag and pick one via knnlearner.test.

        swept names the argument that receives the values returned by
        knnlearner.train; every other argument is "NA" unless given in fixed.
        """
        clfs, values = knnlearner.train(xtrain_val, ytrain_val, flag)
        args = {
            'neighbor_types': "NA",
            'weight_values': "NA",
            'algorithm_types': "NA",
            'metric_types': "NA",
            'p_values': "NA",
        }
        args.update(fixed)
        args[swept] = values
        return knnlearner.test(xtest_val, xtrain_val, ytest_val, ytrain_val,
                               clfs, args['neighbor_types'],
                               args['weight_values'], args['algorithm_types'],
                               args['metric_types'], args['p_values'], flag)

    ########## Initial Learning Curves for Different Neighbor Sizes ##########
    for n_neighbors in (2, 4, 6, 8, 10):
        initial_classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
        initial_classifier.fit(xtrain_val, ytrain_val)
        _save_learning_curve(
            initial_classifier,
            "Initial Learning Curves (KNN - %d neighbors)" % n_neighbors,
            'knn_learningcurve_initial_%dneigh.png' % n_neighbors)

    # One sweep per parameter family. The results are kept under distinct
    # keys; the p sweep no longer overwrites the metric result.
    knn_choices = {
        'neighbor': _sweep(0, 'neighbor_types'),
        'weight': _sweep(1, 'weight_values'),
        'algorithm': _sweep(2, 'algorithm_types'),
        'metric': _sweep(3, 'metric_types'),
        # the p sweep runs with the metric fixed to minkowski
        'p': _sweep(4, 'p_values', metric_types=['minkowski']),
    }

    # Now that we have the knn, time for tuning hyperparameters
    # Make a new classifier for this
    clf = KNeighborsClassifier()
    clf.fit(xtrain_val, ytrain_val)
    best_params = knnlearner.tune_hyperparameters(clf, xtrain_val, ytrain_val)
    print("Best params are: ", best_params)

    # Now do one more fit based on best params above
    final_classifier = KNeighborsClassifier(
        n_neighbors=best_params['n_neighbors'],
        weights=best_params['weights'],
        algorithm=best_params['algorithm'],
        metric=best_params['metric'],
        p=best_params['p'])
    final_classifier.fit(xtrain_val, ytrain_val)

    _save_learning_curve(final_classifier, "Learning Curves (KNN)",
                         'knn_learningcurve.png')

    # Now time for final accuracy score for test set
    knnlearner.final_test(final_classifier, xtest, ytest)

    print(datetime.now() - start)
Exemplo n.º 25
0
def main():
    """Compare training and query time of KNNLearner and kdtknn on ripple data.

    The first 60% of the rows of data-ripple-prob.csv (file order) are the
    training set and the rest the test set; the last column is Y. For each
    method ('mean', 'median') and each k in 1..MAXK both learners are
    trained and queried, and one row of statistics is stored:

        column 0: k
        column 1: KNNLearner training time per training instance
        column 2: KNNLearner query time per test instance
        column 3: kdtknn training time per training instance
        column 4: kdtknn query time per test instance

    One PDF per timing column is saved, plotting the value against k for
    the mean (red) and median (blue) methods.
    """
    trainpercent = 60
    methods = ['mean', 'median']

    #read data from data file
    data = np.loadtxt('data-ripple-prob.csv', delimiter=',')
    # Integer floor division keeps the slice bound an int. math.floor()
    # returns a float on Python 2, which NumPy rejects as an index.
    trainsize = data.shape[0] * trainpercent // 100

    #split data into train and test sets
    Xtrain = data[:trainsize, :-1]
    Ytrain = data[:trainsize, -1]
    Xtest = data[trainsize:, :-1]

    MAXK = 30
    NUMCOLS = 5

    def _time_per_instance(learner):
        """Return (training, query) time per instance for learner."""
        trainstarttime = dt.datetime.now()
        learner.addEvidence(Xtrain, Ytrain)
        trainendtime = dt.datetime.now()

        teststarttime = dt.datetime.now()
        learner.query(Xtest)  # only the elapsed time is of interest
        testendtime = dt.datetime.now()

        return (gettotalseconds(trainstarttime, trainendtime) / Xtrain.shape[0],
                gettotalseconds(teststarttime, testendtime) / Xtest.shape[0])

    statsbymethod = {}
    for method in methods:
        # builtin float: the np.float alias was removed from NumPy
        stats = np.zeros((MAXK, NUMCOLS), dtype=float)

        for k in range(1, MAXK + 1):
            stats[k - 1, 0] = k
            stats[k - 1, 1], stats[k - 1, 2] = _time_per_instance(
                KNNLearner(k, method))
            stats[k - 1, 3], stats[k - 1, 4] = _time_per_instance(
                kdtknn(k, method))

        statsbymethod[method] = stats

    meanstats = statsbymethod['mean']
    medianstats = statsbymethod['median']

    #Graphs of time per instance versus k, one per timing column
    timedelta = 0.001  # margin added below and above the plotted range
    outputfilenames = [
        'mytraining.pdf', 'myquery.pdf', 'kdtknntraining.pdf',
        'kdtknnquery.pdf'
    ]
    titles = [
        'mytrainingtime/instance', 'myquerytime/instance',
        'kdtknntrainingtime/instance', 'kdtknnquerytime/instance'
    ]
    for index in range(1, NUMCOLS):
        plt.cla()
        plt.clf()
        plt.plot(meanstats[:, 0], meanstats[:, index], color='r')
        plt.plot(medianstats[:, 0], medianstats[:, index], color='b')
        plt.legend(('method=mean', 'method=median'), loc='upper right')
        plt.ylabel(titles[index - 1])
        plt.xlabel('k')
        lowest = min(meanstats[:, index].min(), medianstats[:, index].min())
        highest = max(meanstats[:, index].max(), medianstats[:, index].max())
        plt.ylim(lowest - timedelta, highest + timedelta)
        plt.savefig(outputfilenames[index - 1], format='pdf')
Exemplo n.º 26
0
    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    total = 15
    in_sample_error = []
    out_sample_error = []
    k_values = []

    for i in range(total):
        # print "KNNLearner"
        k_values.append(i + 1)
        learner = knn.KNNLearner(k=i + 1)  # create a KNNLearner

        learner.addEvidence(trainX, trainY)  # train it

        # evaluate in sample
        predY = learner.query(trainX)  # get the predictions
        rmse = math.sqrt(((trainY - predY)**2).sum() / trainY.shape[0])
        # print
        # print "In sample results"
        # print "RMSE: ", rmse
        in_sample_error.append(rmse)
        c = np.corrcoef(predY, y=trainY)
        # print "corr: ", c[0, 1]

        # evaluate out of sample
        predY = learner.query(testX)  # get the predictions
Exemplo n.º 27
0
    #generate plot for current price, train price and predict price
    df2['pred'] = predY
    df2['fv'] = df2[symb] * (1 + df2['fr'])
    df2['pv'] = df2[symb] * (1 + df2['pred'])
    plot_compare(df2[symb], df2['fv'], df2['pv'])

    #use predict data to execute orders and plot long, short, exit as vertical lines
    le, se, s = operations(df2, symb)
    plot_trade(df2[symb], symb, le, se, s)

    #computer portvals plot backtest results
    portvals = compute_portvals("orders.csv", start_val=10000)
    norm_SPY = df2['SPY'] / df2['SPY'][0] * 10000
    plot_bt(portvals, symb, norm_SPY)

    #analysis of portfolio
    cr, adr, sddr, sr = analysis(portvals)
    print cr, adr, sddr, sr, portvals[-1]


# Choose the learner, the stock symbol, the training start and end dates and
# the test start and end dates.
# In-sample test: use the same start and end dates for training and testing.
# Out-of-sample test: use different start and end dates for testing.
# The call below is an in-sample run: both date ranges are identical.
test(learner=knn.KNNLearner(3),
     symb='IBM',
     train_sd=dt.datetime(2007, 12, 31),
     train_ed=dt.datetime(2009, 12, 31),
     test_sd=dt.datetime(2007, 12, 31),
     test_ed=dt.datetime(2009, 12, 31))
Exemplo n.º 28
0
    test_rows = data.shape[0] - train_rows

    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]

    in_rmse_k = []
    in_corr_k = []
    out_rmse_k = []
    out_corr_k = []
    model = 'knn'  #bag
    print 'shape of datatset:', data.shape
    for i in range(2, 21):
        learner = knn.KNNLearner(k=i, verbose=True)  # create a knnLearner
        # learner = bl.BagLearner(learner=knn.KNNLearner,
        #                        kwargs={"k": 5}, bags=20, boost=False, verbose=False)
        learner.addEvidence(trainX, trainY)  # train it

        predY_train = learner.query(trainX)  # get the predictions

        #get in sample stats
        in_rmse_k.append(
            math.sqrt(((trainY - predY_train)**2).sum() / trainY.shape[0]))
        c_in = np.corrcoef(predY_train, y=trainY)
        in_corr_k.append(c_in[0, 1])

        # get out of sample stats
        predY_test = learner.query(testX)  # get the predictions
        out_rmse_k.append(
def testlearner():
	'''
	Compare the random forest learner with the KNN learner for k = 1..100.

	Both CSV data sets are split in file order: the first 60% of the rows
	train the learners and the remaining rows are used for queries.

	Random forest table (one column per k):
		row 0: k
		row 1: training time in seconds
		row 2: query time in seconds
		row 3: RMSE on the test rows
		row 4: correlation coefficient of predicted vs actual test Y
	KNN table (one column per k):
		row 0: RMSE on the test rows
		row 1: correlation coefficient of predicted vs actual test Y

	Four PDF plots (RMSE and correlation for each data set) are saved in
	the working directory.
	'''
	train_fraction = 0.6
	max_k = 100

	def _split(features, targets):
		# Returns (Xtrain, Ytrain, Xtest, Ytest); the Y parts are copied
		# into single-column float arrays.
		n_rows = features.shape[0]
		n_train = int(n_rows * train_fraction)
		y_train = np.zeros([n_train, 1])
		y_train[:, 0] = targets[0:n_train]
		y_test = np.zeros([n_rows - n_train, 1])
		y_test[:, 0] = targets[n_train:n_rows]
		return (features[0:n_train, :], y_train,
				features[n_train:n_rows, :], y_test)

	def _evaluate(forest, neighbours, parts, forest_table, knn_table, col):
		# Trains and queries both learners on one data set and fills
		# column `col` of both tables. Only the forest is timed.
		x_train, y_train, x_test, y_test = parts

		began = time.time()
		forest.addEvidence(x_train, y_train)
		ended = time.time()
		forest_table[1][col] = ended - began

		neighbours.addEvidence(x_train, y_train)

		began = time.time()
		forest_pred = forest.query(x_test)
		ended = time.time()
		forest_table[2][col] = ended - began

		knn_pred = neighbours.query(x_test)

		forest_table[3][col] = RMSE(forest_pred, y_test)
		knn_table[0][col] = RMSE(knn_pred, y_test)

		forest_table[4][col] = np.corrcoef(forest_pred.T, y_test.T)[0][1]
		knn_table[1][col] = np.corrcoef(knn_pred.T, y_test.T)[0][1]

	def _comparison_plot(title, ks, forest_values, knn_values, legend_loc,
			ylabel, filename):
		# Random forest (red) and KNN (blue) values against k, saved as PDF.
		plt.clf()
		fig = plt.figure()
		fig.suptitle(title)
		plt.plot(ks, forest_values, 'r', label = 'Random Forest')
		plt.plot(ks, knn_values, 'b', label = 'KNN')
		plt.legend(loc = legend_loc)
		plt.xlabel('K')
		plt.ylabel(ylabel)
		fig.savefig(filename, format = 'pdf')

	Xdcp, Ydcp = _csv_read("data-classification-prob.csv")
	Xdrp, Ydrp = _csv_read("data-ripple-prob.csv")
	dcp = _split(Xdcp, Ydcp)
	drp = _split(Xdrp, Ydrp)

	DT_dcp_result = np.zeros([5, max_k]) # random forest, classification data
	DT_drp_result = np.zeros([5, max_k]) # random forest, ripple data
	KNN_dcp_result = np.zeros([2, max_k]) # KNN, classification data
	KNN_drp_result = np.zeros([2, max_k]) # KNN, ripple data

	for k in range(1, max_k + 1):
		col = k - 1
		RFL = RandomForestLearner(k)
		KNN_lner = KNNLearner(k)

		DT_dcp_result[0][col] = k
		DT_drp_result[0][col] = k

		# The same two learner objects are trained on the classification
		# data first and on the ripple data afterwards.
		_evaluate(RFL, KNN_lner, dcp, DT_dcp_result, KNN_dcp_result, col)
		_evaluate(RFL, KNN_lner, drp, DT_drp_result, KNN_drp_result, col)

	_comparison_plot('RMS Error of Classification data test',
			DT_dcp_result[0, :], DT_dcp_result[3, :], KNN_dcp_result[0, :],
			1, 'RMS Error', 'classification-RMSE.pdf')
	_comparison_plot('Correlation Coefficient of Classification data test',
			DT_dcp_result[0, :], DT_dcp_result[4, :], KNN_dcp_result[1, :],
			4, 'Correlation Coefficient', 'classification-Corr.pdf')
	_comparison_plot('RMS Error of Ripple data test',
			DT_drp_result[0, :], DT_drp_result[3, :], KNN_drp_result[0, :],
			2, 'RMS Error', 'ripple-RMSE.pdf')
	_comparison_plot('Correlation Coefficient of Ripple data test',
			DT_drp_result[0, :], DT_drp_result[4, :], KNN_drp_result[1, :],
			3, 'Correlation Coefficient', 'ripple-Corr.pdf')
def _build_indicator_frame(prices, dates, stock):
    """Assemble the indicator/target frame that ``run`` hands to the learner.

    The columns are created in this order: ``actual_prices``, ``bb_value``,
    ``momentum``, ``volatility``, ``y_values``.  ``run`` slices by position
    (everything but the last column is X, the last column is Y), so the
    order must not change.

    Args:
        prices: frame returned by ``get_data``; only ``prices[stock]`` is read.
        dates: index used for the resulting frame.
        stock: column name of the symbol inside ``prices``.

    Returns:
        A DataFrame indexed by ``dates`` with every row that has no
        ``actual_prices`` value removed.
    """
    series = prices[stock]
    frame = pd.DataFrame(index=dates)
    frame['actual_prices'] = series
    # Distance from the 5-row rolling mean, scaled by twice the rolling std.
    frame['bb_value'] = series - pd.rolling_mean(series, window=5)
    frame['bb_value'] = frame['bb_value'] / (
        pd.rolling_std(series, window=5) * 2)
    # NOTE(review): a negative shift pulls values from *later* rows, so
    # momentum compares each price with the price 5 rows ahead (the same
    # value stored in y_values) and volatility uses next-row returns.
    # Confirm this look-ahead is intended before trusting the results.
    frame['momentum'] = (series / series.shift(periods=-5)) - 1
    frame['volatility'] = pd.rolling_std(
        (series / series.shift(periods=-1)) - 1, window=5)
    # Target: the price 5 rows ahead.
    frame['y_values'] = series.shift(periods=-5)
    return frame.dropna(subset=['actual_prices'])


def run():
    """Train a KNN learner on 2008-2009 prices and simulate on 2010.

    The stock defaults to IBM.  If a command-line argument is given it is
    used as the symbol, provided ``data/<symbol>.csv`` exists; otherwise a
    message is printed and the function returns without doing anything.

    Orders are written to ``Unit3/orders/orders_trainingdata.csv`` and
    ``Unit3/orders/orders_testdata.csv`` by ``run_simulation`` and then
    evaluated by ``calculate_portfolio_value``.

    Returns:
        None.
    """
    # Default parameters
    start_date = '2008-01-01'
    end_date = '2009-12-31'
    start_test_date = '2010-01-01'
    end_test_date = '2010-12-31'
    stock = 'IBM'

    # Optional symbol override from the command line
    if len(sys.argv) > 1:
        file_path = "data/" + sys.argv[1] + ".csv"
        # isfile() is already False for a missing path, so one check suffices.
        if not os.path.isfile(file_path):
            print('Data for the stock specified does not exist. Please reference stocks in the data folder, or run with no option provided (will display IBM data by default)')
            return
        stock = sys.argv[1]

    dates = pd.date_range(start_date, end_date)
    test_dates = pd.date_range(start_test_date, end_test_date)

    # Read in the price data for both periods
    prices_all = get_data([stock], dates)  # automatically adds SPY
    test_prices_all = get_data([stock], test_dates)

    # Training set: the first 4 rows are skipped because the 5-row rolling
    # windows have no value there yet.
    data = _build_indicator_frame(prices_all, dates, stock)
    trainX = data.iloc[4:, 0:-1]
    trainY = data.iloc[4:, -1]

    # Test set: every row is kept (no warm-up rows are dropped here).
    test_data = _build_indicator_frame(test_prices_all, test_dates, stock)
    testX = test_data.iloc[:, 0:-1]
    testY = test_data.iloc[:, -1]

    # Create a KNN learner for the data and add evidence to it
    learner = knn.KNNLearner(3)
    learner.addEvidence(trainX, trainY)

    # Simulate the trading strategy on the data the learner was trained on
    print("\nTraining Data Results:")
    run_simulation(learner, prices_all, stock, trainX, trainY, dates,
                   "Unit3/orders/orders_trainingdata.csv")
    calculate_portfolio_value("Unit3/orders/orders_trainingdata.csv",
                              prices_all, dates, stock)

    # Simulate on previously unseen data to test the strategy's performance
    print("\nTest Data Results:")
    run_simulation(learner, test_prices_all, stock, testX, testY, test_dates,
                   "Unit3/orders/orders_testdata.csv")
    calculate_portfolio_value("Unit3/orders/orders_testdata.csv",
                              test_prices_all, test_dates, stock)
Exemplo n.º 31
0
def TestKNN(filename, k=3, draw=0):
    reader = csv.reader(open(filename, 'rU'), delimiter=',')
    learner = KNN.KNNLearner(k)
    i = 0
    indata = None
    for row in reader:
        i = i + 1
        temp = numpy.zeros([1, 3])
        i = 0
        for elements in row:
            temp[0][i] = string.atof(elements)
            i = i + 1
        if indata is None:
            indata = temp
        else:
            indata = numpy.append(indata, temp, axis=0)
    start = time.clock()
    learner.addEvidence(indata[0:split])
    traintime = (time.clock() - start)
    print "Train time is ", traintime
    start = time.clock()
    yfitted = numpy.zeros([400])
    for i in range(600, 1000):
        yfitted[i - 600] = learner.query(indata[i])
    querytime = (time.clock() - start) / 400
    print "Query time is ", querytime
    cormat = numpy.corrcoef(indata[600:1000, 2], yfitted)
    print "Correlation coefficient of out sample data is \n", cormat[0][1]

    dif = yfitted - indata[600:1000, 2]
    RMS = 0
    for err in dif:
        RMS = RMS + err * err
    RMS = numpy.sqrt(RMS / 400)
    print "RMS of out sample data is ", RMS

    ytfitted = numpy.zeros([600])
    for i in range(0, 600):
        ytfitted[i] = learner.query(indata[i])
    cormatoft = numpy.corrcoef(indata[0:600, 2], ytfitted)
    print "Correlation coefficient of in sample data is \n", cormatoft[0][1]
    dif = ytfitted - indata[0:600, 2]
    RMSt = 0
    for err in dif:
        RMSt = RMSt + err * err
    RMSt = numpy.sqrt(RMSt / 600)
    print "RMS of in sample data is ", RMSt
    if (draw == 1):
        xax = numpy.zeros([400])
        for i in range(600, 1000):
            xax[i - 600] = i
        plt.plot(xax, yfitted, 'ro')
        plt.plot(xax, indata[600:1000, 2], 'bo')
        plt.show()

        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(indata[600:1000, 0],
                   indata[600:1000, 1],
                   indata[600:1000, 2],
                   c='b')
        ax.scatter(indata[600:1000, 0], indata[600:1000, 1], yfitted[:], c='r')
        plt.show()

    return k, traintime, querytime, cormat[0][1], cormatoft[0][1], RMS, RMSt
Exemplo n.º 32
0
# Script fragment: trains a learner on indicator frames and builds
# price-scaled frames of predicted and training Y values.
# `l`, `symbol`, `dates`, `momentumDF`, `volatilityDF` and
# `fiveDayPriceChange` must already be bound earlier in the script.
bollingerBandDf = l.getBollingerBandVAlue(symbol, dates, volatilityDF)

# Statistics are collected before the three frames are normalized below.
stats = l.getStats(momentumDF, volatilityDF, bollingerBandDf)

bollingerBandDf = l.normalizeDataFrame(bollingerBandDf)
momentumDF = l.normalizeDataFrame(momentumDF)
volatilityDF = l.normalizeDataFrame(volatilityDF)

# Prices for the symbol alone (no SPY column), rows containing NaN dropped.
unalteredPrices = util.get_data([symbol], dates, addSPY=False).dropna()
# prepareTrainXandY hands back replacement values for fiveDayPriceChange and
# unalteredPrices along with trainX / trainY; both names are rebound here.
# NOTE(review): presumably it aligns all inputs to a common set of rows --
# confirm in `l`.
fiveDayPriceChange, trainX, trainY, unalteredPrices = l.prepareTrainXandY(
    bollingerBandDf, fiveDayPriceChange, momentumDF, unalteredPrices,
    volatilityDF, symbol)

# To use linear regression instead, uncomment the LinRegLearner line below
# and comment out the KNNLearner line.
# learner = lrl.LinRegLearner(verbose = True) # create a LinRegLearner
learner = knn.KNNLearner(2, verbose=True)  # create a knn learner (first argument presumably k -- confirm)
learner.addEvidence(trainX, trainY)  # train it

predictedYFromTraining = learner.query(
    trainX
)  # in-sample predictions: the learner is queried with its own training X
yPredictedDF = pd.DataFrame(predictedYFromTraining,
                            index=fiveDayPriceChange.index)
# `.values` drops the index, so these products are positional rather than
# label-aligned; the results carry unalteredPrices' index and columns.
# Assumes the arrays and unalteredPrices have the same number of rows --
# TODO confirm.
yPredTimesPriceDF = yPredictedDF.values * unalteredPrices
fiveDayPrices = fiveDayPriceChange.values * unalteredPrices
# Assigning a one-name list requires each frame to have exactly one column.
yPredTimesPriceDF.columns = ['Predicted Y']
fiveDayPrices.columns = ['Y Train']
symbols = [symbol]
# Prices are re-read here, replacing the frame returned by prepareTrainXandY.
unalteredPrices = util.get_data(symbols, dates, addSPY=False)
unalteredPrices = unalteredPrices.dropna()
# Divide every row by the first row so each column starts at 1.0.
# NOTE(review): `.ix` is deprecated and removed in newer pandas releases;
# `.iloc[0, :]` looks like the intended positional equivalent -- confirm.
normalizedDailyPrices = unalteredPrices / unalteredPrices.ix[0, :]
def main():
  trainpercent = 60
  isRandomSplit = False

  filenames = ['data-classification-prob.csv', 'data-ripple-prob.csv']
  outputfilenames = ['plot1.pdf', 'plot2.pdf']
  trainfilenames = ['traintime1.pdf', 'traintime2.pdf']
  testfilenames = ['testtime1.pdf', 'testtime2.pdf']
  methods = ['mean','median']
  
  for index in range(2):
    #read data from data file
    input = np.loadtxt(filenames[index], delimiter=',')
    trainsize = math.floor(input.shape[0]*trainpercent/100)
  
    #split data into train and test sets 
    Xtrain = input[0:trainsize,:-1]
    Ytrain = input[0:trainsize,-1]
    Xtest = input[trainsize:,:-1]
    Ytest = input[trainsize:,-1]
  
    MAXK = 300
    NUMCOLS = 4
    
    meanstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
    medianstats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
    
    avgtraintime = -1
    avgtesttime = -1
  
    for method in methods:
      stats = np.zeros((MAXK, NUMCOLS), dtype=np.float)
      bestcorr = -1000
      bestK = -1
      
      for k in range(1, MAXK+1):
        #instantiate learner and test
        learner = KNNLearner(k, method)
      
        #get start time
        trainstarttime = dt.datetime.now()
        learner.addEvidence(Xtrain, Ytrain)
        #get end time and print total time for adding evidnece
        trainendtime = dt.datetime.now()
      
        #get start time
        teststarttime = dt.datetime.now()
        Y = learner.query(Xtest)
        #get end time and print total time for testing
        testendtime = dt.datetime.now()
    
        #compute corrcoef
        corr = np.corrcoef(Ytest.T, Y.T)
        if corr[0,1] > bestcorr:
          bestcorr = corr[0,1]
          bestK = k
      
        stats[k-1, 0] = k
        stats[k-1, 1] = corr[0,1]
        #The total_seconds() method works in python >= 2.7
        #stats[k-1, 2] = (trainendtime - trainstarttime).total_seconds()/Xtrain.shape[0]
        #stats[k-1, 3] = (testendtime - teststarttime).total_seconds()/Xtest.shape[0]
        stats[k-1, 2] = gettotalseconds(trainstarttime, trainendtime)/Xtrain.shape[0]
        stats[k-1, 3] = gettotalseconds(teststarttime, testendtime)/Xtest.shape[0]
      
        if k == 3 and method == 'mean':
          avgtraintime = stats[k-1,2]
          avgtesttime = stats[k-1,3]

      print 'File:%s Method:%s BestCorrelation:%f K corresponding to best correlation:%f AvgTrainTimeForK3Mean :%f seconds AvgTestTimeForK3Mean:%f seconds'%(filenames[index], method, bestcorr, bestK, avgtraintime, avgtesttime)
    
      if method == 'median':
        medianstats = stats.copy()
      else: 
        meanstats = stats.copy()
    
    timedelta = 1

    #Graph for k versus corrcoef
    plt.cla()
    plt.clf()
    plt.plot(meanstats[:,0], meanstats[:,1], color='r')
    plt.plot(medianstats[:,0], medianstats[:,1], color='b')
    plt.legend(('method=mean', 'method=median'), loc='upper right')
    plt.ylabel('Correlation Coefficient')
    plt.xlabel('k')
    plt.savefig(outputfilenames[index],format='pdf')