Example #1
    def bestBagLearner(self):
        """
		The method builds several BagLearner models based on small training set. 
		The base model in the BagLearner is the KNN learner.
		The k in KNN learner is obtained by the method self.bestKNN()
		These BagLearner models have different n_learners parameter, which denotes number
		of base learners in the bag. 
		The optimal model with the optimal n_learners will be obtained based on validation set.

		@return: optimal integer n_learners, which denotes the number of learners in the bag

		The choices of n_learners are 20, 30, 40, ..., 100
		"""
        k = self.bestKNN()[0]
        optimal_n = 20
        optimal_mape = float("inf")  # any real MAPE will replace this sentinel
        for n_learners in range(20, 101, 10):
            bag = BagLearner(learner=KNNLearner,
                             kwargs={"k": k},
                             n_learners=n_learners)
            bag.fit(self.small_trainX_normed, self.small_trainY)
            predicted_roc = bag.predict(self.validationX_normed)
            predicted_prices = self.validationPrices * (1 + predicted_roc)
            mape = np.mean(
                np.abs(predicted_prices - self.validationFuturePrices) /
                self.validationFuturePrices)
            if mape < optimal_mape:
                optimal_mape = mape
                optimal_n = n_learners

        return optimal_n, optimal_mape
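A plausible follow-up (not part of the original snippet) is to refit a single BagLearner with the selected n_learners; the full-training-set attribute names below are assumptions.

    def buildFinalBag(self):
        # Hypothetical: retrain one BagLearner with the tuned parameters.
        # self.trainX_normed / self.trainY are assumed names for the full training set.
        k = self.bestKNN()[0]
        n_learners, _ = self.bestBagLearner()
        bag = BagLearner(learner=KNNLearner, kwargs={"k": k}, n_learners=n_learners)
        bag.fit(self.trainX_normed, self.trainY)
        return bag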
Example #2
def run_exp_2():
    trainX, trainY, testX, testY = read_and_prepare_file()
    train_RMSE_1 = []
    test_RMSE_1 = []
    train_RMSE_2 = []
    test_RMSE_2 = []
    size_range = np.insert(np.arange(5, 101, 5), 0, 1)
    for size in size_range:
        learner_1 = bl.BagLearner(learner=dtl.DTLearner,
                                  kwargs={'leaf_size': size},
                                  bags=10)
        learner_2 = bl.BagLearner(learner=dtl.DTLearner,
                                  kwargs={'leaf_size': size},
                                  bags=5)
        learner_1.addEvidence(trainX, trainY)
        learner_2.addEvidence(trainX, trainY)

        predY = learner_1.query(trainX)
        rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
        train_RMSE_1.append(rmse)
        predY = learner_1.query(testX)
        rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
        test_RMSE_1.append(rmse)

        predY = learner_2.query(trainX)
        rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
        train_RMSE_2.append(rmse)
        predY = learner_2.query(testX)
        rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
        test_RMSE_2.append(rmse)

    plt.figure(figsize=(10, 5))
    plt.title('Bag with DT: RMSE vs leaf_size (10 bags)')
    plt.xlabel('leaf_size')
    plt.ylabel('RMSE')
    plt.plot(size_range, train_RMSE_1, '.-', label='train_RMSE')
    plt.plot(size_range, test_RMSE_1, '.-', label='test_RMSE')
    plt.xticks(size_range)
    plt.grid()
    plt.legend()
    plt.savefig('exp2_fig1.png')
    # plt.show()

    plt.figure(figsize=(10, 5))
    plt.title('Bag with DT: RMSE vs leaf_size (5 bags)')
    plt.xlabel('leaf_size')
    plt.ylabel('RMSE')
    plt.plot(size_range, train_RMSE_2, '.-', label='train_RMSE')
    plt.plot(size_range, test_RMSE_2, '.-', label='test_RMSE')
    plt.xticks(size_range)
    plt.grid()
    plt.legend()
    plt.savefig('exp2_fig2.png')
Example #3
 def __init__(self, verbose=False, impact=0.0):
     self.verbose = verbose
     self.impact = impact
     self.learner = bl.BagLearner(learner=rt.RTLearner,
                                  kwargs={"leaf_size": 5},
                                  bags=20,
                                  boost=False,
                                  verbose=False)
     self.learner2 = bl.BagLearner(learner=dt.DTLearner,
                                   kwargs={"leaf_size": 5},
                                   bags=20,
                                   boost=False,
                                   verbose=False)
Example #4
def experiment_2(train_x, train_y, test_x, test_y, arbitrary_learner=dtl.DTLearner, low=1, high=100, bags=20, **kwargs):
    insample_rmse=[]
    outsample_rmse = []
    leaf_sizes = list(range(low, high+1))
    np.random.seed(987654321)
    for i in leaf_sizes:
        learner = bl.BagLearner(learner=arbitrary_learner, kwargs={"leaf_size": i, **kwargs}, bags=bags, verbose=False)
        learner.add_evidence(train_x, train_y)

        pred_y = learner.query(train_x)
        insample_rmse.append(math.sqrt(((train_y - pred_y) ** 2).sum() / train_y.shape[0]))

        pred_y = learner.query(test_x)
        outsample_rmse.append(math.sqrt(((test_y - pred_y) ** 2).sum() / test_y.shape[0]))
    plt.figure(3)
    plt.plot(leaf_sizes, insample_rmse, color='tab:blue', label="insample")
    plt.plot(leaf_sizes, outsample_rmse, color='tab:orange', label="outsample")
    plt.xlabel("Leaf Size")
    plt.ylabel("RMSE")
    plt.title("Effect of Bag Learner on overfitting.")
    plt.legend()
    plt.savefig("experiment2a.png")

    plt.figure(4)
    plt.plot(leaf_sizes, insample_rmse, color='tab:blue', label="insample")
    plt.plot(leaf_sizes, outsample_rmse, color='tab:orange', label="outsample")
    plt.xlabel("Leaf Size")
    plt.ylabel("RMSE")
    plt.title("Effect of Bag Learner on overfitting.")
    plt.xlim(5, 12)
    plt.legend()
    plt.savefig("experiment2b.png")
Example #5
    def addEvidence(self,
                    symbol="AAPL",
                    sd=dt.datetime(2008, 1, 1),
                    ed=dt.datetime(2009, 12, 31),
                    sv=100000):
        leaf_size, bags, days, yBuy, ySell = 5, 20, 10, 0.04, -0.04

        prices = get_data([symbol], pd.date_range(sd, ed))[[symbol]]
        nDayReturns = (prices.shift(-days) / prices) - 1.0

        # Create x data for training by combining price/SMA and Bollinger calculations
        # Decided not to use momentum since it is not very reliable in predicting price trends
        dataX = pd.concat(
            [prices / sma(prices), bb(prices)], axis=1)[:-days].values

        # Every entry in y data is -1, 0, or 1 based on how the associated nDayReturns value compares to the buy/sell thresholds
        dataY = []
        for index, row in nDayReturns.iterrows():
            entry = row[nDayReturns.columns.values[0]]
            dataY.append(1.0 if entry > (self.impact + yBuy) else (
                -1.0 if entry < (ySell - self.impact) else 0.0))

        self.learner = bl.BagLearner(rt.RTLearner, {'leaf_size': leaf_size},
                                     bags, False, False)

        self.learner.addEvidence(dataX, np.asarray(dataY))
Example #6
    def __init__(self, verbose=False, impact=0.0, flag=0):
        self.verbose = verbose
        self.impact = impact
        self.ybuy = 0.001 + self.impact
        self.ysell = -0.001 - self.impact
        self.lookback = 50
        self.N = 30
        self.leaf_size = 10
        self.bags = 15

        if flag == 1:  # exp 1
            self.ybuy = 0.001 + self.impact
            self.ysell = -0.001 - self.impact
            self.lookback = 50
            self.N = 30
            self.leaf_size = 10
            self.bags = 15

        if flag == 2:  # exp 2
            self.ybuy = 0.002 + self.impact
            self.ysell = -0.002 - self.impact
            self.lookback = 50
            self.N = 10
            self.leaf_size = 20
            self.bags = 15

        self.baggy = bl.BagLearner(learner=rt.RTLearner, \
            kwargs={'leaf_size':self.leaf_size}, bags=self.bags)
Example #7
 def query(self, xTest):
     preds = []
     for bl in self.bag_learners:
         pred = bl.query(xTest)
         preds.append(pred)
     preds = np.mean(np.asarray(preds), axis=0)
     return preds
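For context, a minimal, hypothetical bagger showing how the self.bag_learners list queried above is typically built: each base learner is trained on a bootstrap sample (rows drawn with replacement), and predictions are averaged. The class name SimpleBagLearner and its constructor are illustrative, not the API of any repo above.

import numpy as np

class SimpleBagLearner:
    def __init__(self, learner, kwargs=None, bags=20):
        kwargs = kwargs or {}
        self.bag_learners = [learner(**kwargs) for _ in range(bags)]

    def addEvidence(self, xTrain, yTrain):
        n = xTrain.shape[0]
        for lrn in self.bag_learners:
            idx = np.random.choice(n, size=n, replace=True)  # bootstrap sample of rows
            lrn.addEvidence(xTrain[idx], yTrain[idx])

    def query(self, xTest):
        # average the per-learner predictions, exactly as in the query method above
        return np.mean([lrn.query(xTest) for lrn in self.bag_learners], axis=0)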
Example #8
def problem_2(max_bag_size, fixed_leaf_size, file_path, output_file):
    # Get data.
    trainX, trainY, testX, testY = get_data(file_path)

    # Build a comma separated result of Bag Size, in sample RMSE, out of sample RMSE.
    output = open(output_file + '.csv', 'w')
    output.write('Bag Size,In Sample RMSE,Out of Sample RMSE\n')

    # What bag size does overfitting occur?
    for i in range(1, max_bag_size + 1):
        print('Bag size: ' + str(i))
        # Create a learner and train it.
        learner = bl.BagLearner(learner=rtl.RTLearner,
                                kwargs={"leaf_size": fixed_leaf_size},
                                bags=i,
                                boost=False,
                                verbose=False)

        # Train the learner.
        learner.addEvidence(trainX, trainY)
        #print learner.author()

        # Test it and get error metrics.
        is_rmse, os_rmse = evaluate_samples(learner, trainX, trainY, testX,
                                            testY)

        # Append to output.
        output.write(str(i) + ',' + str(is_rmse) + ',' + str(os_rmse) + '\n')

    # Save output.
    output.close()
Example #9
def experiment2(trainX, trainY, testX, testY):
    """
    Overfitting with respect to leaf_size on DTLearner with bagging
    Plot RMSE for different leaf_sizes
    """
    insample_rmse = []
    outsample_rmse = []
    leaf = []
    for leaf_size in range(1, 100):
        leaf.append(leaf_size)
        learner = bl.BagLearner(dt.DTLearner, kwargs={"leaf_size": leaf_size}, verbose=True)
        learner.addEvidence(trainX, trainY)

        predY = learner.query(trainX)  # get the predictions
        rmse = math.sqrt(((trainY - predY) ** 2).sum() / trainY.shape[0])
        insample_rmse.append(rmse)

        predY = learner.query(testX)  # get the predictions
        rmse = math.sqrt(((testY - predY) ** 2).sum() / testY.shape[0])
        outsample_rmse.append(rmse)

    plt.figure()
    plt.plot(leaf, insample_rmse, 'c', label="insample")
    plt.plot(leaf, outsample_rmse, 'g', label="outsample")
    plt.legend()
    plt.xlim(0, 100)

    plt.xlabel("leaf_size")
    plt.ylabel("RMSE")
    plt.title("Experiment2")
    plt.savefig("Experiment2.png")
Example #10
 def __init__(self, verbose):
     self.verbose = verbose
     self.learner = bag.BagLearner(learner=lrl.LinRegLearner,
                                   kwargs={},
                                   bags=20,
                                   boost=False,
                                   verbose=False)
Example #11
 def __init__(self, verbose=False):
     self.learner = bgl.BagLearner(bgl.BagLearner,
                                   kwargs={
                                       "learner": lrl.LinRegLearner,
                                       "bags": 20
                                   },
                                   bags=20)
Example #12
 def __init__(self, verbose=False):
     self.learners = []
     self.verbose = verbose
     self.num = 20
     for i in range(self.num):
         self.learners.append(
              bl.BagLearner(lrl.LinRegLearner, {}, 20, False, False))
Example #13
 def __init__(self, leaf_size=1, verbose=True):
     for i in range(20):
         self.learner = bl.BagLearner(learner=lrl.LinRegLearner,
                                      kwargs={},
                                      bags=20,
                                      boost=False,
                                      verbose=False)
Example #14
def predict_outsamp(X_trn, y_trn, X_tst, y_tst, symbol, start_date, end_date, k):
    # create a linear regression learner and train it
    lrlearner = lrl.LinRegLearner() # create a LinRegLearner
    lrlearner.addEvidence(X_trn, y_trn) # train it
    ytst_lr = lrlearner.query(X_tst)

    # create a KNN learner and train it
    knnlearn = knn.KNNLearner(k) # constructor
    knnlearn.addEvidence(X_trn, y_trn) # training step
    ytst_knn = knnlearn.query(X_tst)    

    # create a Bag learner and train it    
    baglearn = bl.BagLearner(learner = knn.KNNLearner, kwargs = {"k":k}, bags = 100, boost = False) # constructor
    baglearn.addEvidence(X_trn, y_trn) # training step
    ytst_bag = baglearn.query(X_tst)

    # Combine all models
    combined = (ytst_lr+ytst_knn+ytst_bag)/3
    
    print ""
    print "Out of sample predictions for %s data from %s to %s"  %(symbol[0], start_date, end_date)
    print "KNN RMSE %0.4f; LinReg RMSE %0.4f; BagReg RMSE %0.4f; Combined RMSE %0.4f" %(rmse(y_tst, ytst_knn), rmse(y_tst, ytst_lr), rmse(y_tst, ytst_bag), rmse(y_tst, combined))
    print "KNN corr %0.4f; LinReg corr %0.4f; BagReg corr %0.4f" %(np.corrcoef(y_tst, ytst_knn)[0,1], np.corrcoef(y_tst, ytst_lr)[0,1], np.corrcoef(y_tst, ytst_bag)[0,1])
    print "KNN mean %0.4f; LinReg mean %0.4f; BagReg mean %0.4f" %(abs(y_tst - ytst_knn).mean(), abs(y_tst - ytst_lr).mean(), abs(y_tst - ytst_bag).mean())
    print "Actual mean 5 day change %0.4f" %abs(y_tst).mean()
    print ""
    return ytst_lr, ytst_knn, ytst_bag
Example #15
 def __init__(self, verbose=False):
     self.learner = bl.BagLearner(bl.BagLearner, {
         'learner': lrl.LinRegLearner,
         'kwargs': {},
         'bags': 20
     },
                                  bags=20)
Example #16
def run_debuging_tests():
    inf = open(sys.argv[1])
    data = np.array(
        [list(map(float, s.strip().split(','))) for s in inf.readlines()])
    # print data.shape[0]
    data = np.random.permutation(data)
    # compute how much of the data is training and testing
    np.random.shuffle(data)
    train_rows = int(0.6 * data.shape[0])
    test_rows = data.shape[0] - train_rows
    # separate out training and testing data
    trainX = data[:train_rows, 0:-1]
    trainY = data[:train_rows, -1]
    testX = data[train_rows:, 0:-1]
    testY = data[train_rows:, -1]
    # create a learner and train it
    learner = lrl.LinRegLearner(verbose=True)  # create a LinRegLearner
    run_learner(learner, trainX, trainY, testX, testY)  # training step
    learner = dt.DTLearner(leaf_size=1, verbose=False)  # constructor
    run_learner(learner, trainX, trainY, testX, testY)  # training step
    learner = rt.RTLearner(leaf_size=1, verbose=False)  # constructor
    run_learner(learner, trainX, trainY, testX, testY)
    learner = bl.BagLearner(learner=lrl.LinRegLearner,
                            kwargs={},
                            bags=10,
                            boost=False,
                            verbose=False)
    run_learner(learner, trainX, trainY, testX, testY)
    learner = il.InsaneLearner(verbose=False)
    run_learner(learner, trainX, trainY, testX, testY)
Example #17
    def addEvidence(self,
                    symbol="IBM",
                    sd=dt.datetime(2008, 1, 1),
                    ed=dt.datetime(2009, 1, 1),
                    sv=10000,
                    impact=0):
        # add your code to do learning here

        # N = 5  # Number of day returns to use
        # YSELL = -0.01
        # YBUY = 0.01

        leaf_size = 3  # Leaf size for random tree learner
        n_bags = 20  # Number of bags for baglearner

        syms = [symbol]
        prices_all = util.get_data(syms, pd.date_range(sd, ed))
        prices = prices_all[syms]

        # Calculate indicators and features
        df_X, df_Y = get_X_and_Y(prices, -0.01, 0.01, 7, self.impact)

        Xtrain = df_X.values
        Ytrain = df_Y.values

        self.learner = BagLearner.BagLearner(learner=RTLearner.RTLearner,
                                             kwargs={'leaf_size': leaf_size},
                                             bags=n_bags,
                                             boost=False)
        self.learner.addEvidence(Xtrain, Ytrain)
Example #18
 def __init__(self, verbose):
     learner_list = []
     no_of_bags = 20
     for i in range(no_of_bags):
         learner_list.append(bl.BagLearner(learner=lrl.LinRegLearner, kwargs={}, bags=20,verbose=verbose))
     self.learner_list = learner_list
     self.no_of_bags = no_of_bags
Example #19
    def addEvidence(self, symbol = "IBM", \
        sd=dt.datetime(2008,1,1), \
        ed=dt.datetime(2009,1,1), \
        sv = 10000):

        N = 10  #Number of day returns to use
        YSELL = -0.01
        YBUY = 0.01

        leaf_size = 5  #Leaf size for random tree learner
        n_bags = 10  #Number of bags for baglearner

        syms = [symbol]
        dates = pd.date_range(sd, ed)
        prices_all = ut.get_data(syms, dates)
        prices = prices_all[syms]

        #Calculate indicators and Y dataframes
        df_X = ind.IndicatorsFrame(prices)
        df_returns = ind.NDayReturns(prices, N)
        df_Y = ind.YFrame(df_returns, YSELL - self.impact, YBUY + self.impact)

        Xtrain = df_X.values
        Ytrain = df_Y.values

        self.learner = bl.BagLearner(learner=rt.RTLearner,
                                     kwargs={'leaf_size': leaf_size},
                                     bags=n_bags,
                                     boost=False,
                                     verbose=False)
        self.learner.addEvidence(Xtrain, Ytrain)
Example #20
    def addEvidence(self, symbol = "JPM", \
        sd=dt.datetime(2008,1,1), \
        ed=dt.datetime(2009,1,1), \
        sv = 10000):

        syms = [symbol]
        dates = pd.date_range(sd, ed)
        prices_all = ut.get_data(syms, dates)  # automatically adds SPY
        prices = prices_all[syms]  # only portfolio symbols
        if self.verbose: print(prices)
        self.learner = bl.BagLearner(learner=rtl.RTLearner,
                                     kwargs={
                                         "leaf_size": 5,
                                         "verbose": False
                                     },
                                     bags=12,
                                     boost=False,
                                     verbose=False)
        psma, bbp, trix = ind.indicators(sd, ed, syms, 14)
        DataX = np.hstack([np.array(psma), np.array(bbp), np.array(trix)])
        trixarray = np.array(trix)
        NANS = 0
        for i in trixarray:
            if np.isnan(i):
                NANS += 1

        N = 10
        DataY = np.array(prices)
        shiftedY = np.array(prices.shift(-N))
        DataY = shiftedY / DataY - 1
        finalX = DataX[NANS:-N, :]
        finalY = DataY[NANS:-N, :]
        self.model = self.learner.addEvidence(finalX, finalY)
        return self.model
Example #21
def training_data(train_data, test_data):

    trainX = train_data.values[:, 0:-1]
    trainY = train_data.values[:, -1]
    testX = test_data.values[:, 0:-1]
    testY = test_data.values[:, -1]

    learner = bl.BagLearner(learner=rtl.RTLearner,
                            kwargs={'leaf_size': 5},
                            bags=20,
                            verbose=False)
    learner.addEvidence(trainX, trainY)  # train it

    # evaluate in sample
    train_predY = learner.query(trainX)  # get the predictions
    #print predY
    #print len(predY)
    rmse = math.sqrt(((trainY - train_predY)**2).sum() / trainY.shape[0])

    print "In sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(train_predY, y=trainY)
    print "corr: ", c[0, 1]

    # evaluate out of sample
    test_predY = learner.query(testX)  # get the predictions
    rmse = math.sqrt(((testY - test_predY)**2).sum() / testY.shape[0])

    print "Out of sample results"
    print "RMSE: ", rmse
    c = np.corrcoef(test_predY, y=testY)
    print "corr: ", c[0, 1]
    return train_predY, test_predY
Example #22
    def addEvidence(self, symbol = "IBM", \
        sd=dt.datetime(2008,1,1), \
        ed=dt.datetime(2009,1,1), \
        sv = 10000):

        # add your code to do learning here
        self.start_date = sd - dt.timedelta(days=30)
        self.end_date = ed
        self.sv = sv
        self.symbol = []
        self.symbol.append(symbol)

        price = self.createIndicators()
        # print(price.iloc[:, 1:(price.shape[1] - 1)])

        trainX = np.array(price.iloc[:, 1:(price.shape[1] - 1)])
        trainY = np.array(price.iloc[:, (price.shape[1] - 1)])

        leaf = 5
        bag = 25
        learner = bl.BagLearner(learner=rt.RTLearner,
                                kwargs={"leaf_size": leaf},
                                bags=bag,
                                boost=False,
                                verbose=False)  # constructor
        #learner = dl.DTLearner(leaf_size = 5, verbose = False)
        learner.addEvidence(trainX, trainY)

        self.learner = learner
Example #23
 def __init__(self, verbose=False):
     self.verbose = verbose
     self.learner = bl.BagLearner(learner=bl.RTLearner,
                                  kwargs={'leaf_size': 5},
                                  bags=30,
                                  boost=False,
                                  verbose=self.verbose)
Example #24
 def __init__(self, verbose=False):
     self._learners = [
         bl.BagLearner(learner=rl.LinRegLearner,
                       kwargs={},
                       bags=20,
                       boost=False,
                       verbose=False) for _ in range(20)
     ]
Example #25
 def addEvidence(self, dataX, dataY):
     self.bagLearner = bl.BagLearner(learner=lrl.LinRegLearner,
                                     kwargs={},
                                     bags=20,
                                     boost=False,
                                     verbose=False)
     self.bagLearner.addEvidence(dataX, dataY)
     return self.bagLearner
Example #26
 def get_learner(self,
                 learner=rtl.RTLearner,
                 leaf_size=6,
                 bags=20,
                 boost=False):
     return bg.BagLearner(learner,
                          kwargs={"leaf_size": leaf_size},
                          bags=bags,
                          boost=boost)
Example #27
 def __init__(self, verbose = False, impact=0.0):
     self.verbose = verbose
     self.impact = impact
     self.N = 14
     self.LONG = 1
     self.CASH = 0
     self.SHORT = -1
     self.learner = bl.BagLearner(learner = rt.RTLearner, kwargs = {"leaf_size": 5}, bags = 100, boost = False, verbose = False)
Example #28
def expTwoCorr(trainX, trainY, testX, testY, nam, loop_size=100):
    corrDT = pd.DataFrame(index=np.arange(0, loop_size), columns=["corr"])
    corrRT = corrDT.copy()
    for i in range(0, loop_size):
        bllearner = bl.BagLearner(learner = rt.RTLearner, kwargs = {"leaf_size":i}\
                , bags = 20, boost = False, verbose = False)
        bllearner.addEvidence(trainX, trainY)
        rmse_in, c_in, rmse_out, c_out = evaluate(bllearner, trainX, trainY,
                                                  testX, testY)
        corrRT.loc[i] = [c_out]
        bllearner1 = bl.BagLearner(learner = dt.DTLearner, kwargs = {"leaf_size":i}\
                , bags = 20, boost = False, verbose = False)
        bllearner1.addEvidence(trainX, trainY)
        rmse_in, c_in, rmse_out, c_out = evaluate(bllearner1, trainX, trainY,
                                                  testX, testY)
        corrDT.loc[i] = [c_out]
    plot(corrDT, corrRT, titles=nam, name=nam, labelY="corr")
Example #29
 def __init__(self, verbose=False, impact=0.0):
     self.verbose = verbose
     self.impact = impact
     self.ndays = 10
     # assuming rt.RTLearner as the base learner here: the leaf_size kwarg
     # implies a tree learner, and the original call omitted the learner argument
     self.learner = bl.BagLearner(learner=rt.RTLearner,
                                  kwargs={"leaf_size": 5},
                                  bags=15,
                                  boost=False,
                                  verbose=False)
Example #30
 def __init__(self, verbose=False, impact=0.0):
     self.verbose = verbose
     self.impact = impact
     self.window_size = 20
     self.feature_size = 5
     self.N = 10
     bags = 20
     leaf_size = 5
     self.learner = bl.BagLearner(learner = rt.RTLearner, bags = bags, kwargs={"leaf_size":leaf_size})