class InsaneLearner(object):
    """Bag-of-bags learner: an outer BagLearner whose members are BagLearners.

    The inner learners are configured with LinRegLearner, 20 bags and the
    caller's ``verbose`` flag; the outer learner is built with the positional
    values ``20, False, verbose``.
    """

    def __init__(self, verbose=False):
        """Build the nested ensemble.

        :param verbose: forwarded to both the inner and the outer BagLearner.
        """
        # Keyword arguments handed to every inner BagLearner.
        inner_config = {
            "learner": LinRegLearner,
            "kwargs": {},
            "bags": 20,
            "verbose": verbose,
        }
        # NOTE(review): the trailing positionals are presumably
        # (bags, boost, verbose) -- confirm against BagLearner's signature.
        self.learner = BagLearner(BagLearner, inner_config, 20, False, verbose)

    def author(self):
        """Return the author's user id."""
        return 'sgarg96'

    def addEvidence(self, dataX, dataY):
        """Train the wrapped ensemble.

        :param dataX: X values of the training data.
        :param dataY: Y values of the training data.
        """
        self.learner.addEvidence(dataX, dataY)

    def query(self, points):
        """Return the wrapped ensemble's estimates for ``points``.

        :param points: query rows, passed through unchanged to the ensemble.
        :returns: whatever the wrapped learner's ``query`` returns.
        """
        estimates = self.learner.query(points)
        return estimates
def __init__(self, verbose=False):
    """Create an outer BagLearner whose members are BagLearners of LinRegLearner.

    :param verbose: forwarded to the inner configuration and the outer learner.
    """
    # Settings used to construct each inner BagLearner.
    inner_settings = {
        "learner": LinRegLearner,
        "kwargs": {},
        "bags": 20,
        "verbose": verbose,
    }
    # NOTE(review): positionals are presumably (learner, kwargs, bags, boost,
    # verbose) -- confirm against BagLearner.
    self.learner = BagLearner(BagLearner, inner_settings, 20, False, verbose)
def twentybags():
    """Fit a 20-bag RTLearner ensemble (leaf_size=1) and predict ``testX``.

    Reads ``trainX``, ``trainY`` and ``testX`` from the enclosing scope.

    :returns: the ensemble's ``query`` result for ``testX``.
    """
    ensemble = BagLearner(
        learner=RTLearner,
        kwargs={"leaf_size": 1},
        bags=20,
        boost=False,
        verbose=False,
    )
    ensemble.addEvidence(trainX, trainY)
    predictions = ensemble.query(testX)
    return predictions
def __init__(self, verbose=False, impact=0):
    """Store settings and build the bagged random-tree learner.

    :param verbose: stored on the instance as ``self.verbose``.
    :param impact: stored on the instance as ``self.impact``.
    """
    self.verbose, self.impact = verbose, impact
    # Both windows are fixed at 14.
    self.lookback = self.lookforward = 14
    tree_options = {"leaf_size": 5}
    self.learner = BagLearner(
        learner=RTLearner,
        kwargs=tree_options,
        bags=20,
        boost=False,
        verbose=False,
    )
def __init__(self, verbose=False, impact=0.0):
    """Store settings and build the bagged random-tree learner.

    :param verbose: stored on the instance as ``self.verbose``.
    :param impact: stored on the instance as ``self.impact``.
    """
    self.verbose, self.impact = verbose, impact
    self.N = 10
    tree_options = {"leaf_size": 5}
    self.learner = BagLearner(
        learner=RTLearner,
        kwargs=tree_options,
        bags=20,
        boost=False,
        verbose=False,
    )
def rnd_name():
    """Run a 20-bag BagLearner over ``il_cobj`` and report its call counters.

    Seeds both RNGs with ``seed``, then swaps ``np.random.seed`` and
    ``random.seed`` for ``fake_seed`` / ``fake_rseed`` while the learner
    trains and is queried. The saved seeders (``tmp_numpy_seed`` /
    ``tmp_random_seed``) are put back in a ``finally`` clause, so a failure
    inside the learner no longer leaves the patched functions installed for
    whatever runs next.

    All data and helper names are read from the enclosing scope.

    :returns: tuple ``(init_callcount_dict, add_callcount_dict,
        query_callcount_dict)`` taken from ``il_cobj``.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): the fakes presumably stop the code under test from
    # re-seeding -- confirm against their definitions.
    np.random.seed = fake_seed
    random.seed = fake_rseed
    try:
        learner = BagLearner(learner=il_cobj, kwargs={'verbose': False},
                             bags=20, boost=False, verbose=False)
        learner.addEvidence(trainX, trainY)
        # The result is not needed; the call is made for its effect on the
        # counters returned below.
        learner.query(testX)
    finally:
        # Always restore the real seeders, even when the learner raised.
        np.random.seed = tmp_numpy_seed
        random.seed = tmp_random_seed
    return (il_cobj.init_callcount_dict,
            il_cobj.add_callcount_dict,
            il_cobj.query_callcount_dict)
def twentybags():
    """Fit a 20-bag RTLearner ensemble (leaf_size=1) with seeding disabled.

    Seeds both RNGs with ``seed``, swaps ``np.random.seed`` and
    ``random.seed`` for ``fake_seed`` / ``fake_rseed`` while the learner runs,
    and restores the saved seeders in a ``finally`` clause so an exception in
    the learner cannot leave the patched functions installed.

    All data and helper names are read from the enclosing scope.

    :returns: the ensemble's ``query`` result for ``testX``.
    """
    np.random.seed(seed)
    random.seed(seed)
    np.random.seed = fake_seed
    random.seed = fake_rseed
    try:
        learner20 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1},
                               bags=20, boost=False, verbose=False)
        learner20.addEvidence(trainX, trainY)
        q_rv = learner20.query(testX)
    finally:
        # Always restore the real seeders, even when the learner raised.
        np.random.seed = tmp_numpy_seed
        random.seed = tmp_random_seed
    return q_rv
def __init__(self, verbose=False, impact=0.0):
    """Store settings and build a 10-bag ensemble of RTLearner.

    Locals that were assigned but never read (``num_states``,
    ``num_actions``, ``leafSize``), the rebinding of the ``verbose``
    parameter, and a commented-out QLearner construction were removed;
    the attributes set on the instance are unchanged.

    :param verbose: stored on the instance as ``self.verbose``; the bag
        learner and its trees are always built with ``verbose=False``.
    :param impact: stored on the instance as ``self.impact``.
    """
    self.verbose = verbose
    self.impact = impact
    # Filled in later by other methods (not visible from here).
    self.symbol = None
    self.Ytrain = None
    self.learner = BagLearner(
        RTLearner,
        kwargs={"leaf_size": 20, "verbose": False},
        bags=10,
        boost=False,
        verbose=False,
    )
def onebag():
    """Fit a single-bag RTLearner ensemble (leaf_size=1) and predict ``testX``.

    Reads ``trainX``, ``trainY`` and ``testX`` from the enclosing scope.

    :returns: tuple ``(predictions, author)`` from the ensemble's ``query``
        and ``author`` methods.
    """
    single_bag = BagLearner(
        learner=RTLearner,
        kwargs={"leaf_size": 1},
        bags=1,
        boost=False,
        verbose=False,
    )
    single_bag.addEvidence(trainX, trainY)
    predictions = single_bag.query(testX)
    who = single_bag.author()
    return predictions, who
def test_bagging(trainX, trainY, testX, testY, should_plot=False, max_size=None):
    """Compare test RMSE of a plain DTLearner against bagged DTLearners.

    Leaf sizes ``0 .. bound-1`` are swept, where ``bound`` is a fifth of the
    number of training rows, optionally capped by ``max_size``. Bag counts of
    1, 10 and 25 are evaluated.

    :param trainX: training features (``shape[0]`` is the row count).
    :param trainY: training targets.
    :param testX: test features.
    :param testY: test targets (``shape[0]`` is the RMSE denominator).
    :param should_plot: when true, save the curves to 'dtl_bagging_fig.png'.
    :param max_size: optional upper bound on the number of leaf sizes swept.
    :returns: None.
    """
    bound = trainX.shape[0] // 5
    if max_size is not None:
        bound = min(max_size, bound)

    bags = [1, 10, 25]
    xrng = np.arange(bound)
    baseline = np.zeros((bound, ))
    rmses = np.zeros((len(bags), bound))

    def _test_rmse(model):
        # Train on the full training set, score on the test set.
        model.addEvidence(trainX, trainY)
        predicted = model.query(testX)
        return math.sqrt(((testY - predicted)**2).sum() / testY.shape[0])

    # Single decision tree, no bagging.
    for leaf in xrng:
        baseline[leaf] = _test_rmse(DTLearner(leaf_size=leaf))

    # Bagged decision trees, one row of results per bag count.
    for row, cnt in enumerate(bags):
        for leaf in xrng:
            bagged = BagLearner(learner=DTLearner, bags=cnt,
                                kwargs={'leaf_size': leaf})
            rmses[row][leaf] = _test_rmse(bagged)

    if not should_plot:
        return

    # RMSE against leaf size: baseline plus one curve per bag count.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(xrng, baseline, label='DTLearner')
    for row, cnt in enumerate(bags):
        ax.plot(xrng, rmses[row], label=f'bags={cnt}')
    ax.set_xlabel('Leaf Size', fontweight='bold')
    ax.set_ylabel('RMSE', fontweight='bold')
    plt.legend()
    plt.savefig('dtl_bagging_fig.png')
    plt.clf()
def onebag():
    """Fit a single-bag RTLearner ensemble (leaf_size=1) with seeding disabled.

    Seeds both RNGs with ``seed``, swaps ``np.random.seed`` and
    ``random.seed`` for ``fake_seed`` / ``fake_rseed`` while the learner runs,
    and restores the saved seeders in a ``finally`` clause so an exception in
    the learner cannot leave the patched functions installed.

    All data and helper names are read from the enclosing scope.

    :returns: tuple ``(predictions, author)`` from the ensemble's ``query``
        and ``author`` methods.
    """
    np.random.seed(seed)
    random.seed(seed)
    np.random.seed = fake_seed
    random.seed = fake_rseed
    try:
        learner1 = BagLearner(learner=RTLearner, kwargs={"leaf_size": 1},
                              bags=1, boost=False, verbose=False)
        learner1.addEvidence(trainX, trainY)
        q_rv = learner1.query(testX)
        a_rv = learner1.author()
    finally:
        # Always restore the real seeders, even when the learner raised.
        np.random.seed = tmp_numpy_seed
        random.seed = tmp_random_seed
    return q_rv, a_rv
def bgl_leaf_size_rmses(trainX, trainY, testX, testY, rmses_train, rmses_test, train_pct=0.6):
    """Plot in-bag / out-of-bag RMSE of a 25-bag DTLearner ensemble by leaf size.

    The first axis of each data array is indexed by trial. For every trial and
    every leaf size in ``1 .. trainX.shape[1] // 5 - 1`` a BagLearner is fit
    and scored on both that trial's training and test slices. Two figures are
    saved: 'bgl_leaf_sizes_rmses_v1.png' and 'dtl_bgl_gen_err_v1.png'.

    :param trainX: training features, one slice per trial.
    :param trainY: training targets, one slice per trial.
    :param testX: test features, one slice per trial.
    :param testY: test targets, one slice per trial.
    :param rmses_train: per-trial training RMSEs used for the comparison plot
        (presumably from a plain DTLearner run -- confirm with the caller).
    :param rmses_test: per-trial test RMSEs used for the comparison plot.
    :param train_pct: training fraction, used only in the legend labels.
    :returns: None.
    """
    trials = trainX.shape[0]
    leaf_rng = np.arange(1, trainX.shape[1] // 5)
    n_leaf_sizes = leaf_rng.shape[0]
    bag_rmses_train = np.zeros((trainX.shape[0], n_leaf_sizes))
    bag_rmses_test = np.zeros((trainX.shape[0], n_leaf_sizes))
    bags = 25

    for trial_idx in np.arange(trials):
        for leaf_idx, leaf_size in enumerate(leaf_rng):
            ensemble = BagLearner(learner=DTLearner, bags=bags,
                                  kwargs=dict(leaf_size=leaf_size))
            ensemble.addEvidence(trainX[trial_idx], trainY[trial_idx])
            in_bag_pred = ensemble.query(trainX[trial_idx])
            out_bag_pred = ensemble.query(testX[trial_idx])

            in_bag_sq = ((trainY[trial_idx] - in_bag_pred)**2).sum()
            out_bag_sq = ((testY[trial_idx] - out_bag_pred)**2).sum()
            bag_rmses_train[trial_idx][leaf_idx] = math.sqrt(
                in_bag_sq / trainY.shape[1])
            bag_rmses_test[trial_idx][leaf_idx] = math.sqrt(
                out_bag_sq / testY.shape[1])

    # Figure 1: mean in-bag, out-of-bag and their gap.
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(leaf_rng, bag_rmses_train.mean(axis=0),
            label=f'IB ({train_pct*100:0.0f}%)')
    ax.plot(leaf_rng, bag_rmses_test.mean(axis=0),
            label=f'OOB ({(1-train_pct)*100:0.0f}%)')
    ax.plot(leaf_rng,
            bag_rmses_test.mean(axis=0) - bag_rmses_train.mean(axis=0),
            label='OOB-IB', c='m')
    ax.set_xlim((1, leaf_rng[-10]))
    ax.set_xlabel('Leaf Size', fontweight='bold')
    ax.set_ylabel(f'RMSE (avg over {trials} trials)', fontweight='bold')
    ax.set_title('BagLearner Generalization Error', fontweight='bold')
    plt.legend()
    plt.savefig('bgl_leaf_sizes_rmses_v1.png')
    plt.clf()

    # Figure 2: difference between the supplied and the bagged
    # generalization gaps.
    ax = fig.add_subplot(111)
    dt_gen_err = rmses_test.mean(axis=0) - rmses_train.mean(axis=0)
    bag_gen_err = bag_rmses_test.mean(axis=0) - bag_rmses_train.mean(axis=0)
    ax.plot(leaf_rng, dt_gen_err - bag_gen_err, c='m')
    ax.set_xlim((1, leaf_rng[-10]))
    ax.set_xlabel('Leaf Size', fontweight='bold')
    ax.set_ylabel(f'RMSE (avg over {trials} trials)', fontweight='bold')
    ax.set_title('DTLearner - BagLearner Generalization Error',
                 fontweight='bold')
    plt.savefig('dtl_bgl_gen_err_v1.png')
    plt.clf()
def __init__(self, verbose=False):
    """Create twenty BagLearners, each configured with 20 bags of LinRegLearner.

    :param verbose: forwarded to every BagLearner.
    """
    self.lrns = [
        BagLearner(learner=LinRegLearner, verbose=verbose, bags=20)
        for _ in range(20)
    ]
def compare_dt_rt(trainX, trainY, testX, testY, should_plot=False, max_size=None, data_title=''):
    """Compare 10-bag DTLearner and RTLearner ensembles across leaf sizes.

    For each leaf size ``0 .. bound-1`` (``bound`` is a fifth of the training
    rows, optionally capped by ``max_size``) both ensembles are trained and
    scored on the test set with R-squared, an AIC-style value and the
    standard deviation of the residuals. Residuals are kept for leaf size 0.

    :param trainX: training features; its shape supplies ``n`` and ``k``
        for the AIC-style value.
    :param trainY: training targets.
    :param testX: test features.
    :param testY: test targets.
    :param should_plot: when true, save the metric, Y-value and residual
        plots as PNG files.
    :param max_size: optional upper bound on the number of leaf sizes swept.
    :param data_title: optional tag appended (with a leading underscore) to
        every output file name.
    :returns: None.
    """
    bound = trainX.shape[0] // 5
    if max_size is not None:
        bound = min(max_size, bound)

    # Row 0 holds DTLearner results, row 1 holds RTLearner results.
    resids = np.zeros((2, testX.shape[0]))
    rsqrs = np.zeros((2, bound))
    aics = np.zeros((2, bound))
    stds = np.zeros((2, bound))
    xrng = np.arange(bound)
    n, k = trainX.shape

    for i in xrng:
        opts = {'leaf_size': i}
        # Both ensembles are constructed before either one is trained.
        contenders = [
            BagLearner(learner=DTLearner, kwargs=opts, bags=10),
            BagLearner(learner=RTLearner, kwargs=opts, bags=10)
        ]
        for j, contender in enumerate(contenders):
            contender.addEvidence(trainX, trainY)
            predY = contender.query(testX)
            if i == 0:
                resids[j] = (testY - predY)
            rsqrs[j][i] = np.corrcoef(predY, y=testY)[0, 1]**2
            aics[j][i] = 2 * k + n * np.log(((testY - predY)**2).sum() / n)
            stds[j][i] = np.std(testY - predY)

    if not should_plot:
        return

    suffix = f'_{data_title}' if data_title != '' else data_title
    fig = plt.figure()
    x = xrng + 1

    # One figure per metric, each comparing the two learners by leaf size.
    metric_plots = (
        (rsqrs, 'R-Squared', 'rsqr'),
        (aics, 'AIC', 'aic'),
        (stds, 'STD', 'std'),
    )
    for values, y_label, stem in metric_plots:
        ax = fig.add_subplot(111)
        ax.plot(x, values[0], label='DTLearner')
        ax.plot(x, values[1], label='RTLearner')
        ax.set_xlabel('Leaf Size', fontweight='bold')
        ax.set_ylabel(y_label, fontweight='bold')
        ax.set_xbound(lower=1)
        plt.legend()
        plt.savefig(f'dtl_rtl_{stem}{suffix}.png')
        plt.clf()

    # Raw training targets by row position.
    ax = fig.add_subplot(111)
    ax.plot(range(trainX.shape[0]), trainY, 'bo')
    ax.set_ylabel('Y', fontweight='bold')
    plt.savefig(f'y_vals{suffix}.png')
    plt.clf()

    # DTLearner residuals (leaf size 0) against the true test targets.
    ax = fig.add_subplot(111)
    ax.plot(testY, resids[0], 'bo')
    ax.set_ylabel('Residuals', fontweight='bold')
    ax.set_xlabel('Y', fontweight='bold')
    plt.savefig(f'resids{suffix}.png')
    plt.clf()
class StrategyLearner(object):
    """Trading strategy driven by a 20-bag ensemble of RTLearner.

    Features are three indicators computed over ``lookback`` days; the target
    is a buy / hold / sell label derived from the ``lookforward``-day return.
    """

    def __init__(self, verbose=False, impact=0):
        """Store settings and build the bagged learner.

        :param verbose: when true, progress information is printed.
        :param impact: market-impact value added to both label thresholds.
        """
        self.verbose = verbose
        self.impact = impact
        self.learner = BagLearner(learner=RTLearner,
                                  kwargs={"leaf_size": 5},
                                  bags=20,
                                  boost=False,
                                  verbose=False)
        self.lookback = 14
        self.lookforward = 14

    def getFeatures(self, prices, symbol):
        """Build the feature frame (smaratio, bbratio, momentum) for ``symbol``.

        :param prices: price frame with a column named ``symbol``.
        :param symbol: ticker whose indicator columns are renamed.
        :returns: frame of the three indicators with NaN rows dropped.
        """
        # Rename on copies so the frames returned by the helpers stay untouched.
        sma_ratio = calc_sma_ratio(prices, self.lookback).rename(
            columns={symbol: "smaratio"})
        bbratio = calc_bb_ratio(prices, self.lookback).rename(
            columns={symbol: "bbratio"})
        momentum = calc_momentum(prices, self.lookback).rename(
            columns={symbol: "momentum"})
        X = sma_ratio.join([bbratio, momentum]).dropna()
        if self.verbose:
            print(X.shape, X.columns)
        return X

    def get_trades(self, predY):
        """Turn a sequence of -1 / 0 / +1 signals into share trades.

        Holdings move between -1000 and +1000: a sell signal moves them to
        -1000, a buy signal to +1000, anything else leaves them unchanged.

        :param predY: iterable of predicted signals.
        :returns: list with one trade (signed share count) per signal.
        """
        min_holdings = -1000
        max_holdings = 1000
        net_holdings = 0
        trades = []
        for signal in predY:
            if signal == -1:
                target = min_holdings
            elif signal == 1:
                target = max_holdings
            else:
                target = net_holdings
            # Zero when already at the target position.
            trades.append(target - net_holdings)
            net_holdings = target
        return trades

    def get_pos(self, x):
        """Label a forward return as buy (1), sell (-1) or hold (0).

        :param x: forward return.
        :returns: 1 if ``x > 0.05 + impact``, -1 if ``x < -0.03 - impact``,
            otherwise 0.
        """
        if x > 0.05 + self.impact:
            return 1
        if x < -0.03 - self.impact:
            return -1
        return 0

    def getTargetVariable(self, prices, symbol):
        """Compute forward returns and their buy / hold / sell labels.

        :param prices: price frame with a column named ``symbol``.
        :param symbol: ticker to label.
        :returns: tuple ``(signals, returns)``: a one-column label frame and
            the forward-return frame with NaN rows dropped.
        """
        returns = (prices.shift(-1 * self.lookforward) / prices - 1).dropna()
        signals = returns[symbol].apply(self.get_pos).to_frame()
        return signals, returns

    def addEvidence(self,
                    symbol="JPM",
                    sd=dt.datetime(2008, 1, 1),
                    ed=dt.datetime(2009, 12, 31),
                    sv=100000):
        """Train the bagged learner on ``symbol`` between ``sd`` and ``ed``.

        :param symbol: ticker to train on.
        :param sd: start date of the training window.
        :param ed: end date of the training window.
        :param sv: starting portfolio value (not used here).
        :returns: tuple ``(prices, trainX, trainY, returns)``.
        """
        syms = [symbol]
        dates = pd.date_range(sd, ed)
        prices_all = ut.get_data(syms, dates)  # automatically adds SPY
        prices = prices_all[syms]  # only portfolio symbols
        if self.verbose:
            print("Prices loaded")

        features = self.getFeatures(prices, symbol)
        labels, returns = self.getTargetVariable(prices, symbol)
        # Keep only the dates that have both features and a label.
        self.data = features.join(labels, how='outer').dropna()
        trainX = self.data[features.columns]
        trainY = self.data[labels.columns]
        assert trainY.shape[0] == trainX.shape[0]
        if self.verbose:
            print("Data shapes", trainX.shape, trainY.shape)
            print("Starting learning")
        self.learner.addEvidence(trainX.to_numpy(), trainY.to_numpy())
        return prices, trainX, trainY, returns

    def testPolicy(self,
                   symbol="JPM",
                   sd=dt.datetime(2010, 1, 1),
                   ed=dt.datetime(2011, 12, 31),
                   sv=100000):
        """Apply the trained learner to new data and return the trades.

        :param symbol: ticker to trade.
        :param sd: start date of the test window.
        :param ed: end date of the test window.
        :param sv: starting portfolio value (not used here).
        :returns: DataFrame of trades indexed like the feature frame.
        """
        if self.verbose:
            print("Testing policy")
        syms = [symbol]
        dates = pd.date_range(sd, ed)
        prices_all = ut.get_data(syms, dates)  # automatically adds SPY
        prices = prices_all[syms]  # only portfolio symbols
        testX = self.getFeatures(prices, symbol)
        predY = self.learner.query(testX.to_numpy())
        assert predY.shape[0] == testX.shape[0]
        if self.verbose:
            print(testX.shape, predY.shape)
        trades = self.get_trades(predY)
        df_trades = pd.DataFrame(trades, index=testX.index)
        return df_trades

    def author(self):
        """Return the author's user id."""
        return 'sgarg96'
# Keep only the rows dated 2006-01-01 through 2009-12-31.
in_sample_dates = pd.date_range('2006-01-01', '2009-12-31')
train = train[train.index.isin(in_sample_dates)]

# Feature matrix and target vector for the learner.
feature_columns = ["EMA_30_Price_Ratio", "EMA_200_Price_Ratio", "SPY_RSI_14_Days"]
data_train_x = np.asarray(train[feature_columns])
data_train_y = np.asarray(train.ten_days_out.tolist())

# Fit a 15-bag ensemble of random trees and predict on the training rows
# (the original comments describe the data as IBM prices).
bag_learner = BagLearner(
    learner=RTLearner,
    kwargs={"leaf_size": 50},
    bags=15,
    boost=False,
    verbose=False,
)
bag_learner.addEvidence(data_train_x, data_train_y)
Y = np.asarray(bag_learner.query(data_train_x))

# Dates whose prediction is at least +0.01 (buy) or at most -0.01 (sell).
to_buy = [train.index[x] for x in np.where(Y >= .01)[0]]
to_sell = [train.index[x] for x in np.where(Y <= -.01)[0]]

# Add a field showing the ML predictions.
train['ML_Prediction'] = train.index.map(ml_action)
# Finish and save the DTLearner figure started above this fragment.
plt.grid(True)
plt.legend(loc="lower right")
plt.title("RMSE of DTLearner with leaf size")
plt.xlabel("Leaf size")
plt.ylabel("RMSE")
plt.xticks(np.arange(0, max_leaf, 5))
plt.yticks(np.arange(0, 1, 0.1) * .01)
plt.savefig("dt_learner_leaf.png", format="PNG")

# Sweep leaf sizes 1..max_leaf for a bagged DTLearner, recording the
# in-sample and out-of-sample RMSE for each.
train_rmse, test_rmse = [], []
max_leaf, bags = 50, 30
for i in range(1, max_leaf + 1):
    learner = BagLearner(DTLearner, {"leaf_size": i}, bags)
    learner.addEvidence(trainX, trainY)
    in_sample_error = calc_rmse(trainY, learner.query(trainX))
    train_rmse.append(in_sample_error)
    out_of_sample_error = calc_rmse(testY, learner.query(testX))
    test_rmse.append(out_of_sample_error)

# Plot both curves against leaf size.
leaf_axis = np.arange(1, max_leaf + 1)
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(leaf_axis, train_rmse, label='Train RMSE', marker='o')
plt.plot(leaf_axis, test_rmse, label='Test RMSE', marker='o')
plt.xlim(1, max_leaf)
plt.grid(True)
plt.legend(loc="lower right")
plt.title("RMSE of BagLearner with leaf size")
plt.xlabel("Leaf size")
plt.ylabel("RMSE")
plt.xticks(np.arange(0, max_leaf, 5))
plt.yticks(np.arange(0, 1, 0.1) * .01)
# Keep only the rows dated 2006-01-01 through 2009-12-31.
training_window = pd.date_range('2006-01-01', '2009-12-31')
train = train[train.index.isin(training_window)]

# Feature matrix and target vector for the learner.
indicator_fields = ["EMA_30_Price_Ratio", "EMA_200_Price_Ratio", "SPY_RSI_14_Days"]
data_train_x = np.asarray(train[indicator_fields])
data_train_y = np.asarray(train.ten_days_out.tolist())

# Fit a 15-bag ensemble of random trees and predict on the training rows
# (the original comments describe the data as IBM prices).
bag_learner = BagLearner(
    learner=RTLearner,
    kwargs={"leaf_size": 50},
    bags=15,
    boost=False,
    verbose=False,
)
bag_learner.addEvidence(data_train_x, data_train_y)
Y = np.asarray(bag_learner.query(data_train_x))

# Dates whose prediction is at least +0.01 (buy) or at most -0.01 (sell).
to_buy = [train.index[x] for x in np.where(Y >= .01)[0]]
to_sell = [train.index[x] for x in np.where(Y <= -.01)[0]]

# Add a field showing the ML predictions.
train['ML_Prediction'] = train.index.map(ml_action)
# Train the random-tree learner on the same data as the decision-tree learner
# built above this fragment, then dump both trees.
shiny_RTLearner = RTLearner(**kwargs)
shiny_RTLearner.addEvidence(dataX, dataY)
shiny_DTLearner.print_tree(shiny_DTLearner.tree)
shiny_RTLearner.print_tree(shiny_RTLearner.tree)

row = [[3, 3], [0, 5], [1, 3]]
# Single-argument print() behaves the same on Python 2 and Python 3. The two
# spaces reproduce what the old `print "label ", value` statement emitted:
# the label's trailing space plus the separator.
print("deterministically  " + str(shiny_DTLearner.query(row)))
print("randomly  " + str(shiny_RTLearner.query(row)))

# 20-bag ensemble of decision trees.
learner = BagLearner(learner=DTLearner,
                     kwargs={
                         "leaf_size": 1,
                         "verbose": False
                     },
                     bags=20,
                     boost=False,
                     verbose=False)
learner.addEvidence(dataX, dataY)
print(learner.query(row))

# 20-bag ensemble of random trees.
learner = BagLearner(learner=RTLearner,
                     kwargs={
                         "leaf_size": 1,
                         "verbose": False
                     },
                     bags=20,
                     boost=False,
                     verbose=False)
learner.addEvidence(dataX, dataY)
def dtbg_preds(trainX, trainY, testX, testY, train_pct=0.6):
    """Plot prediction variance of bagged DTLearner vs RTLearner by bag count.

    The first axis of each data array is indexed by trial. For up to 10
    trials and bag counts 1..9, both ensembles (leaf_size=6) are trained on
    the trial's training slice and their test predictions are stored. The
    variance of those predictions across trials, averaged over observations,
    is plotted against bag count and saved to 'dtrt_pred_var_v1.png'.

    Fixes over the previous version:
    * the RTLearner curve was labelled 'DT OOB Var', so the legend showed two
      identical entries; it is now 'RT OOB Var';
    * the trial count is clamped to the number of trials supplied and the
      buffers are sized by it, so fewer than 10 trials no longer raises
      IndexError and more than 10 no longer adds all-zero rows to the variance.

    :param trainX: training features, one slice per trial.
    :param trainY: training targets, one slice per trial.
    :param testX: test features, one slice per trial.
    :param testY: test targets (not used by this function).
    :param train_pct: training fraction (not used by this function).
    :returns: None.
    """
    bag_rng = np.arange(1, 10)
    leaf_size = 6
    trials = min(10, trainX.shape[0], testX.shape[0])
    n_bag_sizes = bag_rng.shape[0]

    # Buffers are indexed (trial, bag count, observation).
    bagdt_preds_train = np.zeros((trials, n_bag_sizes, trainX.shape[1]))
    bagdt_preds_test = np.zeros((trials, n_bag_sizes, testX.shape[1]))
    bagrt_preds_train = np.zeros((trials, n_bag_sizes, trainX.shape[1]))
    bagrt_preds_test = np.zeros((trials, n_bag_sizes, testX.shape[1]))

    for trial_idx in np.arange(trials):
        for bag_idx, bag_size in enumerate(bag_rng):
            bgl = BagLearner(learner=DTLearner,
                             bags=bag_size,
                             kwargs=dict(leaf_size=leaf_size))
            bgl.addEvidence(trainX[trial_idx], trainY[trial_idx])
            bgl2 = BagLearner(learner=RTLearner,
                              bags=bag_size,
                              kwargs=dict(leaf_size=leaf_size))
            bgl2.addEvidence(trainX[trial_idx], trainY[trial_idx])

            # The in-sample predictions are stored but not plotted below.
            bagdt_preds_train[trial_idx][bag_idx] = bgl.query(trainX[trial_idx])
            bagdt_preds_test[trial_idx][bag_idx] = bgl.query(testX[trial_idx])
            bagrt_preds_train[trial_idx][bag_idx] = bgl2.query(trainX[trial_idx])
            bagrt_preds_test[trial_idx][bag_idx] = bgl2.query(testX[trial_idx])

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Variance across trials, then mean over observations: one value per
    # bag count.
    bagdt_mean_var = (bagdt_preds_test.std(axis=0)**2).mean(axis=1)
    bagrt_mean_var = (bagrt_preds_test.std(axis=0)**2).mean(axis=1)
    ax.plot(bag_rng, bagdt_mean_var, label='DT OOB Var')
    ax.plot(bag_rng, bagrt_mean_var, label='RT OOB Var')
    ax.set_xlim((1, bag_rng[-1]))
    ax.set_xlabel('Bag Size', fontweight='bold')
    ax.set_ylabel('Prediction Variance', fontweight='bold')
    ax.set_title('DTLearner and RTLearner Prediction Variance',
                 fontweight='bold')
    plt.legend()
    plt.savefig('dtrt_pred_var_v1.png')
    plt.clf()
class StrategyLearner(object):
    """Trading strategy driven by a 20-bag ensemble of RTLearner.

    Features are price, SMA, Bollinger-band distances and momentum over a
    10-day window; the target is a buy / hold / sell label derived from the
    5-day forward price change.
    """

    # Days ahead used for the training label; the last LOOKAHEAD_DAYS rows
    # have no label and are never traded.
    LOOKAHEAD_DAYS = 5
    # Holdings are moved to +/- this many shares on a buy / sell signal.
    MAX_POSITION = 1000

    def __init__(self, verbose=False, impact=0.0):
        """Store settings and build the bagged learner.

        :param verbose: when true, the training prices are printed.
        :param impact: market-impact value added to both label thresholds.
        """
        self.verbose = verbose
        self.impact = impact
        self.N = 10
        self.learner = BagLearner(learner=RTLearner,
                                  kwargs={"leaf_size": 5},
                                  bags=20,
                                  boost=False,
                                  verbose=False)

    def create_x_data(self, prices):
        """Build the five-column feature frame for ``prices``.

        :param prices: single-symbol price frame.
        :returns: frame with columns norm_price, sma, upper_band_diff,
            lower_band_diff and momentum, forward- then back-filled.
        """
        window = 10
        simple_moving_average = simple_moving_average_over_window(
            prices, window)
        simple_moving_std = simple_moving_std_over_window(prices, window)
        upper_bb, lower_bb = calculate_bollinger_bands(simple_moving_average,
                                                       simple_moving_std)
        momentum = calculate_momentum_over_window(prices, window)
        upper_diff_price = upper_bb - prices
        lower_diff_price = lower_bb - prices

        # The suffixes only keep the joins from colliding; the columns are
        # renamed straight afterwards.
        x_data = prices.join(simple_moving_average, lsuffix='_Normalized Price', rsuffix='_SMA') \
            .join(upper_diff_price, lsuffix='_', rsuffix='_upperband_diff') \
            .join(lower_diff_price, lsuffix='_', rsuffix='_lowerband_diff') \
            .join(momentum, lsuffix='_', rsuffix="_momentum")
        x_data.columns = [
            'norm_price', "sma", "upper_band_diff", "lower_band_diff",
            "momentum"
        ]
        return x_data.ffill().bfill()

    def _load_prices(self, symbol, sd, ed):
        """Return the sorted, gap-filled price frame for ``symbol``."""
        syms = [symbol]
        dates = pd.date_range(sd, ed)
        prices_all = ut.get_data(syms, dates)  # automatically adds SPY
        # sort_index returns a new frame; the result used to be discarded.
        prices_all = prices_all.sort_index(axis=0).ffill().bfill()
        return prices_all[syms]  # only portfolio symbols

    def _make_labels(self, prices, symbol):
        """Label each day +1 / -1 / 0 from its LOOKAHEAD_DAYS price change.

        A day is +1 when the change exceeds ``0.02 + impact``, -1 when it is
        below ``-0.02 - impact``, else 0. Days with no future price get 0.

        :returns: one-column frame ``y_values`` indexed like ``prices``.
        """
        current = prices[symbol]
        future = current.shift(-self.LOOKAHEAD_DAYS)
        price_change = (future - current) / current
        labels = pd.Series(0, index=prices.index, name='y_values')
        # NaN changes compare false on both sides and so stay at 0.
        labels[price_change > (0.02 + self.impact)] = 1
        labels[price_change < (-0.02 - self.impact)] = -1
        return labels.to_frame()

    def addEvidence(self, symbol="JPM",
                    sd=dt.datetime(2008, 1, 1),
                    ed=dt.datetime(2008, 8, 1),
                    sv=10000):
        """Train the bagged learner on ``symbol`` between ``sd`` and ``ed``.

        :param symbol: ticker to train on.
        :param sd: start date of the training window.
        :param ed: end date of the training window.
        :param sv: starting portfolio value (not used here).
        """
        prices = self._load_prices(symbol, sd, ed)
        if self.verbose:
            print(prices)
        x_train = self.create_x_data(prices)
        y_train = self._make_labels(prices, symbol)
        self.learner.addEvidence(x_train.values, y_train.values)

    def testPolicy(self, symbol="JPM",
                   sd=dt.datetime(2009, 1, 1),
                   ed=dt.datetime(2010, 1, 1),
                   sv=10000):
        """Query the trained learner and convert its signals into trades.

        :param symbol: ticker to trade.
        :param sd: start date of the test window.
        :param ed: end date of the test window.
        :param sv: starting portfolio value (not used here).
        :returns: DataFrame with one ``orders`` column indexed by date.
        """
        prices = self._load_prices(symbol, sd, ed)
        x_test = self.create_x_data(prices)
        y_test = self.learner.query(x_test.values)

        n_rows = len(prices)
        # The final LOOKAHEAD_DAYS rows are never traded.
        n_tradable = max(n_rows - self.LOOKAHEAD_DAYS, 0)
        trade_shares = []
        portfolio_position = 0
        for signal in y_test[:n_tradable]:
            if signal == -1:
                trade = self.do_sell_trade(portfolio_position)
            elif signal == 1:
                trade = self.do_buy_trade(portfolio_position)
            else:
                trade = 0
            trade_shares.append(trade)
            portfolio_position += trade
        trade_shares.extend([0] * (n_rows - n_tradable))

        df_trades = pd.DataFrame(data=trade_shares,
                                 index=prices.index,
                                 columns=['orders'])
        return df_trades

    def do_buy_trade(self, portfolio_position):
        """Return the shares to buy to reach +MAX_POSITION.

        Gives 1000, 2000 and 0 for positions 0, -1000 and +1000.
        """
        return self.MAX_POSITION - portfolio_position

    def do_sell_trade(self, portfolio_position):
        """Return the (negative) shares to sell to reach -MAX_POSITION.

        Gives -1000, 0 and -2000 for positions 0, -1000 and +1000.
        """
        return -self.MAX_POSITION - portfolio_position