def test(self):
    """Evaluate every (method, depth) combination on the train and test sets.

    For each split criterion in ``self.methods`` and each depth limit in
    ``self.depths``, builds a decision tree on ``self.dfTrain`` via ID3 and
    records the resulting training and test error.

    Returns
    -------
    self.train_err : numpy array of training errors (row: method, col: depth)
    self.test_err : numpy array of test errors (row: method, col: depth)
    """
    # loop through methods and depths for each method
    for i, method in enumerate(self.methods):
        for j, d in enumerate(self.depths):
            # initialize and make decision tree with specified depth and method
            treeInit = decisionTree(self.dfTrain, depth=d, method=method,
                                    numerical=self.numerical,
                                    randTieBreak=self.tie)
            print('Creating DT with depth limit: {} and method: {}...'.
                  format(d, method))
            dt = run_ID3(treeInit)
            print('Tree complete')
            # get errors by applying the tree to both train and test sets
            print('Applying the tree to train and test...')
            self.train_err[i, j] = self._applyAndError(
                dt, self.dfTrain, treeInit, numerical=self.numerical)
            self.test_err[i, j] = self._applyAndError(
                dt, self.dfTest, treeInit, numerical=self.numerical)
            print('Applying complete\n')
    print('Done\n')
    return self.train_err, self.test_err
def _bagging_loop(self):
    """Train ``self.T`` bagged trees, each on a fresh bootstrap sample.

    Each round draws a bootstrap sample, fits a full ID3 tree on it
    (optionally with random-forest feature subsampling when
    ``self.small_sub`` is set), stores the tree in ``self.treesInit``,
    and accumulates its vote via ``self._calc_vote``.
    """
    # Report progress roughly every 10% of iterations.  Guard the interval
    # with max(1, ...): for T < 5, np.round(T / 10) is 0.0 and the original
    # `t % 0.0` produced nan (never == 0), so progress was silently skipped.
    interval = max(1, int(np.round(self.T / 10)))
    for t in range(self.T):
        if t % interval == 0:
            self._bag_progress(t)
        bootstrap = self.draw_with_replacement()
        if self.small_sub:
            # random-forest style: subsample features of size Gsize, keeping
            # the global dataframe around for attribute-value reference
            tree_init = decisionTree(bootstrap,
                                     numerical=self.numerical,
                                     small_sub=self.small_sub,
                                     globaldf=self.globaldf,
                                     randForest=self.randForest,
                                     Gsize=self.Gsize)
        else:
            tree_init = decisionTree(bootstrap,
                                     numerical=self.numerical,
                                     randForest=self.randForest,
                                     Gsize=self.Gsize)
        self.treesInit.append(tree_init)
        run_ID3(tree_init)
        self._calc_vote(tree_init, t, numerical=self.numerical)
    if self.verbose:
        print('100% done.\n')
def _AdaLoop(self, D):
    """Run ``self.T`` rounds of AdaBoost with decision stumps.

    Parameters
    ----------
    D : array-like
        Initial example weights; re-normalized each round by
        ``self._update_weights``.
    """
    print('Starting training...')
    # Report progress roughly every 10% of iterations.  Guard the interval
    # with max(1, ...): for T < 5, np.round(T / 10) is 0.0 and the original
    # `t % 0.0` produced nan (never == 0), so progress was silently skipped.
    interval = max(1, int(np.round(self.T / 10)))
    for t in range(self.T):
        if t % interval == 0:
            self._progress(t)
        # fit a depth-limited weighted stump on the full data
        stump_init = decisionTree(self.data, numerical=True,
                                  depth=self.depth, weights=D)
        run_ID3(stump_init)
        self.learners_init.append(stump_init)
        # h_t: this round's weighted hypothesis/vote
        h_t = self._calc_vote(stump_init, t, D, numerical=True)
        # re-weight examples for the next round (D_{t+1})
        D = self._update_weights(D, t, h_t)
    print('Done training\n')
import matplotlib.pyplot as plt
import time

# %% importing the data and splitting it up
# NOTE(review): assumes the column names begin on line 15 of data-desc.txt
# (14 header lines skipped) -- confirm against the car dataset description.
cols = list(pd.read_csv('car/data-desc.txt', skiprows=14))
train0 = pd.read_csv('car/train.csv', names=cols)
test0 = pd.read_csv('car/test.csv', names=cols)
# split attribute columns (all but last) from the label column (last)
attrTrain0 = np.array(train0.iloc[:, :-1])
attrTest0 = np.array(test0.iloc[:, :-1])
attrNames0 = cols[:-1]
labelsTrain0 = np.array(train0.iloc[:, -1])
labelsTest0 = np.array(test0.iloc[:, -1])
# %% training the ID3 algo for testing
carTreeInit = decisionTree(train0, method='entropy')
carTree = run_ID3(carTreeInit)
# %% applying the ID3 algo for testing
# apply the learned tree to the held-out test set and collect error rates
car_errinit = applyTree(carTree, test0, carTreeInit)
errs0, total_err0 = apply_ID3(car_errinit)
# %% making trees
# sweep all split criteria over every depth up to the number of attributes
tic = time.perf_counter()
methods = ['entropy', 'ME', 'gini']
datTrain0 = [attrTrain0, labelsTrain0, train0]
datTest0 = [attrTest0, labelsTest0, test0]
dfs = [train0, test0]
depths0 = len(attrNames0)
errinit = tester(methods, dfs, depths=depths0)
# %% importing the data cols = [ 'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y' ] train = pd.read_csv('bank/train.csv', names=cols) test = pd.read_csv('bank/test.csv', names=cols) train_no_unk = replace_unk(train.copy()) test_no_unk = replace_unk(test.copy()) # %% training the ID3 algo for testing tic = time.perf_counter() bankTreeInit = decisionTree(train, numerical=True) bankTree = run_ID3(bankTreeInit) # % applying the ID3 algo for testing errinit = applyTree(bankTree, train, bankTreeInit, numerical=True) errs, total_err = apply_ID3(errinit) toc = time.perf_counter() print('Time for bank code is {:0.4f} seconds.'.format(toc - tic)) # %% making trees tic = time.perf_counter() methods = ['entropy', 'ME', 'gini'] depths = len(train.columns) - 1 dfs = [train, test] errinit = tester(methods, dfs, depths=depths, numerical=True)
# %%
def entropy(ps):
    """Return the Shannon entropy ``-sum(p * log2(p))`` of probabilities *ps*.

    Zero probabilities are skipped, applying the standard convention
    0 * log2(0) = 0; the original fed 0 straight into ``np.log2`` and
    produced nan for any distribution containing a zero.

    Parameters
    ----------
    ps : iterable of float
        Probabilities (assumed non-negative; presumably summing to 1 --
        not validated here).

    Returns
    -------
    float
        Entropy in bits.
    """
    return -sum(p * np.log2(p) for p in ps if p > 0)


# %% Training Data
# making the training data with columns as x1, x2, x3, x4
attributes = np.array([[0, 0, 0, 1, 0, 1, 0],
                       [0, 1, 0, 0, 1, 1, 1],
                       [1, 0, 1, 0, 1, 0, 0],
                       [0, 0, 1, 1, 0, 0, 1]]).T
# Boolean labels
y = np.array([0, 0, 1, 1, 0, 0, 0]).T
attrNames = [0, 1, 2, 3]
# %% run ID3 on 1a
init1 = decisionTree(attributes, attrNames, y, method='gini')
tree1 = run_ID3(init1)
# %% tennis data
# classic play-tennis toy dataset: Outlook/Temp/Humidity/Wind -> play (0/1)
attributes2 = np.array(
    [['S', 'S', 'O', 'R', 'R', 'R', 'O', 'S', 'S', 'R', 'S', 'O', 'O', 'R'],
     ['H', 'H', 'H', 'M', 'C', 'C', 'C', 'M', 'C', 'M', 'M', 'M', 'H', 'M'],
     ['H', 'H', 'H', 'H', 'N', 'N', 'N', 'H', 'N', 'N', 'N', 'H', 'N', 'H'],
     ['W', 'S', 'W', 'W', 'W', 'S', 'S', 'W', 'W', 'W', 'S', 'S', 'W', 'S']]).T
y2 = np.array([0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0]).T
attrNames2 = ['Outlook', 'Temp', 'Humidity', 'Wind']
init2 = decisionTree(attributes2, attrNames2, y2, method='ME')
tree2 = run_ID3(init2)
# %% ID3 on 3a