def test_dataset4():
    ''' (2 points) test dataset4

    Loads the shared dataset, trains a single decision tree, a bagging
    ensemble, and a random forest on the even-indexed half of the data,
    measures accuracy on the odd-indexed half, and checks that the
    ensembles do not underperform the single tree.
    '''
    X, Y = RF.load_dataset()
    assert X.shape == (16, 400)
    assert Y.shape == (400,)
    # Derive n from the data rather than hard-coding 400 a second time,
    # so the asserts above remain the single source of truth for the size.
    n = Y.shape[0]

    def _half_split_accuracy(model, *train_args):
        # Train on the even-indexed columns, evaluate on the odd-indexed ones.
        # Extra positional args (e.g. n_tree=21) are forwarded to train().
        t = model.train(X[:, ::2], Y[::2], *train_args)
        Y_predict = type(model).predict(t, X[:, 1::2])
        # The test half holds n/2 labels, hence the factor of 2.
        return sum(Y[1::2] == Y_predict) / float(n) * 2.

    accuracy0 = _half_split_accuracy(DT())
    print('test accuracy of a decision tree:', accuracy0)
    accuracy1 = _half_split_accuracy(Bag(), 21)
    print('test accuracy of a bagging of 21 trees:', accuracy1)
    accuracy2 = _half_split_accuracy(RF(), 21)
    print('test accuracy of a random forest of 21 trees:', accuracy2)
    # Ensemble methods are expected to be at least as accurate as one tree;
    # random forest is allowed a small (.05) slack relative to bagging.
    assert accuracy1 >= accuracy0
    assert accuracy2 >= accuracy0
    assert accuracy2 >= accuracy1 - .05
def train(self, X, Y, n_tree=11):
    ''' Train a bagging ensemble of decision trees on a training set.
        Input:
            X: the feature matrix, a numpy matrix of shape p by n.
               Each element can be int/float/string. Here n is the number
               of data instances in the training set, p is the number of
               attributes.
            Y: the class labels, a numpy array of length n. Each element
               can be int/float/string.
            n_tree: the number of trees in the ensemble
        Output:
            T: a list of the root of each tree, a list of length n_tree.
    '''
    #########################################
    # INSERT YOUR CODE HERE
    def _fit_bootstrap_tree():
        # Resample the training set with replacement, then fit one tree.
        X_sample, Y_sample = Bag.bootstrap(X, Y)
        # Call DT.train explicitly so the (overriding) subclass method
        # is not re-entered.
        return DT.train(self, X_sample, Y_sample)

    T = [_fit_bootstrap_tree() for _ in range(n_tree)]
    #########################################
    return T