예제 #1
0
파일: test4.py 프로젝트: msgonsalves/MLWH2
def test_dataset4():
    ''' (2 points) test dataset4'''
    n = 400
    X, Y = RF.load_dataset()

    assert X.shape == (16,400)
    assert Y.shape == (400,)

    # Even-indexed columns form the training half, odd-indexed the test half.
    X_train, Y_train = X[:,::2], Y[::2]
    X_test, Y_test = X[:,1::2], Y[1::2]

    # Single decision tree as the baseline.
    tree = DT()
    root = tree.train(X_train, Y_train)
    accuracy0 = sum(Y_test == DT.predict(root, X_test))/float(n)*2.
    print('test accuracy of a decision tree:', accuracy0)

    # Bagging ensemble of 21 trees.
    bag = Bag()
    ensemble = bag.train(X_train, Y_train, 21)
    accuracy1 = sum(Y_test == Bag.predict(ensemble, X_test))/float(n)*2.
    print('test accuracy of a bagging of 21 trees:', accuracy1)

    # Random forest of 21 trees.
    forest = RF()
    ensemble = forest.train(X_train, Y_train, 21)
    accuracy2 = sum(Y_test == RF.predict(ensemble, X_test))/float(n)*2.
    print('test accuracy of a random forest of 21 trees:', accuracy2)

    # Ensembles should not be worse than a single tree; the random forest
    # is allowed a small (.05) slack versus plain bagging.
    assert accuracy1 >= accuracy0
    assert accuracy2 >= accuracy0
    assert accuracy2 >= accuracy1-.05
예제 #2
0
 def train(self, X, Y, n_tree=11):
     '''
         Given a training set, train a bagging ensemble of decision trees.
         Input:
             X: the feature matrix, a numpy matrix of shape p by n.
                Each element can be int/float/string.
                Here n is the number data instances in the training set, p is the number of attributes.
             Y: the class labels, a numpy array of length n.
                Each element can be int/float/string.
             n_tree: the number of trees in the ensemble
         Output:
             T: a list of the root of each tree, a list of length n_tree.
     '''
     #########################################
     # INSERT YOUR CODE HERE
     # Each tree is grown on its own bootstrap resample of (X, Y),
     # which is what de-correlates the ensemble members.
     roots = []
     for _ in range(n_tree):
         X_boot, Y_boot = Bag.bootstrap(X, Y)
         root = DT.train(self, X_boot, Y_boot)
         roots.append(root)
     #########################################
     return roots