Exemplo n.º 1
0
    def testCARTPrune(self): 
        numExamples = 500
        X, y = data.make_regression(numExamples)  
        
        y = Standardiser().standardiseArray(y)
        
        numTrain = numpy.round(numExamples * 0.33)     
        numValid = numpy.round(numExamples * 0.33) 
        
        trainX = X[0:numTrain, :]
        trainY = y[0:numTrain]
        validX = X[numTrain:numTrain+numValid, :]
        validY = y[numTrain:numTrain+numValid]
        testX = X[numTrain+numValid:, :]
        testY = y[numTrain+numValid:]
        
        learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
        learner.learnModel(trainX, trainY)    
        
        learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2, gamma=1000)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 1000)
        predY = learner.predict(trainX)

        learner.setGamma(200)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 200)
        
        learner.setGamma(100)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 100)
        

        learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
        learner.learnModel(trainX, trainY)
        predY2 = learner.predict(trainX)
        
        #Gamma = 0 implies no pruning 
        nptst.assert_array_equal(predY, predY2)
        
        #Full pruning 
        learner = DecisionTreeLearner(pruneType="CART", maxDepth=3, gamma=1)
        learner.learnModel(trainX, trainY)
        self.assertEquals(learner.tree.getNumVertices(), 1)
Exemplo n.º 2
0
 def testCvPrune(self): 
     numExamples = 500
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().standardiseArray(y)
     
     numTrain = numpy.round(numExamples * 0.33)     
     numValid = numpy.round(numExamples * 0.33) 
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     validX = X[numTrain:numTrain+numValid, :]
     validY = y[numTrain:numTrain+numValid]
     testX = X[numTrain+numValid:, :]
     testY = y[numTrain+numValid:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     #print(learner.getTree())
     unprunedTree = learner.tree.copy() 
     learner.setGamma(1000)
     learner.cvPrune(trainX, trainY)
     
     self.assertEquals(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
     learner.setGamma(100)
     learner.cvPrune(trainX, trainY)
     
     #Test if pruned tree is subtree of current: 
     for vertexId in learner.tree.getAllVertexIds(): 
         self.assertTrue(vertexId in unprunedTree.getAllVertexIds())
         
     #The error should be better after pruning 
     learner.learnModel(trainX, trainY)
     #learner.cvPrune(validX, validY, 0.0, 5)
     learner.repPrune(validX, validY)
   
     error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     self.assertTrue(error1 >= error2)
Exemplo n.º 3
0
 def testPredict(self): 
     
     generator = ExamplesGenerator()         
     
     for i in range(10):        
         numExamples = numpy.random.randint(1, 200)
         numFeatures = numpy.random.randint(1, 20)
         minSplit = numpy.random.randint(1, 50)
         maxDepth = numpy.random.randint(0, 10)
         
         X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
         y = numpy.array(y, numpy.float)
             
         learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
         learner.learnModel(X, y)    
         
         predY = learner.predict(X)
         
         tree = learner.tree            
         
         for vertexId in tree.getAllVertexIds(): 
             
             nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())
             
         #Compare against sklearn tree  
         regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth, min_density=0.0)
         regressor.fit(X, y)
         
         sktree = regressor.tree_
         
         #Note that the sklearn algorithm appears to combine nodes with same value 
         #self.assertEquals(sktree.node_count, tree.getNumVertices())
         self.assertEquals(sktree.feature[0], tree.getRoot().getFeatureInd())
         self.assertEquals(sktree.value[0], tree.getRoot().getValue())
         self.assertAlmostEquals(sktree.threshold[0], tree.getRoot().getThreshold(), 3)
         
         predY2 = regressor.predict(X)
         
         #Note that this is not always precise because if two thresholds give the same error we choose the largest 
         #and not sure how it is chosen in sklearn (or if the code is correct)
         self.assertTrue(abs(numpy.linalg.norm(predY-y)- numpy.linalg.norm(predY2-y))/numExamples < 0.05)