Python findBestSplit示例，sandbox.predictors.TreeCriterion.findBestSplit Python示例

示例#1

0

显示文件

文件： DecisionTreeLearner.py 项目： rezaarmand/sandbox

    def recursiveSplit(self, X, y, argsortX, nodeId):
        """
        Find the best split for the node stored under ``nodeId`` and, if the
        split is non-trivial, attach the two resulting children to the tree.
        Each child holding at least ``self.minSplit`` training indices is then
        split recursively.

        :param X: the examples; passed unchanged to ``findBestSplit``.
        :param y: the labels; indexed with the left/right index arrays to
            compute the value (mean label) stored in each child.
        :param argsortX: per-feature ordering data; passed unchanged to
            ``findBestSplit``.
        :param nodeId: identifier of the vertex to split. ``len(nodeId) - 1``
            is used as the depth of that vertex.
        """
        # Stop once the node sits at (or below) the maximum allowed depth.
        if len(nodeId) - 1 >= self.maxDepth:
            return

        node = self.tree.getVertex(nodeId)
        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(
            self.minSplit, X, y, node.getTrainInds(), argsortX)

        # The split may have 0 items in one set, so don't split.
        # NOTE: count the indices instead of summing them -- the arrays hold
        # example indices, so a side containing only example 0 sums to 0
        # although it is not empty.
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0:
            node.setError(bestError)
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            # Left child: predicts the mean label of its training examples.
            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, y[bestLeftInds].mean())
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                self.recursiveSplit(X, y, argsortX, leftChildId)

            # Right child: same treatment as the left one.
            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, y[bestRightInds].mean())
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                self.recursiveSplit(X, y, argsortX, rightChildId)

示例#2

0

显示文件

文件： DecisionTreeLearner.py 项目： charanpald/sandbox

 def recursiveSplit(self, X, y, argsortX, nodeId):
     """
     Find the best split for the node stored under ``nodeId`` and, if the
     split is non-trivial, attach the two resulting children to the tree.
     Each child holding at least ``self.minSplit`` training indices is then
     split recursively.

     :param X: the examples; passed unchanged to ``findBestSplit``.
     :param y: the labels; indexed with the left/right index arrays to
         compute the value (mean label) stored in each child.
     :param argsortX: per-feature ordering data; passed unchanged to
         ``findBestSplit``.
     :param nodeId: identifier of the vertex to split. ``len(nodeId) - 1``
         is used as the depth of that vertex.
     """
     # Stop once the node sits at (or below) the maximum allowed depth.
     if len(nodeId)-1 >= self.maxDepth:
         return

     node = self.tree.getVertex(nodeId)
     bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(self.minSplit, X, y, node.getTrainInds(), argsortX)

     # The split may have 0 items in one set, so don't split.
     # NOTE: count the indices instead of summing them -- the arrays hold
     # example indices, so a side containing only example 0 sums to 0
     # although it is not empty.
     if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0:
         node.setError(bestError)
         node.setFeatureInd(bestFeatureInd)
         node.setThreshold(bestThreshold)

         # Left child: predicts the mean label of its training examples.
         leftChildId = self.getLeftChildId(nodeId)
         leftChild = DecisionNode(bestLeftInds, y[bestLeftInds].mean())
         self.tree.addChild(nodeId, leftChildId, leftChild)

         if leftChild.getTrainInds().shape[0] >= self.minSplit:
             self.recursiveSplit(X, y, argsortX, leftChildId)

         # Right child: same treatment as the left one.
         rightChildId = self.getRightChildId(nodeId)
         rightChild = DecisionNode(bestRightInds, y[bestRightInds].mean())
         self.tree.addChild(nodeId, rightChildId, rightChild)

         if rightChild.getTrainInds().shape[0] >= self.minSplit:
             self.recursiveSplit(X, y, argsortX, rightChildId)

示例#3

0

显示文件

文件： TreeCriterionTest.py 项目： charanpald/sandbox

    def testFindBestSplit2(self):
        """
        Check ``findBestSplit2`` on a perfectly separable problem and on
        constant features, then compare its output with ``findBestSplit`` on
        random data.
        """
        minSplit = 1
        X = numpy.zeros((20, 10))
        y = numpy.ones(20)

        # Only feature 2 is informative: values 0..9 are labelled -1 and
        # values 10..19 are labelled +1, so the ideal threshold is 9.5.
        X[0:10, 2] = numpy.arange(10)
        X[10:, 2] = numpy.arange(10)+10
        y[0:10] = -1

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit2(minSplit, X, y)

        # assertEqual replaces the deprecated alias assertEquals, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(bestError, 0.0)
        self.assertEqual(bestFeatureInd, 2)
        self.assertEqual(bestThreshold, 9.5)

        self.assertTrue((bestLeftInds == numpy.arange(0, 10)).all())
        self.assertTrue((bestRightInds == numpy.arange(10, 20)).all())

        #Test case where all values are the same
        X = numpy.zeros((20, 10))

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit2(minSplit, X, y)
        self.assertEqual(bestRightInds.shape[0], 0)

        #Another simple example
        X = numpy.random.rand(20, 1)
        y = numpy.random.rand(20)

        inds = [1, 3, 7, 12, 14, 15]
        X[inds, 0] += 10
        y[inds] += 1

        # Smoke test only: the result of this call is not asserted on.
        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit2(minSplit, X, y)

        # Both implementations must agree on randomly sized random problems.
        for i in range(10):
            numExamples = numpy.random.randint(1, 200)
            numFeatures = numpy.random.randint(1, 10)

            X = numpy.random.rand(numExamples, numFeatures)
            y = numpy.random.rand(numExamples)

            bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y)
            bestError2, bestFeatureInd2, bestThreshold2, bestLeftInds2, bestRightInds2 = findBestSplit2(minSplit, X, y)

            self.assertEqual(bestFeatureInd, bestFeatureInd2)
            self.assertAlmostEqual(bestThreshold, bestThreshold2)
            nptst.assert_array_equal(bestLeftInds, bestLeftInds2)
            nptst.assert_array_equal(bestRightInds, bestRightInds2)

示例#4

0

显示文件

文件： TreeCriterionTest.py 项目： charanpald/sandbox

 def testFindBestSplit3(self):
     """
     Run ``findBestSplit3`` and ``findBestSplit`` on the same small problem
     and print both results for visual comparison. This test makes no
     assertions.
     """
     minSplit = 1
     numExamples = 20
     X = numpy.zeros((numExamples, 2), order="F")
     y = numpy.ones(numExamples)

     # Feature 0 takes shuffled values 0..9 for the first ten examples
     # (label -1) and shuffled values 10..19 for the rest (label +1).
     X[0:10, 0] = numpy.random.permutation(10)
     X[10:, 0] = numpy.random.permutation(10)+10
     y[0:10] = -1

     # The builtin int replaces numpy.int, an alias for it that was
     # deprecated in NumPy 1.20 and removed in NumPy 1.24.
     argsortX = numpy.zeros(X.shape, int, order="F")

     # argsort applied twice gives, per column, the position of every entry
     # in the sorted order of that column.
     for i in range(X.shape[1]):
         argsortX[:, i] = numpy.argsort(X[:, i])
         argsortX[:, i] = numpy.argsort(argsortX[:, i])

     # Restrict the node to all examples except the last two.
     inds = numpy.arange(numExamples-2, dtype=int)

     print(X)
     print(y)
     print(inds)
     print(X[inds, :], y[inds])

     tempX = X[inds, :]
     tempY = y[inds]

     # Labels on either side of the threshold 9.5 on feature 0.
     print(tempY[tempX[:, 0]<9.5])
     print(tempY[tempX[:, 0]>9.5])

     print("Calling function")
     bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit3(minSplit, X, y, inds, argsortX)

     print(bestError, bestFeatureInd, bestThreshold)

     bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, inds, argsortX)
     print(bestError, bestFeatureInd, bestThreshold)

示例#5

0

显示文件

文件： TreeCriterionTest.py 项目： charanpald/sandbox

    def testFindBestSplit(self):
        """
        Check ``findBestSplit`` on a perfectly separable problem, on constant
        features, on a random problem with a shifted subset of examples, with
        a larger ``minSplit``, and with restricted ``nodeInds``.
        """
        def computeArgsortX(X):
            # argsort applied twice gives, per column, the position of every
            # entry in the sorted order of that column.
            # The builtin int replaces numpy.int, an alias for it that was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            argsortX = numpy.zeros(X.shape, int)

            for i in range(X.shape[1]):
                argsortX[:, i] = numpy.argsort(X[:, i])
                argsortX[:, i] = numpy.argsort(argsortX[:, i])

            return argsortX

        minSplit = 1

        X = numpy.zeros((20, 10))
        y = numpy.ones(20)

        # Only feature 2 is informative: values 0..9 are labelled -1 and
        # values 10..19 are labelled +1, so the ideal threshold is 9.5.
        X[0:10, 2] = numpy.arange(10)
        X[10:, 2] = numpy.arange(10)+10
        y[0:10] = -1

        nodeInds = numpy.arange(X.shape[0])
        argsortX = computeArgsortX(X)

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, nodeInds, argsortX)

        # assertEqual replaces the deprecated alias assertEquals, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(bestError, 0.0)
        self.assertEqual(bestFeatureInd, 2)
        self.assertEqual(bestThreshold, 9.5)

        self.assertTrue((bestLeftInds == numpy.arange(0, 10)).all())
        self.assertTrue((bestRightInds == numpy.arange(10, 20)).all())

        #Test case where all values are the same
        X = numpy.zeros((20, 10))
        argsortX = computeArgsortX(X)

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, nodeInds, argsortX)
        self.assertEqual(bestLeftInds.shape[0], 0)
        self.assertEqual(bestRightInds.shape[0], X.shape[0])

        #Another simple example
        X = numpy.random.rand(20, 1)
        y = numpy.random.rand(20)

        # Shift a handful of examples far away in both feature and label so
        # that they are expected to form the right-hand side of the split.
        inds = [1, 3, 7, 12, 14, 15]
        X[inds, 0] += 10
        y[inds] += 1

        argsortX = computeArgsortX(X)

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, nodeInds, argsortX)
        nptst.assert_array_equal(bestRightInds, numpy.array(inds))

        #Test minSplit
        minSplit = 10
        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, nodeInds, argsortX)
        self.assertTrue(bestLeftInds.shape[0] >= minSplit)
        self.assertTrue(bestRightInds.shape[0] >= minSplit)

        #Vary nodeInds
        minSplit = 1
        nodeInds = numpy.arange(16)

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, nodeInds, argsortX)
        nptst.assert_array_equal(bestRightInds, numpy.array(inds))
        nptst.assert_array_equal(bestLeftInds, numpy.setdiff1d(nodeInds, numpy.array(inds)))

        # With only the first ten examples in the node, just the shifted
        # examples 1, 3 and 7 remain on the right.
        nodeInds = numpy.arange(10)

        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(minSplit, X, y, nodeInds, argsortX)
        nptst.assert_array_equal(bestRightInds, numpy.array([1,3,7]))
        nptst.assert_array_equal(bestLeftInds, numpy.setdiff1d(nodeInds, numpy.array([1,3,7])))