def testMode(self):
    """
    Check Util.mode on arrays with a unique mode, with tied modes and with
    a single element.
    """
    # (input values, expected mode). In the tied cases the expected value is
    # the smallest of the most frequent values.
    cases = [
        ([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 5], 3),
        ([1, 1, 1, 2, 2, 3, 3, 3, 5, 5], 1),
        ([1, 2, 3, 4], 1),
        ([0], 0),
    ]

    for values, expected in cases:
        x = numpy.array(values)
        # assertEqual rather than the assertEquals alias, which was removed
        # from unittest in Python 3.12.
        self.assertEqual(Util.mode(x), expected)
def testMode(self):
    """
    Check Util.mode on arrays with a unique mode, with tied modes and with
    a single element.
    """
    # (input values, expected mode). In the tied cases the expected value is
    # the smallest of the most frequent values.
    cases = [
        ([1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 5, 5], 3),
        ([1, 1, 1, 2, 2, 3, 3, 3, 5, 5], 1),
        ([1, 2, 3, 4], 1),
        ([0], 0),
    ]

    for values, expected in cases:
        x = numpy.array(values)
        # assertEqual rather than the assertEquals alias, which was removed
        # from unittest in Python 3.12.
        self.assertEqual(Util.mode(x), expected)
def growTree(self, X, y, argsortX, startId):
    """
    Grow the tree below the vertex startId using an explicit stack.

    For every vertex popped from the stack, candidate splits (one accuracy and
    one threshold per feature) are obtained from findBestSplitRisk, a feature
    is picked with Util.randomChoice and the training indices of the vertex
    are partitioned by the chosen threshold. Two children are added only when
    both sides of the split are non-empty and the tree depth is still below
    self.maxDepth; a child is expanded further only if it holds at least
    self.minSplit training examples.

    :param X: The examples, indexed as X[:, featureInd].
    :param y: The labels, indexed by the training indices of a vertex.
    :param argsortX: Passed unchanged to findBestSplitRisk.
    :param startId: The id of an existing vertex of self.tree to grow from.
    """
    eps = 10**-4
    idStack = [startId]

    while idStack:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        nodeInds = node.getTrainInds()

        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, nodeInds, argsortX)

        # Offset the accuracies before sampling a feature.
        # NOTE(review): presumably this keeps every feature selectable by
        # Util.randomChoice even when its accuracy is 0 -- confirm.
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        # Partition the training indices of this vertex by the threshold.
        featureValues = X[:, bestFeatureInd][nodeInds]
        bestLeftInds = numpy.sort(nodeInds[featureValues < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureValues >= bestThreshold])

        # Do not split when one side is empty. The number of indices is
        # tested, not their sum: a side holding only example 0 sums to 0
        # although it is not empty.
        canSplit = (bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0
                    and self.tree.depth() < self.maxDepth)

        if canSplit:
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def growTree(self, X, y, argsortX, startId):
    """
    Grow the tree below the vertex startId using an explicit stack.

    For every vertex popped from the stack, candidate splits (one accuracy and
    one threshold per feature) are obtained from findBestSplitRisk, a feature
    is picked with Util.randomChoice and the training indices of the vertex
    are partitioned by the chosen threshold. Two children are added only when
    both sides of the split are non-empty and the tree depth is still below
    self.maxDepth; a child is expanded further only if it holds at least
    self.minSplit training examples.

    :param X: The examples, indexed as X[:, featureInd].
    :param y: The labels, indexed by the training indices of a vertex.
    :param argsortX: Passed unchanged to findBestSplitRisk.
    :param startId: The id of an existing vertex of self.tree to grow from.
    """
    eps = 10**-4
    idStack = [startId]

    while idStack:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        nodeInds = node.getTrainInds()

        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, nodeInds, argsortX)

        # Offset the accuracies before sampling a feature.
        # NOTE(review): presumably this keeps every feature selectable by
        # Util.randomChoice even when its accuracy is 0 -- confirm.
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        # Partition the training indices of this vertex by the threshold.
        featureValues = X[:, bestFeatureInd][nodeInds]
        bestLeftInds = numpy.sort(nodeInds[featureValues < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureValues >= bestThreshold])

        # Do not split when one side is empty. The number of indices is
        # tested, not their sum: a side holding only example 0 sums to 0
        # although it is not empty.
        canSplit = (bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0
                    and self.tree.depth() < self.maxDepth)

        if canSplit:
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def learnModel(self, X, y):
    """
    Learn a decision tree for binary integer labels by repeated random
    regrowth.

    Starting from a single root vertex, each vertex id taken from a stack is
    pruned away and regrown self.sampleSize times (growTree followed by
    prune); the tree with the lowest treeObjective seen so far is kept as the
    best tree. The children of the processed vertex in the best tree are then
    pushed onto the stack. On return self.tree is the best tree found.

    :param X: The examples, a 2-dimensional array (one column per feature).
    :param y: The labels; must contain exactly two distinct integer values.
    :raises ValueError: If y does not have exactly two distinct values, or if
        its dtype is not the default integer dtype.
    """
    if numpy.unique(y).shape[0] != 2:
        raise ValueError("Must provide binary labels")

    # The numpy.int alias was removed in NumPy 1.24; it was the builtin int,
    # so comparing against int keeps the same check.
    if y.dtype != int:
        raise ValueError("Labels must be integers")

    self.shapeX = X.shape

    # Applying argsort twice to a column gives the rank of each entry.
    argsortX = numpy.zeros(X.shape, int)
    for i in range(X.shape[1]):
        argsortX[:, i] = numpy.argsort(X[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    rootId = (0,)
    idStack = [rootId]
    self.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(X.shape[0]), Util.mode(y))
    self.tree.setVertex(rootId, rootNode)
    bestError = float("inf")
    bestTree = self.tree

    while idStack:
        # Prune the current vertex away and regrow from it several times.
        nodeId = idStack.pop()

        for _ in range(self.sampleSize):
            self.tree = bestTree.deepCopy()

            # Look the vertex up first so that a missing id is reported
            # together with the tree before the error propagates.
            try:
                self.tree.getVertex(nodeId)
            except ValueError:
                print(nodeId)
                print(self.tree)
                raise

            self.tree.pruneVertex(nodeId)
            self.growTree(X, y, argsortX, nodeId)
            self.prune(X, y)
            error = self.treeObjective(X, y)

            if error < bestError:
                bestError = error
                bestTree = self.tree.deepCopy()

        children = bestTree.children(nodeId)
        idStack.extend(children)

    self.tree = bestTree
def learnModel(self, X, y):
    """
    Learn a decision tree for binary integer labels by repeated random
    regrowth.

    Starting from a single root vertex, each vertex id taken from a stack is
    pruned away and regrown self.sampleSize times (growTree followed by
    prune); the tree with the lowest treeObjective seen so far is kept as the
    best tree. The children of the processed vertex in the best tree are then
    pushed onto the stack. On return self.tree is the best tree found.

    :param X: The examples, a 2-dimensional array (one column per feature).
    :param y: The labels; must contain exactly two distinct integer values.
    :raises ValueError: If y does not have exactly two distinct values, or if
        its dtype is not the default integer dtype.
    """
    if numpy.unique(y).shape[0] != 2:
        raise ValueError("Must provide binary labels")

    # The numpy.int alias was removed in NumPy 1.24; it was the builtin int,
    # so comparing against int keeps the same check.
    if y.dtype != int:
        raise ValueError("Labels must be integers")

    self.shapeX = X.shape

    # Applying argsort twice to a column gives the rank of each entry.
    argsortX = numpy.zeros(X.shape, int)
    for i in range(X.shape[1]):
        argsortX[:, i] = numpy.argsort(X[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    rootId = (0,)
    idStack = [rootId]
    self.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(X.shape[0]), Util.mode(y))
    self.tree.setVertex(rootId, rootNode)
    bestError = float("inf")
    bestTree = self.tree

    while idStack:
        # Prune the current vertex away and regrow from it several times.
        nodeId = idStack.pop()

        for _ in range(self.sampleSize):
            self.tree = bestTree.deepCopy()

            # Look the vertex up first so that a missing id is reported
            # together with the tree before the error propagates.
            try:
                self.tree.getVertex(nodeId)
            except ValueError:
                print(nodeId)
                print(self.tree)
                raise

            self.tree.pruneVertex(nodeId)
            self.growTree(X, y, argsortX, nodeId)
            self.prune(X, y)
            error = self.treeObjective(X, y)

            if error < bestError:
                bestError = error
                bestTree = self.tree.deepCopy()

        children = bestTree.children(nodeId)
        idStack.extend(children)

    self.tree = bestTree
def testPrune(self):
    """
    Grow a tree on the training rows, prune it, and check that pruning does
    not increase the tree objective or the number of vertices and that no
    remaining vertex has an alpha above learner.alphaThreshold.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 5
    gamma = 0.05
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    # Rows from index 100 onwards are used for training.
    trainX = self.X[100:, :]
    trainY = self.y[100:]

    # Per-column ranks (argsort applied twice). The builtin int replaces the
    # numpy.int alias, which was removed in NumPy 1.24.
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)
    learner.growTree(trainX, trainY, argsortX, startId)
    learner.shapeX = trainX.shape
    learner.predict(trainX, trainY)
    learner.computeAlphas()

    obj1 = learner.treeObjective(trainX, trainY)
    size1 = learner.tree.getNumVertices()

    # Now prune and compare against the unpruned tree.
    learner.prune(trainX, trainY)

    obj2 = learner.treeObjective(trainX, trainY)
    size2 = learner.tree.getNumVertices()

    self.assertGreaterEqual(obj1, obj2)
    self.assertGreaterEqual(size1, size2)

    # Check there are no vertices left with alpha > alphaThreshold.
    for vertexId in learner.tree.getAllVertexIds():
        self.assertLessEqual(learner.tree.getVertex(vertexId).alpha, learner.alphaThreshold)
def testPrune(self):
    """
    Grow a tree on the training rows, prune it, and check that pruning does
    not increase the tree objective or the number of vertices and that no
    remaining vertex has an alpha above learner.alphaThreshold.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 5
    gamma = 0.05
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    # Rows from index 100 onwards are used for training.
    trainX = self.X[100:, :]
    trainY = self.y[100:]

    # Per-column ranks (argsort applied twice). The builtin int replaces the
    # numpy.int alias, which was removed in NumPy 1.24.
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)
    learner.growTree(trainX, trainY, argsortX, startId)
    learner.shapeX = trainX.shape
    learner.predict(trainX, trainY)
    learner.computeAlphas()

    obj1 = learner.treeObjective(trainX, trainY)
    size1 = learner.tree.getNumVertices()

    # Now prune and compare against the unpruned tree.
    learner.prune(trainX, trainY)

    obj2 = learner.treeObjective(trainX, trainY)
    size2 = learner.tree.getNumVertices()

    self.assertGreaterEqual(obj1, obj2)
    self.assertGreaterEqual(size1, size2)

    # Check there are no vertices left with alpha > alphaThreshold.
    for vertexId in learner.tree.getAllVertexIds():
        self.assertLessEqual(learner.tree.getVertex(vertexId).alpha, learner.alphaThreshold)
def learnModel(self, X, y):
    """
    Record the mode of the labels y as self.majorLabel. The examples X are
    not used.
    """
    majority = Util.mode(y)
    self.majorLabel = majority
def testGrowTree(self):
    """
    Grow trees repeatedly from the root and then from the vertex (0, 1),
    checking the depth and minimum split constraints and that regrowing a
    subtree never makes the best test error worse.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 3
    gamma = 0.01
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    trainX = self.X[100:, :]
    trainY = self.y[100:]
    testX = self.X[0:100, :]
    testY = self.y[0:100]

    # Per-column ranks (argsort applied twice). The builtin int replaces the
    # numpy.int alias, which was removed in NumPy 1.24.
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)

    # Note that this matches with the case where we create a new tree each time
    numpy.random.seed(21)
    bestError = float("inf")

    for i in range(20):
        learner.tree.pruneVertex(startId)
        learner.growTree(trainX, trainY, argsortX, startId)

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

        self.assertLessEqual(learner.tree.depth(), maxDepth)

        # Every vertex that was split must hold at least minSplit examples.
        for vertexId in learner.tree.nonLeaves():
            self.assertGreaterEqual(learner.tree.getVertex(vertexId).getTrainInds().shape[0], minSplit)

    bestError1 = bestError
    learner.tree = bestTree

    # Now we test growing a tree from a non-root vertex.
    # NOTE(review): learner.tree and bestTree are the same object until a
    # better copy is found below, so the two equality checks compare the
    # tree with itself until then -- confirm this is intended.
    numpy.random.seed(21)
    for i in range(20):
        learner.tree.pruneVertex((0, 1))
        learner.growTree(trainX, trainY, argsortX, (0, 1))

        self.assertEqual(learner.tree.getVertex((0, )), bestTree.getVertex((0, )))
        self.assertEqual(learner.tree.getVertex((0, 0)), bestTree.getVertex((0, 0)))

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

    self.assertGreaterEqual(bestError1, bestError)
def testComputeAlphas(self):
    """
    Learn a tree on the first 100 examples and check the alpha stored at
    every vertex against a value recomputed from the test errors of the
    vertices, then check the alpha of the root using the predictions.
    """
    minSplit = 20
    maxDepth = 3
    gamma = 0.1

    X = self.X[0:100, :]
    y = self.y[0:100]

    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)
    learner.learnModel(X, y)
    tree = learner.getTree()

    rootId = (0, )
    learner.tree.getVertex(rootId).setTestInds(numpy.arange(X.shape[0]))
    learner.predict(X, y)
    learner.computeAlphas()

    n = float(X.shape[0])

    # See if the alpha values of the vertices are correct.
    for vertexId in tree.getAllVertexIds():
        # Penalised error of the tree: the test errors of the leaves below
        # this vertex plus a penalty on the total number of vertices T.
        subtreeError = 0
        for subtreeLeaf in tree.leaves(vertexId):
            subtreeError += (1 - gamma) * tree.getVertex(subtreeLeaf).getTestError()

        T = tree.getNumVertices()
        subtreeError /= n
        subtreeError += gamma * numpy.sqrt(T)

        # Penalised error with the subtree at this vertex collapsed into a
        # single vertex, which leaves T2 vertices in the tree.
        T2 = T - len(tree.subtreeIds(vertexId)) + 1
        vertexError = (1 - gamma) * tree.getVertex(vertexId).getTestError() / n
        vertexError += gamma * numpy.sqrt(T2)

        # assertAlmostEqual/assertEqual replace the deprecated aliases
        # removed from unittest in Python 3.12.
        self.assertAlmostEqual(subtreeError - vertexError, tree.getVertex(vertexId).alpha)

        if tree.isLeaf(vertexId):
            self.assertEqual(tree.getVertex(vertexId).alpha, 0.0)

    # Let's check the alpha of the root vertex via another method.
    T = 1
    vertexError = (1 - gamma) * numpy.sum(y != Util.mode(y)) / n
    vertexError += gamma * numpy.sqrt(T)

    T = tree.getNumVertices()
    treeError = (1 - gamma) * numpy.sum(y != learner.predict(X)) / n
    treeError += gamma * numpy.sqrt(T)

    alpha = treeError - vertexError
    self.assertAlmostEqual(alpha, tree.getVertex(rootId).alpha)
def testGrowTree(self):
    """
    Grow trees repeatedly from the root and then from the vertex (0, 1),
    checking the depth and minimum split constraints and that regrowing a
    subtree never makes the best test error worse.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 3
    gamma = 0.01
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    trainX = self.X[100:, :]
    trainY = self.y[100:]
    testX = self.X[0:100, :]
    testY = self.y[0:100]

    # Per-column ranks (argsort applied twice). The builtin int replaces the
    # numpy.int alias, which was removed in NumPy 1.24.
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)

    # Note that this matches with the case where we create a new tree each time
    numpy.random.seed(21)
    bestError = float("inf")

    for i in range(20):
        learner.tree.pruneVertex(startId)
        learner.growTree(trainX, trainY, argsortX, startId)

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

        self.assertLessEqual(learner.tree.depth(), maxDepth)

        # Every vertex that was split must hold at least minSplit examples.
        for vertexId in learner.tree.nonLeaves():
            self.assertGreaterEqual(learner.tree.getVertex(vertexId).getTrainInds().shape[0], minSplit)

    bestError1 = bestError
    learner.tree = bestTree

    # Now we test growing a tree from a non-root vertex.
    # NOTE(review): learner.tree and bestTree are the same object until a
    # better copy is found below, so the two equality checks compare the
    # tree with itself until then -- confirm this is intended.
    numpy.random.seed(21)
    for i in range(20):
        learner.tree.pruneVertex((0, 1))
        learner.growTree(trainX, trainY, argsortX, (0, 1))

        self.assertEqual(learner.tree.getVertex((0, )), bestTree.getVertex((0, )))
        self.assertEqual(learner.tree.getVertex((0, 0)), bestTree.getVertex((0, 0)))

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

    self.assertGreaterEqual(bestError1, bestError)
def testComputeAlphas(self):
    """
    Learn a tree on the first 100 examples and check the alpha stored at
    every vertex against a value recomputed from the test errors of the
    vertices, then check the alpha of the root using the predictions.
    """
    minSplit = 20
    maxDepth = 3
    gamma = 0.1

    X = self.X[0:100, :]
    y = self.y[0:100]

    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)
    learner.learnModel(X, y)
    tree = learner.getTree()

    rootId = (0, )
    learner.tree.getVertex(rootId).setTestInds(numpy.arange(X.shape[0]))
    learner.predict(X, y)
    learner.computeAlphas()

    n = float(X.shape[0])

    # See if the alpha values of the vertices are correct.
    for vertexId in tree.getAllVertexIds():
        # Penalised error of the tree: the test errors of the leaves below
        # this vertex plus a penalty on the total number of vertices T.
        subtreeError = 0
        for subtreeLeaf in tree.leaves(vertexId):
            subtreeError += (1 - gamma) * tree.getVertex(subtreeLeaf).getTestError()

        T = tree.getNumVertices()
        subtreeError /= n
        subtreeError += gamma * numpy.sqrt(T)

        # Penalised error with the subtree at this vertex collapsed into a
        # single vertex, which leaves T2 vertices in the tree.
        T2 = T - len(tree.subtreeIds(vertexId)) + 1
        vertexError = (1 - gamma) * tree.getVertex(vertexId).getTestError() / n
        vertexError += gamma * numpy.sqrt(T2)

        # assertAlmostEqual/assertEqual replace the deprecated aliases
        # removed from unittest in Python 3.12.
        self.assertAlmostEqual(subtreeError - vertexError, tree.getVertex(vertexId).alpha)

        if tree.isLeaf(vertexId):
            self.assertEqual(tree.getVertex(vertexId).alpha, 0.0)

    # Let's check the alpha of the root vertex via another method.
    T = 1
    vertexError = (1 - gamma) * numpy.sum(y != Util.mode(y)) / n
    vertexError += gamma * numpy.sqrt(T)

    T = tree.getNumVertices()
    treeError = (1 - gamma) * numpy.sum(y != learner.predict(X)) / n
    treeError += gamma * numpy.sqrt(T)

    alpha = treeError - vertexError
    self.assertAlmostEqual(alpha, tree.getVertex(rootId).alpha)