class PenaltyDecisionTree(AbstractPredictor):
    """
    A binary classification tree chosen by a randomised search which
    minimises the empirical risk plus a penalty of gamma*sqrt(number of
    vertices in the tree).
    """
    def __init__(self, criterion="gain", maxDepth=10, minSplit=30, learnType="reg", pruning=True, gamma=0.01, sampleSize=10):
        """
        Learn a decision tree with penalty proportional to the root of the size
        of the tree as in Nobel 2002. We use a stochastic approach in which we
        learn a set of trees randomly and choose the best one.

        :param criterion: The splitting criterion which is only information gain currently

        :param maxDepth: The maximum depth of the tree
        :type maxDepth: `int`

        :param minSplit: The minimum size of a node for it to be split.
        :type minSplit: `int`

        :param learnType: The type of learning to perform. Currently only regression

        :param pruning: Whether to perform pruning or not.
        :type pruning: `boolean`

        :param gamma: The weight on the penalty factor between 0 and 1
        :type gamma: `float`

        :param sampleSize: The number of trees to learn in the stochastic search.
        :type sampleSize: `int`
        """
        super(PenaltyDecisionTree, self).__init__()
        self.maxDepth = maxDepth
        self.minSplit = minSplit
        self.criterion = criterion
        self.learnType = learnType
        self.setGamma(gamma)
        self.setSampleSize(sampleSize)
        # NOTE(review): this flag is stored and forwarded by copy(), but
        # learnModel() calls prune() regardless of its value.
        self.pruning = pruning
        self.alphaThreshold = 0.0

    def setGamma(self, gamma):
        """
        Set the penalty weight, which must be a float in [0, 1].
        """
        Parameter.checkFloat(gamma, 0.0, 1.0)
        self.gamma = gamma

    def setSampleSize(self, sampleSize):
        """
        Set the number of random trees grown per node, an int of at least 1.
        """
        Parameter.checkInt(sampleSize, 1, float("inf"))
        self.sampleSize = sampleSize

    def setAlphaThreshold(self, alphaThreshold):
        """
        Set the threshold: prune() collapses a vertex when its alpha is
        strictly greater than this value.
        """
        Parameter.checkFloat(alphaThreshold, -float("inf"), float("inf"))
        self.alphaThreshold = alphaThreshold

    def getAlphaThreshold(self):
        """
        Return the threshold used by prune().
        """
        return self.alphaThreshold

    def getLeftChildId(self, nodeId):
        """
        Return the id of the left child: the tuple nodeId with 0 appended.
        """
        return tuple(nodeId) + (0,)

    def getRightChildId(self, nodeId):
        """
        Return the id of the right child: the tuple nodeId with 1 appended.
        """
        return tuple(nodeId) + (1,)

    def getTree(self):
        """
        Return the tree set by learnModel.
        """
        return self.tree

    def learnModel(self, X, y):
        """
        Learn the tree from the examples X and labels y. Starting at the root,
        each node on the stack is regrown sampleSize times at random and the
        tree with the lowest value of treeObjective is kept.

        :param X: A 2D array of examples, one per row.

        :param y: An integer array of labels with exactly two distinct values.

        :raises ValueError: if y is not binary or does not have an integer dtype.
        """
        if numpy.unique(y).shape[0] != 2:
            raise ValueError("Must provide binary labels")
        # numpy.int no longer exists in recent NumPy versions, so test the
        # dtype kind instead.
        if not numpy.issubdtype(y.dtype, numpy.integer):
            raise ValueError("Labels must be integers")

        self.shapeX = X.shape
        # Applying argsort twice gives the rank of each value in its column
        argsortX = numpy.zeros(X.shape, int)
        for i in range(X.shape[1]):
            argsortX[:, i] = numpy.argsort(numpy.argsort(X[:, i]))

        rootId = (0,)
        idStack = [rootId]
        self.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(X.shape[0]), Util.mode(y))
        self.tree.setVertex(rootId, rootNode)
        bestError = float("inf")
        bestTree = self.tree

        #First grow a selection of trees
        while idStack:
            #Prune the current node away and grow from that node
            nodeId = idStack.pop()

            for i in range(self.sampleSize):
                self.tree = bestTree.deepCopy()
                try:
                    # Only checks that the vertex exists in the copied tree
                    self.tree.getVertex(nodeId)
                except ValueError:
                    print(nodeId)
                    print(self.tree)
                    raise

                self.tree.pruneVertex(nodeId)
                self.growTree(X, y, argsortX, nodeId)
                self.prune(X, y)
                error = self.treeObjective(X, y)

                if error < bestError:
                    bestError = error
                    bestTree = self.tree.deepCopy()

            children = bestTree.children(nodeId)
            idStack.extend(children)

        self.tree = bestTree

    def growTree(self, X, y, argsortX, startId):
        """
        Grow a tree using a stack. Give a sample of data and a node index, we
        find the best split and add children to the tree accordingly. We perform
        pre-pruning based on the penalty.

        :param X: The 2D array of examples.

        :param y: The labels of the examples.

        :param argsortX: The array computed in learnModel, passed on to findBestSplitRisk.

        :param startId: The id of the existing vertex to grow from.
        """
        eps = 10**-4
        idStack = [startId]

        while idStack:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, node.getTrainInds(), argsortX)

            #Choose best feature based on gains
            accuracies += eps
            bestFeatureInd = Util.randomChoice(accuracies)[0]
            bestThreshold = thresholds[bestFeatureInd]

            nodeInds = node.getTrainInds()
            featureVals = X[:, bestFeatureInd][nodeInds]
            bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
            bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

            #The split may have 0 items in one set, so don't split
            # Count the indices: summing them would treat a child holding
            # only example 0 as empty.
            if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 and self.tree.depth() < self.maxDepth:
                node.setError(1-accuracies[bestFeatureInd])
                node.setFeatureInd(bestFeatureInd)
                node.setThreshold(bestThreshold)

                leftChildId = self.getLeftChildId(nodeId)
                leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
                self.tree.addChild(nodeId, leftChildId, leftChild)

                if leftChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(leftChildId)

                rightChildId = self.getRightChildId(nodeId)
                rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
                self.tree.addChild(nodeId, rightChildId, rightChild)

                if rightChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(rightChildId)

    def predict(self, X, y=None):
        """
        Make a prediction for the set of examples given in the matrix X. If
        one passes in a label vector y then we set the errors for each node. On
        the other hand if y=None, no errors are set.

        :param X: The 2D array of examples to predict.

        :param y: Optional labels for X, used only to set the node test errors.

        :return: An array with one predicted value per row of X.
        """
        rootId = (0,)
        predY = numpy.zeros(X.shape[0])
        self.tree.getVertex(rootId).setTestInds(numpy.arange(X.shape[0]))
        idStack = [rootId]

        while idStack:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            testInds = node.getTestInds()

            # Identity test: "y != None" compares element-wise on an array
            if y is not None:
                node.setTestError(self.vertexTestError(y[testInds], node.getValue()))

            if self.tree.isLeaf(nodeId):
                predY[testInds] = node.getValue()
            else:
                for childId in [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)]:
                    if self.tree.vertexExists(childId):
                        child = self.tree.getVertex(childId)

                        # An id ending in 0 is a left child (below threshold)
                        if childId[-1] == 0:
                            childInds = X[testInds, node.getFeatureInd()] < node.getThreshold()
                        else:
                            childInds = X[testInds, node.getFeatureInd()] >= node.getThreshold()

                        child.setTestInds(testInds[childInds])
                        idStack.append(childId)

        return predY

    def treeObjective(self, X, y):
        """
        Return the empirical risk plus penalty for the tree, that is
        (1-gamma)*(proportion of errors on X) + gamma*sqrt(number of vertices).
        """
        predY = self.predict(X)
        (n, d) = X.shape
        return (1-self.gamma)*numpy.sum(predY!=y)/float(n) + self.gamma*numpy.sqrt(self.tree.getNumVertices())

    def prune(self, X, y):
        """
        Do some post pruning greedily. Starting from the root, a vertex is
        pruned when its alpha exceeds the alpha threshold, otherwise its
        children are examined.
        """
        self.predict(X, y)
        self.computeAlphas()

        #Do the pruning, recomputing alpha along the way
        rootId = (0,)
        idStack = [rootId]

        while idStack:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)

            if node.alpha > self.alphaThreshold:
                self.tree.pruneVertex(nodeId)
                self.computeAlphas()
            else:
                for childId in [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)]:
                    if self.tree.vertexExists(childId):
                        idStack.append(childId)

    def vertexTestError(self, trueY, predY):
        """
        This is the error used for pruning. We compute it at each node. It is
        the number of entries of trueY which differ from predY.
        """
        return numpy.sum(trueY != predY)

    def computeAlphas(self):
        """
        The alpha value at each vertex is the improvement in the objective by
        pruning at that vertex. The test errors of the nodes must already be
        set, as done by predict(X, y).
        """
        n = self.shapeX[0]
        # The tree is not modified in the loop so its size is computed once
        T = self.tree.getNumVertices()
        sqrtT = numpy.sqrt(T)

        for vertexId in self.tree.getAllVertexIds():
            currentNode = self.tree.getVertex(vertexId)
            subtreeLeaves = self.tree.leaves(vertexId)

            subtreeError = 0
            for leaf in subtreeLeaves:
                subtreeError += self.tree.getVertex(leaf).getTestError()

            # Size of the tree if the subtree at vertexId became one leaf
            T2 = T - len(self.tree.subtreeIds(vertexId)) + 1
            alpha = (1-self.gamma)*(subtreeError - currentNode.getTestError())
            alpha /= n
            alpha += self.gamma * sqrtT
            alpha -= self.gamma * numpy.sqrt(T2)
            currentNode.alpha = alpha

    def copy(self):
        """
        Create a new tree with the same parameters. The learnt tree and the
        alpha threshold are not copied.
        """
        newLearner = PenaltyDecisionTree(criterion=self.criterion, maxDepth=self.maxDepth, minSplit=self.minSplit, learnType=self.learnType, pruning=self.pruning, gamma=self.gamma, sampleSize=self.sampleSize)
        return newLearner

    def getMetricMethod(self):
        """
        Returns a way to measure the performance of the classifier.
        """
        return Evaluator.binaryError
class DictGraphTest(unittest.TestCase):
    # Unit tests for DictTree. Comments are used instead of method docstrings
    # so that the names reported by the test runner do not change.

    def setUp(self):
        # Shared fixture: a -> (b, c), b -> (d, e), e -> f
        self.dictTree = DictTree()
        self.dictTree.setVertex("a", "foo")
        self.dictTree.addEdge("a", "b")
        self.dictTree.addEdge("a", "c")
        self.dictTree.addEdge("b", "d")
        self.dictTree.addEdge("b", "e")
        self.dictTree.addEdge("e", "f")

    def testInit(self):
        # Construction alone must not raise
        dictTree = DictTree()

    def testAddEdge(self):
        dictTree = DictTree()
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")

        #Add duplicate edge
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")

        # "a" already has parent "d"
        self.assertRaises(ValueError, dictTree.addEdge, "e", "a")

        #Add isolated edge
        self.assertRaises(ValueError, dictTree.addEdge, "r", "s")

    def testGetRoot(self):
        dictTree = DictTree()
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")
        self.assertEqual(dictTree.getRootId(), "d")

        dictTree.addEdge("e", "d")
        self.assertEqual(dictTree.getRootId(), "e")

    def testSetVertex(self):
        dictTree = DictTree()
        dictTree.setVertex("a")
        self.assertEqual(dictTree.getVertex("a"), None)

        # A second vertex with no edge to the tree is rejected
        self.assertRaises(RuntimeError, dictTree.setVertex, "b")

        dictTree.setVertex("a", 12)
        self.assertEqual(dictTree.getVertex("a"), 12)

    def testStr(self):
        # Only builds a tree; nothing is asserted about its string form
        dictTree = DictTree()
        dictTree.addEdge(0, 1)
        dictTree.addEdge(0, 2)
        dictTree.addEdge(2, 3)
        dictTree.addEdge(2, 4)
        dictTree.addEdge(0, 5)
        dictTree.addEdge(4, 6)

    def testDepth(self):
        dictTree = DictTree()
        self.assertEqual(dictTree.depth(), 0)

        dictTree.setVertex("a")
        self.assertEqual(dictTree.depth(), 0)

        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")
        self.assertEqual(dictTree.depth(), 2)

        dictTree.addEdge("c", "e")
        self.assertEqual(dictTree.depth(), 3)

    def testCutTree(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        dictTree.addEdge("a", "b", 2)
        dictTree.addEdge("a", "c")
        dictTree.addEdge("c", "d", 5)
        dictTree.addEdge("c", "f")

        A = numpy.array([10, 2])
        dictTree.setVertex("b", A)

        # Cutting at the full depth keeps everything
        newTree = dictTree.cut(2)
        self.assertEqual(newTree.getVertex("a"), "foo")
        self.assertTrue((newTree.getVertex("b") == A).all())
        self.assertEqual(newTree.getEdge("a", "b"), 2)
        self.assertEqual(newTree.getEdge("a", "c"), 1)
        self.assertEqual(newTree.getEdge("c", "d"), 5)
        self.assertEqual(newTree.getEdge("c", "f"), 1)
        self.assertEqual(newTree.getNumVertices(), dictTree.getNumVertices())
        self.assertEqual(newTree.getNumEdges(), dictTree.getNumEdges())

        newTree = dictTree.cut(1)
        self.assertEqual(newTree.getEdge("a", "b"), 2)
        self.assertEqual(newTree.getEdge("a", "c"), 1)
        self.assertEqual(newTree.getNumVertices(), 3)
        self.assertEqual(newTree.getNumEdges(), 2)

        newTree = dictTree.cut(0)
        self.assertEqual(newTree.getNumVertices(), 1)
        self.assertEqual(newTree.getNumEdges(), 0)

    def testLeaves(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        self.assertEqual(set(dictTree.leaves()), {"a"})

        dictTree.addEdge("a", "b", 2)
        dictTree.addEdge("a", "c")
        dictTree.addEdge("c", "d", 5)
        dictTree.addEdge("c", "f")
        self.assertEqual(set(dictTree.leaves()), {"b", "d", "f"})

        dictTree.addEdge("b", 1)
        dictTree.addEdge("b", 2)
        self.assertEqual(set(dictTree.leaves()), {1, 2, "d", "f"})

        #Test isSubtree leaves
        self.assertEqual(set(dictTree.leaves("c")), {"d", "f"})
        self.assertEqual(set(dictTree.leaves("b")), {1, 2})

    def testAddChild(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        dictTree.addChild("a", "c", 2)
        dictTree.addChild("a", "d", 5)

        self.assertEqual(set(dictTree.leaves()), {"c", "d"})
        self.assertEqual(dictTree.getVertex("c"), 2)
        self.assertEqual(dictTree.getVertex("d"), 5)

        # Previously assertTrue(value, 1.0), which used 1.0 as the failure
        # message and so compared nothing.
        self.assertEqual(dictTree.getEdge("a", "d"), 1.0)
        self.assertEqual(dictTree.getEdge("a", "c"), 1.0)

    def testPruneVertex(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("b", "d")
        dictTree.addEdge("b", "e")
        dictTree.addEdge("e", "f")

        # Pruning removes the descendants but keeps the vertex itself
        dictTree.pruneVertex("b")
        self.assertFalse(dictTree.edgeExists("b", "e"))
        self.assertFalse(dictTree.edgeExists("b", "d"))
        self.assertFalse(dictTree.edgeExists("e", "f"))
        self.assertTrue(dictTree.vertexExists("b"))
        self.assertFalse(dictTree.vertexExists("d"))
        self.assertFalse(dictTree.vertexExists("e"))
        self.assertFalse(dictTree.vertexExists("f"))

        dictTree.pruneVertex("a")
        self.assertEqual(dictTree.getNumVertices(), 1)

    def testIsLeaf(self):
        for vertexId in ["c", "d", "f"]:
            self.assertTrue(self.dictTree.isLeaf(vertexId))

        for vertexId in ["a", "b", "e"]:
            self.assertFalse(self.dictTree.isLeaf(vertexId))

    def testIsNonLeaf(self):
        for vertexId in ["c", "d", "f"]:
            self.assertFalse(self.dictTree.isNonLeaf(vertexId))

        for vertexId in ["a", "b", "e"]:
            self.assertTrue(self.dictTree.isNonLeaf(vertexId))

    def testCopy(self):
        newTree = self.dictTree.copy()
        newTree.addEdge("f", "x")
        newTree.addEdge("f", "y")

        self.assertEqual(newTree.getNumVertices(), self.dictTree.getNumVertices()+2)
        self.assertTrue(newTree.vertexExists("x"))
        self.assertTrue(newTree.vertexExists("y"))

        # The original must not gain either vertex ("x" was checked twice)
        self.assertFalse(self.dictTree.vertexExists("x"))
        self.assertFalse(self.dictTree.vertexExists("y"))

    def testisSubtree(self):
        # Grow a tree from the root of the fixture
        newTree = DictTree()
        newTree.addEdge("a", "b")
        newTree.addEdge("a", "c")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("b", "d")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("b", "e")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("e", "f")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("a", "g")
        self.assertFalse(newTree.isSubtree(self.dictTree))

        # Grow a tree from an inner vertex of the fixture
        newTree = DictTree()
        newTree.addEdge("b", "d")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("b", "e")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("e", "f")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("f", "g")
        self.assertFalse(newTree.isSubtree(self.dictTree))

        newTree = DictTree()
        newTree.setVertex("b")
        self.assertTrue(newTree.isSubtree(self.dictTree))
        self.assertFalse(self.dictTree.isSubtree(newTree))

        self.assertTrue(self.dictTree.isSubtree(self.dictTree))

    def testDeepCopy(self):
        class A:
            def __init__(self, x, y):
                self.x = x
                self.y = y

        a = A(1, numpy.array([1, 2]))
        self.dictTree.setVertex("a", a)

        newTree = self.dictTree.deepCopy()
        newTree.addEdge("f", "x")
        newTree.addEdge("f", "y")

        self.assertEqual(newTree.getNumVertices(), self.dictTree.getNumVertices()+2)
        self.assertTrue(newTree.vertexExists("x"))
        self.assertTrue(newTree.vertexExists("y"))

        # The original must not gain either vertex ("x" was checked twice)
        self.assertFalse(self.dictTree.vertexExists("x"))
        self.assertFalse(self.dictTree.vertexExists("y"))

        self.assertEqual(self.dictTree.getVertex("a"), a)
        self.assertEqual(newTree.getVertex("a").x, 1)
        self.assertEqual(self.dictTree.getVertex("a").x, 1)

        # Changing the original object must not affect the deep copy
        a.x = 10
        self.assertEqual(newTree.getVertex("a").x, 1)
        self.assertEqual(self.dictTree.getVertex("a").x, 10)

        nptst.assert_array_equal(newTree.getVertex("a").y, numpy.array([1, 2]))
        nptst.assert_array_equal(self.dictTree.getVertex("a").y, numpy.array([1, 2]))

        a.y = numpy.array([1, 2, 3])
        nptst.assert_array_equal(newTree.getVertex("a").y, numpy.array([1, 2]))
        nptst.assert_array_equal(self.dictTree.getVertex("a").y, numpy.array([1, 2, 3]))

    def testSubtree(self):
        newTree = DictTree()
        newTree.addEdge("a", "b")
        newTree.addEdge("a", "c")

        subtree = newTree.subtreeAt("b")
        self.assertEqual(subtree.getAllVertexIds(), ["b"])

        subtree = newTree.subtreeAt("c")
        self.assertEqual(subtree.getAllVertexIds(), ["c"])

        subtree = newTree.subtreeAt("a")
        self.assertEqual(set(subtree.getAllVertexIds()), {"a", "c", "b"})
class PenaltyDecisionTree(AbstractPredictor):
    """
    A binary classification tree chosen by a randomised search which
    minimises the empirical risk plus a penalty of gamma*sqrt(number of
    vertices in the tree).
    """
    def __init__(self, criterion="gain", maxDepth=10, minSplit=30, learnType="reg", pruning=True, gamma=0.01, sampleSize=10):
        """
        Learn a decision tree with penalty proportional to the root of the size
        of the tree as in Nobel 2002. We use a stochastic approach in which we
        learn a set of trees randomly and choose the best one.

        :param criterion: The splitting criterion which is only information gain currently

        :param maxDepth: The maximum depth of the tree
        :type maxDepth: `int`

        :param minSplit: The minimum size of a node for it to be split.
        :type minSplit: `int`

        :param learnType: The type of learning to perform. Currently only regression

        :param pruning: Whether to perform pruning or not.
        :type pruning: `boolean`

        :param gamma: The weight on the penalty factor between 0 and 1
        :type gamma: `float`

        :param sampleSize: The number of trees to learn in the stochastic search.
        :type sampleSize: `int`
        """
        super(PenaltyDecisionTree, self).__init__()
        self.maxDepth = maxDepth
        self.minSplit = minSplit
        self.criterion = criterion
        self.learnType = learnType
        self.setGamma(gamma)
        self.setSampleSize(sampleSize)
        # NOTE(review): this flag is stored and forwarded by copy(), but
        # learnModel() calls prune() regardless of its value.
        self.pruning = pruning
        self.alphaThreshold = 0.0

    def setGamma(self, gamma):
        """
        Set the penalty weight, which must be a float in [0, 1].
        """
        Parameter.checkFloat(gamma, 0.0, 1.0)
        self.gamma = gamma

    def setSampleSize(self, sampleSize):
        """
        Set the number of random trees grown per node, an int of at least 1.
        """
        Parameter.checkInt(sampleSize, 1, float("inf"))
        self.sampleSize = sampleSize

    def setAlphaThreshold(self, alphaThreshold):
        """
        Set the threshold: prune() collapses a vertex when its alpha is
        strictly greater than this value.
        """
        Parameter.checkFloat(alphaThreshold, -float("inf"), float("inf"))
        self.alphaThreshold = alphaThreshold

    def getAlphaThreshold(self):
        """
        Return the threshold used by prune().
        """
        return self.alphaThreshold

    def getLeftChildId(self, nodeId):
        """
        Return the id of the left child: the tuple nodeId with 0 appended.
        """
        return tuple(nodeId) + (0, )

    def getRightChildId(self, nodeId):
        """
        Return the id of the right child: the tuple nodeId with 1 appended.
        """
        return tuple(nodeId) + (1, )

    def getTree(self):
        """
        Return the tree set by learnModel.
        """
        return self.tree

    def learnModel(self, X, y):
        """
        Learn the tree from the examples X and labels y. Starting at the root,
        each node on the stack is regrown sampleSize times at random and the
        tree with the lowest value of treeObjective is kept.

        :param X: A 2D array of examples, one per row.

        :param y: An integer array of labels with exactly two distinct values.

        :raises ValueError: if y is not binary or does not have an integer dtype.
        """
        if numpy.unique(y).shape[0] != 2:
            raise ValueError("Must provide binary labels")
        # numpy.int no longer exists in recent NumPy versions, so test the
        # dtype kind instead.
        if not numpy.issubdtype(y.dtype, numpy.integer):
            raise ValueError("Labels must be integers")

        self.shapeX = X.shape
        # Applying argsort twice gives the rank of each value in its column
        argsortX = numpy.zeros(X.shape, int)
        for i in range(X.shape[1]):
            argsortX[:, i] = numpy.argsort(numpy.argsort(X[:, i]))

        rootId = (0, )
        idStack = [rootId]
        self.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(X.shape[0]), Util.mode(y))
        self.tree.setVertex(rootId, rootNode)
        bestError = float("inf")
        bestTree = self.tree

        #First grow a selection of trees
        while idStack:
            #Prune the current node away and grow from that node
            nodeId = idStack.pop()

            for i in range(self.sampleSize):
                self.tree = bestTree.deepCopy()
                try:
                    # Only checks that the vertex exists in the copied tree
                    self.tree.getVertex(nodeId)
                except ValueError:
                    print(nodeId)
                    print(self.tree)
                    raise

                self.tree.pruneVertex(nodeId)
                self.growTree(X, y, argsortX, nodeId)
                self.prune(X, y)
                error = self.treeObjective(X, y)

                if error < bestError:
                    bestError = error
                    bestTree = self.tree.deepCopy()

            children = bestTree.children(nodeId)
            idStack.extend(children)

        self.tree = bestTree

    def growTree(self, X, y, argsortX, startId):
        """
        Grow a tree using a stack. Give a sample of data and a node index, we
        find the best split and add children to the tree accordingly. We perform
        pre-pruning based on the penalty.

        :param X: The 2D array of examples.

        :param y: The labels of the examples.

        :param argsortX: The array computed in learnModel, passed on to findBestSplitRisk.

        :param startId: The id of the existing vertex to grow from.
        """
        eps = 10**-4
        idStack = [startId]

        while idStack:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            accuracies, thresholds = findBestSplitRisk(
                self.minSplit, X, y, node.getTrainInds(), argsortX)

            #Choose best feature based on gains
            accuracies += eps
            bestFeatureInd = Util.randomChoice(accuracies)[0]
            bestThreshold = thresholds[bestFeatureInd]

            nodeInds = node.getTrainInds()
            featureVals = X[:, bestFeatureInd][nodeInds]
            bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
            bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

            #The split may have 0 items in one set, so don't split
            # Count the indices: summing them would treat a child holding
            # only example 0 as empty.
            canSplit = (bestLeftInds.shape[0] != 0
                        and bestRightInds.shape[0] != 0
                        and self.tree.depth() < self.maxDepth)

            if canSplit:
                node.setError(1 - accuracies[bestFeatureInd])
                node.setFeatureInd(bestFeatureInd)
                node.setThreshold(bestThreshold)

                leftChildId = self.getLeftChildId(nodeId)
                leftChild = DecisionNode(bestLeftInds,
                                         Util.mode(y[bestLeftInds]))
                self.tree.addChild(nodeId, leftChildId, leftChild)

                if leftChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(leftChildId)

                rightChildId = self.getRightChildId(nodeId)
                rightChild = DecisionNode(bestRightInds,
                                          Util.mode(y[bestRightInds]))
                self.tree.addChild(nodeId, rightChildId, rightChild)

                if rightChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(rightChildId)

    def predict(self, X, y=None):
        """
        Make a prediction for the set of examples given in the matrix X. If
        one passes in a label vector y then we set the errors for each node. On
        the other hand if y=None, no errors are set.

        :param X: The 2D array of examples to predict.

        :param y: Optional labels for X, used only to set the node test errors.

        :return: An array with one predicted value per row of X.
        """
        rootId = (0, )
        predY = numpy.zeros(X.shape[0])
        self.tree.getVertex(rootId).setTestInds(numpy.arange(X.shape[0]))
        idStack = [rootId]

        while idStack:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            testInds = node.getTestInds()

            # Identity test: "y != None" compares element-wise on an array
            if y is not None:
                node.setTestError(
                    self.vertexTestError(y[testInds], node.getValue()))

            if self.tree.isLeaf(nodeId):
                predY[testInds] = node.getValue()
            else:
                for childId in [
                        self.getLeftChildId(nodeId),
                        self.getRightChildId(nodeId)
                ]:
                    if self.tree.vertexExists(childId):
                        child = self.tree.getVertex(childId)

                        # An id ending in 0 is a left child (below threshold)
                        if childId[-1] == 0:
                            childInds = X[
                                testInds,
                                node.getFeatureInd()] < node.getThreshold()
                        else:
                            childInds = X[
                                testInds,
                                node.getFeatureInd()] >= node.getThreshold()

                        child.setTestInds(testInds[childInds])
                        idStack.append(childId)

        return predY

    def treeObjective(self, X, y):
        """
        Return the empirical risk plus penalty for the tree, that is
        (1-gamma)*(proportion of errors on X) + gamma*sqrt(number of vertices).
        """
        predY = self.predict(X)
        (n, d) = X.shape
        return (1 - self.gamma) * numpy.sum(predY != y) / float(
            n) + self.gamma * numpy.sqrt(self.tree.getNumVertices())

    def prune(self, X, y):
        """
        Do some post pruning greedily. Starting from the root, a vertex is
        pruned when its alpha exceeds the alpha threshold, otherwise its
        children are examined.
        """
        self.predict(X, y)
        self.computeAlphas()

        #Do the pruning, recomputing alpha along the way
        rootId = (0, )
        idStack = [rootId]

        while idStack:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)

            if node.alpha > self.alphaThreshold:
                self.tree.pruneVertex(nodeId)
                self.computeAlphas()
            else:
                for childId in [
                        self.getLeftChildId(nodeId),
                        self.getRightChildId(nodeId)
                ]:
                    if self.tree.vertexExists(childId):
                        idStack.append(childId)

    def vertexTestError(self, trueY, predY):
        """
        This is the error used for pruning. We compute it at each node. It is
        the number of entries of trueY which differ from predY.
        """
        return numpy.sum(trueY != predY)

    def computeAlphas(self):
        """
        The alpha value at each vertex is the improvement in the objective by
        pruning at that vertex. The test errors of the nodes must already be
        set, as done by predict(X, y).
        """
        n = self.shapeX[0]
        # The tree is not modified in the loop so its size is computed once
        T = self.tree.getNumVertices()
        sqrtT = numpy.sqrt(T)

        for vertexId in self.tree.getAllVertexIds():
            currentNode = self.tree.getVertex(vertexId)
            subtreeLeaves = self.tree.leaves(vertexId)

            subtreeError = 0
            for leaf in subtreeLeaves:
                subtreeError += self.tree.getVertex(leaf).getTestError()

            # Size of the tree if the subtree at vertexId became one leaf
            T2 = T - len(self.tree.subtreeIds(vertexId)) + 1
            alpha = (1 - self.gamma) * (subtreeError -
                                        currentNode.getTestError())
            alpha /= n
            alpha += self.gamma * sqrtT
            alpha -= self.gamma * numpy.sqrt(T2)
            currentNode.alpha = alpha

    def copy(self):
        """
        Create a new tree with the same parameters. The learnt tree and the
        alpha threshold are not copied.
        """
        newLearner = PenaltyDecisionTree(criterion=self.criterion,
                                         maxDepth=self.maxDepth,
                                         minSplit=self.minSplit,
                                         learnType=self.learnType,
                                         pruning=self.pruning,
                                         gamma=self.gamma,
                                         sampleSize=self.sampleSize)
        return newLearner

    def getMetricMethod(self):
        """
        Returns a way to measure the performance of the classifier.
        """
        return Evaluator.binaryError
class DictGraphTest(unittest.TestCase):
    # Unit tests for DictTree. Comments are used instead of method docstrings
    # so that the names reported by the test runner do not change.

    def setUp(self):
        # Shared fixture: a -> (b, c), b -> (d, e), e -> f
        self.dictTree = DictTree()
        self.dictTree.setVertex("a", "foo")
        for parentId, childId in [("a", "b"), ("a", "c"), ("b", "d"),
                                  ("b", "e"), ("e", "f")]:
            self.dictTree.addEdge(parentId, childId)

    def testInit(self):
        # Construction alone must not raise
        dictTree = DictTree()

    def testAddEdge(self):
        dictTree = DictTree()
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")

        #Add duplicate edge
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")

        # "a" already has parent "d"
        self.assertRaises(ValueError, dictTree.addEdge, "e", "a")

        #Add isolated edge
        self.assertRaises(ValueError, dictTree.addEdge, "r", "s")

    def testGetRoot(self):
        dictTree = DictTree()
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")
        self.assertEqual(dictTree.getRootId(), "d")

        dictTree.addEdge("e", "d")
        self.assertEqual(dictTree.getRootId(), "e")

    def testSetVertex(self):
        dictTree = DictTree()
        dictTree.setVertex("a")
        self.assertEqual(dictTree.getVertex("a"), None)

        # A second vertex with no edge to the tree is rejected
        self.assertRaises(RuntimeError, dictTree.setVertex, "b")

        dictTree.setVertex("a", 12)
        self.assertEqual(dictTree.getVertex("a"), 12)

    def testStr(self):
        # Only builds a tree; nothing is asserted about its string form
        dictTree = DictTree()
        dictTree.addEdge(0, 1)
        dictTree.addEdge(0, 2)
        dictTree.addEdge(2, 3)
        dictTree.addEdge(2, 4)
        dictTree.addEdge(0, 5)
        dictTree.addEdge(4, 6)

    def testDepth(self):
        dictTree = DictTree()
        self.assertEqual(dictTree.depth(), 0)

        dictTree.setVertex("a")
        self.assertEqual(dictTree.depth(), 0)

        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("d", "a")
        self.assertEqual(dictTree.depth(), 2)

        dictTree.addEdge("c", "e")
        self.assertEqual(dictTree.depth(), 3)

    def testCutTree(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        dictTree.addEdge("a", "b", 2)
        dictTree.addEdge("a", "c")
        dictTree.addEdge("c", "d", 5)
        dictTree.addEdge("c", "f")

        A = numpy.array([10, 2])
        dictTree.setVertex("b", A)

        # Cutting at the full depth keeps everything
        newTree = dictTree.cut(2)
        self.assertEqual(newTree.getVertex("a"), "foo")
        self.assertTrue((newTree.getVertex("b") == A).all())
        self.assertEqual(newTree.getEdge("a", "b"), 2)
        self.assertEqual(newTree.getEdge("a", "c"), 1)
        self.assertEqual(newTree.getEdge("c", "d"), 5)
        self.assertEqual(newTree.getEdge("c", "f"), 1)
        self.assertEqual(newTree.getNumVertices(),
                         dictTree.getNumVertices())
        self.assertEqual(newTree.getNumEdges(), dictTree.getNumEdges())

        newTree = dictTree.cut(1)
        self.assertEqual(newTree.getEdge("a", "b"), 2)
        self.assertEqual(newTree.getEdge("a", "c"), 1)
        self.assertEqual(newTree.getNumVertices(), 3)
        self.assertEqual(newTree.getNumEdges(), 2)

        newTree = dictTree.cut(0)
        self.assertEqual(newTree.getNumVertices(), 1)
        self.assertEqual(newTree.getNumEdges(), 0)

    def testLeaves(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        self.assertEqual(set(dictTree.leaves()), {"a"})

        dictTree.addEdge("a", "b", 2)
        dictTree.addEdge("a", "c")
        dictTree.addEdge("c", "d", 5)
        dictTree.addEdge("c", "f")
        self.assertEqual(set(dictTree.leaves()), {"b", "d", "f"})

        dictTree.addEdge("b", 1)
        dictTree.addEdge("b", 2)
        self.assertEqual(set(dictTree.leaves()), {1, 2, "d", "f"})

        #Test isSubtree leaves
        self.assertEqual(set(dictTree.leaves("c")), {"d", "f"})
        self.assertEqual(set(dictTree.leaves("b")), {1, 2})

    def testAddChild(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        dictTree.addChild("a", "c", 2)
        dictTree.addChild("a", "d", 5)

        self.assertEqual(set(dictTree.leaves()), {"c", "d"})
        self.assertEqual(dictTree.getVertex("c"), 2)
        self.assertEqual(dictTree.getVertex("d"), 5)

        # Previously assertTrue(value, 1.0), which used 1.0 as the failure
        # message and so compared nothing.
        self.assertEqual(dictTree.getEdge("a", "d"), 1.0)
        self.assertEqual(dictTree.getEdge("a", "c"), 1.0)

    def testPruneVertex(self):
        dictTree = DictTree()
        dictTree.setVertex("a", "foo")
        dictTree.addEdge("a", "b")
        dictTree.addEdge("a", "c")
        dictTree.addEdge("b", "d")
        dictTree.addEdge("b", "e")
        dictTree.addEdge("e", "f")

        # Pruning removes the descendants but keeps the vertex itself
        dictTree.pruneVertex("b")
        self.assertFalse(dictTree.edgeExists("b", "e"))
        self.assertFalse(dictTree.edgeExists("b", "d"))
        self.assertFalse(dictTree.edgeExists("e", "f"))
        self.assertTrue(dictTree.vertexExists("b"))
        self.assertFalse(dictTree.vertexExists("d"))
        self.assertFalse(dictTree.vertexExists("e"))
        self.assertFalse(dictTree.vertexExists("f"))

        dictTree.pruneVertex("a")
        self.assertEqual(dictTree.getNumVertices(), 1)

    def testIsLeaf(self):
        for vertexId in ["c", "d", "f"]:
            self.assertTrue(self.dictTree.isLeaf(vertexId))

        for vertexId in ["a", "b", "e"]:
            self.assertFalse(self.dictTree.isLeaf(vertexId))

    def testIsNonLeaf(self):
        for vertexId in ["c", "d", "f"]:
            self.assertFalse(self.dictTree.isNonLeaf(vertexId))

        for vertexId in ["a", "b", "e"]:
            self.assertTrue(self.dictTree.isNonLeaf(vertexId))

    def testCopy(self):
        newTree = self.dictTree.copy()
        newTree.addEdge("f", "x")
        newTree.addEdge("f", "y")

        self.assertEqual(newTree.getNumVertices(),
                         self.dictTree.getNumVertices() + 2)
        self.assertTrue(newTree.vertexExists("x"))
        self.assertTrue(newTree.vertexExists("y"))

        # The original must not gain either vertex ("x" was checked twice)
        self.assertFalse(self.dictTree.vertexExists("x"))
        self.assertFalse(self.dictTree.vertexExists("y"))

    def testisSubtree(self):
        # Grow a tree from the root of the fixture
        newTree = DictTree()
        newTree.addEdge("a", "b")
        newTree.addEdge("a", "c")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("b", "d")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("b", "e")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("e", "f")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("a", "g")
        self.assertFalse(newTree.isSubtree(self.dictTree))

        # Grow a tree from an inner vertex of the fixture
        newTree = DictTree()
        newTree.addEdge("b", "d")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("b", "e")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("e", "f")
        self.assertTrue(newTree.isSubtree(self.dictTree))

        newTree.addEdge("f", "g")
        self.assertFalse(newTree.isSubtree(self.dictTree))

        newTree = DictTree()
        newTree.setVertex("b")
        self.assertTrue(newTree.isSubtree(self.dictTree))
        self.assertFalse(self.dictTree.isSubtree(newTree))

        self.assertTrue(self.dictTree.isSubtree(self.dictTree))

    def testDeepCopy(self):
        class A:
            def __init__(self, x, y):
                self.x = x
                self.y = y

        a = A(1, numpy.array([1, 2]))
        self.dictTree.setVertex("a", a)

        newTree = self.dictTree.deepCopy()
        newTree.addEdge("f", "x")
        newTree.addEdge("f", "y")

        self.assertEqual(newTree.getNumVertices(),
                         self.dictTree.getNumVertices() + 2)
        self.assertTrue(newTree.vertexExists("x"))
        self.assertTrue(newTree.vertexExists("y"))

        # The original must not gain either vertex ("x" was checked twice)
        self.assertFalse(self.dictTree.vertexExists("x"))
        self.assertFalse(self.dictTree.vertexExists("y"))

        self.assertEqual(self.dictTree.getVertex("a"), a)
        self.assertEqual(newTree.getVertex("a").x, 1)
        self.assertEqual(self.dictTree.getVertex("a").x, 1)

        # Changing the original object must not affect the deep copy
        a.x = 10
        self.assertEqual(newTree.getVertex("a").x, 1)
        self.assertEqual(self.dictTree.getVertex("a").x, 10)

        nptst.assert_array_equal(newTree.getVertex("a").y,
                                 numpy.array([1, 2]))
        nptst.assert_array_equal(
            self.dictTree.getVertex("a").y, numpy.array([1, 2]))

        a.y = numpy.array([1, 2, 3])
        nptst.assert_array_equal(newTree.getVertex("a").y,
                                 numpy.array([1, 2]))
        nptst.assert_array_equal(
            self.dictTree.getVertex("a").y, numpy.array([1, 2, 3]))

    def testSubtree(self):
        newTree = DictTree()
        newTree.addEdge("a", "b")
        newTree.addEdge("a", "c")

        subtree = newTree.subtreeAt("b")
        self.assertEqual(subtree.getAllVertexIds(), ["b"])

        subtree = newTree.subtreeAt("c")
        self.assertEqual(subtree.getAllVertexIds(), ["c"])

        subtree = newTree.subtreeAt("a")
        self.assertEqual(set(subtree.getAllVertexIds()), {"a", "c", "b"})