def learnModel(self, X, Y):
    """
    Learn a model for a set of examples given as the rows of the matrix X,
    with corresponding labels given in the elements of 1D array Y. The tree
    that is built is stored in ``self.tree``; nothing is returned.

    The tree is grown level by level: every existing vertex (d, k) with
    d < self.maxDepth that is neither pure nor already a leaf is passed to
    :meth:`splitNode`.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of binary labels as a 1D array
    :type Y: :class:`ndarray`

    :raises ValueError: if Y does not contain exactly the two labels -1 and +1
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkArray(X)
    Parameter.checkArray(Y)

    # numpy.unique returns sorted values, so a valid label set is exactly [-1, 1]
    labels = numpy.unique(Y)
    if labels.shape[0] != 2:
        raise ValueError("Can only accept binary labelled data: " + str(labels))
    if (labels != numpy.array([-1, 1])).any():
        raise ValueError("Labels must be -1/+1: " + str(labels))

    # Identity test rather than "== None": equality against None can be
    # evaluated element-wise if featureSize is ever a numpy value.
    if self.featureSize is None:
        # Default fraction sqrt(p)/p, i.e. roughly sqrt(p) of the p features
        featureSize = numpy.sqrt(X.shape[1])/float(X.shape[1])
    else:
        featureSize = self.featureSize

    tree = DictTree()
    trainInds = numpy.arange(Y.shape[0])
    # Random subset of feature indices for the root, kept in sorted order
    # NOTE(review): a very small featureSize rounds to zero features - confirm
    # whether callers can hit that case.
    numFeatures = int(numpy.round(X.shape[1]*featureSize))
    featureInds = numpy.sort(numpy.random.permutation(X.shape[1])[0:numFeatures])

    #Seed the tree
    node = RankNode(trainInds, featureInds)
    tree.setVertex((0, 0), node)

    for d in range(self.maxDepth):
        # Level d can hold at most 2**d vertices, indexed k = 0 .. 2**d - 1
        for k in range(2**d):
            if tree.vertexExists((d, k)):
                node = tree.getVertex((d, k))

                if not node.isPure() and not node.isLeafNode():
                    self.splitNode(tree, X, Y, d, k)

    self.tree = tree
def testSplitNode(self):
    """
    Split the root of a single-vertex tree with maxDepth 1 and check that the
    result has a non-leaf root with two leaf children.
    """
    d = 0
    k = 0
    maxDepth = 1
    inds = numpy.arange(self.y.shape[0])
    treeRank = TreeRank(self.leafRanklearner)
    treeRank.setMaxDepth(maxDepth)

    # Root node holds every example and every feature
    node = RankNode(inds, numpy.arange(self.X.shape[1]))
    tree = DictTree()
    tree.setVertex((0, 0), node)
    tree = treeRank.splitNode(tree, self.X, self.y, d, k)

    # assertEqual replaces the deprecated assertEquals alias, which newer
    # Python versions no longer provide.
    self.assertEqual(tree.getNumVertices(), 3)
    self.assertEqual(tree.getNumEdges(), 2)
    self.assertEqual(tree.getRootId(), (0, 0))
    self.assertFalse(tree.getVertex((0, 0)).isLeafNode())
    self.assertTrue(tree.getVertex((1, 0)).isLeafNode())
    self.assertTrue(tree.getVertex((1, 1)).isLeafNode())
    self.assertLessEqual(tree.depth(), maxDepth)
def splitNode(self, tree, X, Y, d, k):
    """
    Take a node in a tree and classify in order to split it into 2.

    A leaf ranker is fitted on the training examples and features held by
    vertex (d, k). If its predictions on those examples contain two distinct
    values and the node has at least ``self.minSplit`` examples, two children
    are added: (d+1, 2*k) receives the examples predicted as
    ``self.bestResponse`` and (d+1, 2*k+1) receives the rest. Otherwise the
    vertex is marked as a leaf and given a score.

    :param tree: The tree containing vertex (d, k); it is modified in place
    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`
    :param Y: A vector of labels as a 1D array
    :type Y: :class:`ndarray`
    :param d: Depth of the vertex to split
    :param k: Index of the vertex within depth d
    :return: The same tree object that was passed in
    """
    # Identity test rather than "== None": equality against None can be
    # evaluated element-wise if featureSize is ever a numpy value.
    if self.featureSize is None:
        featureSize = numpy.sqrt(X.shape[1])/float(X.shape[1])
    else:
        featureSize = self.featureSize

    node = tree.getVertex((d, k))
    inds = node.getTrainInds()
    featureInds = node.getFeatureInds()
    # Fraction of this node's examples labelled with the best response
    alpha = numpy.sum(Y[inds]==self.bestResponse)/float(inds.shape[0])

    #Now classify
    #We have the following condition if we need to do cross validation within the node
    if Util.histogram(Y[inds])[0].min() > self.minLabelCount:
        self.leafRanklearner.setWeight(1-alpha)
        # Generate the learner from the node's own examples and features: the
        # guard above, the weight and the fit below all refer to this subset,
        # so handing over the full X, Y was inconsistent with them.
        leafRank = self.leafRanklearner.generateLearner(X[inds, :][:, featureInds], Y[inds])
    else:
        leafRank = MajorityPredictor()

    node.setLeafRank(leafRank)
    # Integer-array indexing makes a fresh copy on every call, so each callee
    # gets its own array.
    leafRank.learnModel(X[inds, :][:, featureInds], Y[inds])
    predY = leafRank.predict(X[inds, :][:, featureInds])

    if numpy.unique(predY).shape[0] == 2 and inds.shape[0] >= self.minSplit:
        # Each child draws its own random, sorted subset of this many features
        numFeatures = int(numpy.round(X.shape[1]*featureSize))

        leftInds = inds[predY == self.bestResponse]
        featureInds = numpy.sort(numpy.random.permutation(X.shape[1])[0:numFeatures])
        leftNode = RankNode(leftInds, featureInds)
        leftNode.setPure(numpy.unique(Y[leftInds]).shape[0] <= 1)
        # Children at the last permitted depth are leaves regardless of purity
        leftNode.setIsLeafNode(d==self.maxDepth-1 or leftNode.isPure())
        # Within a level the score decreases as the index grows
        leftNode.setScore((1 - float(2*k)/2**(d+1))*2**self.maxDepth)
        tree.addEdge((d, k), (d+1, 2*k))
        tree.setVertex((d+1, 2*k), leftNode)

        rightInds = inds[predY != self.bestResponse]
        featureInds = numpy.sort(numpy.random.permutation(X.shape[1])[0:numFeatures])
        rightNode = RankNode(rightInds, featureInds)
        rightNode.setPure(numpy.unique(Y[rightInds]).shape[0] <= 1)
        rightNode.setIsLeafNode(d==self.maxDepth-1 or rightNode.isPure())
        rightNode.setScore((1 - float(2*k+1)/2**(d+1))*2**self.maxDepth)
        tree.addEdge((d, k), (d+1, 2*k+1))
        tree.setVertex((d+1, 2*k+1), rightNode)
    else:
        # No usable split: this vertex becomes a leaf with its own score
        node.setIsLeafNode(True)
        node.setScore((1 - float(k)/2**d)*2**self.maxDepth)

    return tree