def splitNode(self, tree, X, Y, d, k):
    """
    Train a leaf-rank classifier on the node stored at vertex (d, k) and use
    its predictions to split that node into two children.

    The node's training rows and feature columns are read from the node
    itself. If the predictions on those rows contain exactly two distinct
    values and the node has at least self.minSplit rows, two RankNode
    children are added at (d+1, 2*k) and (d+1, 2*k+1): rows predicted as
    self.bestResponse go to the first, all other rows to the second. Each
    child draws its own random, sorted subset of column indices. Otherwise
    the node is marked as a leaf and given a score.

    :param tree: tree whose vertices are keyed by (depth, index) tuples
    :param X: 2-D array, indexed by row (example) and column (feature)
    :param Y: array of labels, indexed by the same rows as X
    :param d: depth component of the vertex id of the node to split
    :param k: index component of the vertex id of the node to split
    :return: the same tree object, modified in place
    """
    # Fraction of the columns handed to each child; sqrt(p)/p when unset.
    if self.featureSize is None:
        featureSize = numpy.sqrt(X.shape[1])/float(X.shape[1])
    else:
        featureSize = self.featureSize

    node = tree.getVertex((d, k))
    inds = node.getTrainInds()
    nodeFeatureInds = node.getFeatureInds()
    # Proportion of this node's rows labelled with the best response.
    alpha = numpy.sum(Y[inds] == self.bestResponse)/float(inds.shape[0])

    # The configured learner is only used when the rarest label at this node
    # occurs more than minLabelCount times (original note: this is the
    # condition for doing cross validation within the node); otherwise fall
    # back to MajorityPredictor.
    if Util.histogram(Y[inds])[0].min() > self.minLabelCount:
        self.leafRanklearner.setWeight(1-alpha)
        # NOTE(review): generateLearner receives the full X and Y, not just
        # this node's rows/columns - confirm that this is intended.
        leafRank = self.leafRanklearner.generateLearner(X, Y)
    else:
        leafRank = MajorityPredictor()

    node.setLeafRank(leafRank)
    # The sub-matrix is indexed afresh for each call, as in the original, so
    # predict() never sees an array that learnModel() may have modified.
    leafRank.learnModel(X[inds, :][:, nodeFeatureInds], Y[inds])
    predY = leafRank.predict(X[inds, :][:, nodeFeatureInds])

    canSplit = numpy.unique(predY).shape[0] == 2 and inds.shape[0] >= self.minSplit
    if not canSplit:
        node.setIsLeafNode(True)
        node.setScore((1 - float(k)/2**d)*2**self.maxDepth)
        return tree

    numChildFeatures = int(numpy.round(X.shape[1]*featureSize))
    # First child first, so the random feature draws happen in the same
    # order as before (first the 2*k child, then the 2*k+1 child).
    children = (
        (2*k, predY == self.bestResponse),
        (2*k+1, predY != self.bestResponse),
    )

    for childK, mask in children:
        childInds = inds[mask]
        childFeatureInds = numpy.sort(numpy.random.permutation(X.shape[1])[0:numChildFeatures])

        childNode = RankNode(childInds, childFeatureInds)
        # A node is pure when its rows carry at most one distinct label.
        childNode.setPure(numpy.unique(Y[childInds]).shape[0] <= 1)
        childNode.setIsLeafNode(d == self.maxDepth-1 or childNode.isPure())
        childNode.setScore((1 - float(childK)/2**(d+1))*2**self.maxDepth)

        tree.addEdge((d, k), (d+1, childK))
        tree.setVertex((d+1, childK), childNode)

    return tree
def testHistogram(self):
    """
    Check that Util.histogram returns, for a small integer vector with
    repeated entries, the counts [3, 1, 3, 2] alongside the distinct
    values [0, 1, 2, 5].
    """
    v = numpy.array([0, 0, 1, 5, 0, 2, 2, 2, 5])
    expectedFreq = numpy.array([3, 1, 3, 2])
    expectedItems = numpy.array([0, 1, 2, 5])

    freq, items = Util.histogram(v)

    # array_equal compares shapes as well as values, so a result of the
    # wrong length shows up as a failed assertion instead of a broadcasting
    # error raised by the elementwise comparison.
    self.assertTrue(numpy.array_equal(freq, expectedFreq))
    self.assertTrue(numpy.array_equal(items, expectedItems))
# Print, for every column, the distinct values and relative frequencies found
# in the ego data next to those found in the alter data.
egoQuestionIds = eCsvReader.getEgoQuestionIds()
alterQuestionIds = eCsvReader.getAlterQuestionIds()
# Passed to readFile as its third argument; presumably the value substituted
# for missing answers - TODO confirm against eCsvReader.readFile.
missing = 0

(egoX, titles) = eCsvReader.readFile(egoFileName, egoQuestionIds, missing)
egoX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(egoX[:, eCsvReader.ageIndex])

(alterX, titles) = eCsvReader.readFile(alterFileName, alterQuestionIds, missing)
alterX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(alterX[:, eCsvReader.ageIndex])

# The loop below indexes alterX with ego column numbers, so both arrays are
# expected to have the same number of columns.
numFeatures = egoX.shape[1]
numEgoExamples = egoX.shape[0]
numAlterExamples = alterX.shape[0]

for i in range(0, numFeatures):
    (histE, uniqElementsE) = Util.histogram(egoX[:, i])
    (histA, uniqElementsA) = Util.histogram(alterX[:, i])

    print(str(i) + " " + str(egoQuestionIds[i]))
    print("Ego " + str(uniqElementsE))
    print("Alter " + str(uniqElementsA))
    # Values that occur in only one of the two data sets for this column.
    print(numpy.setxor1d(uniqElementsE, uniqElementsA))
    # Divide by a float so integer counts give fractional relative
    # frequencies even where "/" on integers would floor.
    print(histE/float(numEgoExamples))
    print(histA/float(numAlterExamples))

# Conclusion (original author's note): the distributions are broadly the
# same. The problem occurs with missing data handling. For example, in Ego
# there are values with [ 0. 8.] with most zero, and in Alter [ 0. 5.].
# The means will be approx 8 for ego and 5 for alter.