예제 #1
0
    def splitNode(self, tree, X, Y, d, k):
        """
        Train a classifier on the node stored at vertex (d, k) and use its
        predictions to split that node into two children.

        The node supplies its own training example indices and feature
        indices. A learner is fitted on that sub-matrix and then asked to
        predict the same examples. If the predictions contain exactly two
        distinct values and the node holds at least self.minSplit examples,
        two RankNode children are added at (d+1, 2k) and (d+1, 2k+1), each
        with a freshly drawn random feature subset. Otherwise the node is
        marked as a leaf and given a score.

        :param tree: the tree holding the node; getVertex, setVertex and
            addEdge are called on it.
        :param X: 2D array of examples (rows) by features (columns).
        :param Y: array of labels indexed by the same rows as X.
        :param d: first component of the vertex id; compared against
            self.maxDepth - 1 to decide whether the children are leaves.
        :param k: second component of the vertex id; the children use 2k
            and 2k+1.
        :return: the same tree object, updated in place.
        """
        numFeatures = X.shape[1]

        if self.featureSize is None:
            # Default fraction sqrt(p)/p, i.e. about sqrt(p) of the p features
            featureSize = numpy.sqrt(numFeatures)/float(numFeatures)
        else:
            featureSize = self.featureSize

        node = tree.getVertex((d, k))
        inds = node.getTrainInds()
        featureInds = node.getFeatureInds()
        # Fraction of this node's labels equal to the best response
        alpha = numpy.sum(Y[inds] == self.bestResponse)/float(inds.shape[0])

        # Only use the configured learner when every label present in the node
        # occurs more than minLabelCount times (the original note says this is
        # needed for cross validation within the node); otherwise fall back to
        # a majority predictor.
        if Util.histogram(Y[inds])[0].min() > self.minLabelCount:
            self.leafRanklearner.setWeight(1-alpha)
            leafRank = self.leafRanklearner.generateLearner(X, Y)
        else:
            leafRank = MajorityPredictor()

        node.setLeafRank(leafRank)
        leafRank.learnModel(X[inds, :][:, featureInds], Y[inds])
        predY = leafRank.predict(X[inds, :][:, featureInds])

        if numpy.unique(predY).shape[0] == 2 and inds.shape[0] >= self.minSplit:
            numSampled = int(numpy.round(numFeatures*featureSize))
            # Left child (2k) receives the examples predicted as the best
            # response, right child (2k+1) receives the rest.
            children = (
                (2*k, inds[predY == self.bestResponse]),
                (2*k+1, inds[predY != self.bestResponse]),
            )

            for childK, childInds in children:
                # Each child draws its own sorted random subset of features
                childFeatureInds = numpy.sort(numpy.random.permutation(numFeatures)[0:numSampled])
                childNode = RankNode(childInds, childFeatureInds)
                # Pure means at most one distinct label among the child's examples
                childNode.setPure(numpy.unique(Y[childInds]).shape[0] <= 1)
                childNode.setIsLeafNode(d == self.maxDepth-1 or childNode.isPure())
                childNode.setScore((1 - float(childK)/2**(d+1))*2**self.maxDepth)
                tree.addEdge((d, k), (d+1, childK))
                tree.setVertex((d+1, childK), childNode)
        else:
            node.setIsLeafNode(True)
            node.setScore((1 - float(k)/2**d)*2**self.maxDepth)

        return tree
예제 #2
0
 def testHistogram(self):
     # Util.histogram should return the count of each distinct value together
     # with the distinct values themselves.
     values = numpy.array([0, 0, 1, 5, 0, 2, 2, 2, 5])
     expectedFreq = numpy.array([3, 1, 3, 2])
     expectedItems = numpy.array([0, 1, 2, 5])

     freq, items = Util.histogram(values)

     # Element-wise comparison: every count and every item must match
     self.assertTrue((freq == expectedFreq).all())
     self.assertTrue((items == expectedItems).all())
예제 #3
0
    # Question ids used to select what is read from the ego and alter files
    egoQuestionIds = eCsvReader.getEgoQuestionIds()
    alterQuestionIds = eCsvReader.getAlterQuestionIds()

    # Passed to readFile as its third argument
    missing = 0

    # Read the ego data and replace the age column with age categories
    egoX, titles = eCsvReader.readFile(egoFileName, egoQuestionIds, missing)
    egoAges = egoX[:, eCsvReader.ageIndex]
    egoX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(egoAges)

    # Same again for the alter data
    alterX, titles = eCsvReader.readFile(alterFileName, alterQuestionIds, missing)
    alterAges = alterX[:, eCsvReader.ageIndex]
    alterX[:, eCsvReader.ageIndex] = eCsvReader.ageToCategories(alterAges)

    numFeatures = egoX.shape[1]
    numEgoExamples = egoX.shape[0]
    numAlterExamples = alterX.shape[0]

    # For each feature column, print the distinct values seen in each data set,
    # the values seen in only one of them, and the counts divided by the
    # number of examples.
    for featureInd in range(numFeatures):
        egoHist, egoValues = Util.histogram(egoX[:, featureInd])
        alterHist, alterValues = Util.histogram(alterX[:, featureInd])

        print(str(featureInd) + " " + str(egoQuestionIds[featureInd]))
        print("Ego   " + str(egoValues))
        print("Alter " + str(alterValues))
        print(numpy.setxor1d(egoValues, alterValues))
        print(egoHist/numEgoExamples)
        print(alterHist/numAlterExamples)

    """
    Conclusion is that the distributions are broadly the same. The problem occurs
    with missing data handling. For example in Ego there are values with [ 0.  8.]
    with most zero, and in alter [ 0.  5.]. The means will be approx 8 for ego and 5 for
    alter.
    """
예제 #4
0
    def testHistogram(self):
        # Util.histogram is expected to give back (frequencies, items): one
        # count per distinct value, and the distinct values themselves.
        sample = numpy.array([0, 0, 1, 5, 0, 2, 2, 2, 5])
        wantedFreq = numpy.array([3, 1, 3, 2])
        wantedItems = numpy.array([0, 1, 2, 5])

        freq, items = Util.histogram(sample)

        # Compare element by element and require every position to agree
        self.assertTrue((freq == wantedFreq).all())
        self.assertTrue((items == wantedItems).all())