Example No. 1
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
        testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1 
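A note on argument order: in these examples Evaluator.auc takes the predicted scores first and the true labels second. As a rough cross-check (not the library's own implementation), a comparable value can be obtained with scikit-learn's roc_auc_score, which takes the arguments the other way round; a minimal sketch with made-up scores and -1/+1 labels:

import numpy
from sklearn.metrics import roc_auc_score

#Hypothetical scores and -1/+1 labels, analogous to those used in the tests above
labels = numpy.array([-1, -1, 1, 1, 1, -1])
scores = numpy.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

#roc_auc_score takes (y_true, y_score); Evaluator.auc is called as (scores, labels)
print(roc_auc_score(labels, scores))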
Example No. 2
    def testLocalAuc(self):
        testY = numpy.array([-1, -1, 1, 1, 1, 1, 1, -1, -1, 1])
        predY = numpy.array([0.987,  0.868,  0.512,  0.114,  0.755,  0.976,  0.05,  0.371, 0.629,  0.819])

        self.assertEquals(Evaluator.localAuc(testY, predY, 1.0), Evaluator.auc(predY, testY))
        self.assertEquals(Evaluator.localAuc(testY, predY, 0.0), 0)

        self.assertEquals(Evaluator.localAuc(testY, testY, 0.2), 1.0)
Example No. 3
 def testAveragePrecisionFromLists(self): 
     predList  = [4, 2, 10]
     testList = [4, 2, 15, 16]
     
     self.assertEquals(Evaluator.averagePrecisionFromLists(testList, predList), 0.5)
     
     predList = [0,1,2,3,4,5]
     testList = [0, 3, 4, 5]
     self.assertAlmostEquals(Evaluator.averagePrecisionFromLists(testList, predList), 0.691666666666)
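The expected values above match the usual definition of average precision over a ranked list: at every rank where the predicted list hits an item from the test list, take the precision at that rank, sum these, and divide by the number of items in the test list. A minimal independent sketch of that computation (an assumption about the definition, not the library source):

def averagePrecision(testList, predList):
    #Sum the precision at each rank where a relevant item is retrieved,
    #then divide by the total number of relevant items
    relevant = set(testList)
    hits = 0
    total = 0.0
    for k, item in enumerate(predList):
        if item in relevant:
            hits += 1
            total += hits / float(k + 1)
    return total / len(testList)

print(averagePrecision([4, 2, 15, 16], [4, 2, 10]))        #0.5
print(averagePrecision([0, 3, 4, 5], [0, 1, 2, 3, 4, 5]))  #approximately 0.6917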
Example No. 4
    def testLocalAuc(self):
        testY = numpy.array([-1, -1, 1, 1, 1, 1, 1, -1, -1, 1])
        predY = numpy.array([
            0.987, 0.868, 0.512, 0.114, 0.755, 0.976, 0.05, 0.371, 0.629, 0.819
        ])

        self.assertEquals(Evaluator.localAuc(testY, predY, 1.0),
                          Evaluator.auc(predY, testY))
        self.assertEquals(Evaluator.localAuc(testY, predY, 0.0), 0)

        self.assertEquals(Evaluator.localAuc(testY, testY, 0.2), 1.0)
Example No. 5
    def testBinaryError(self):
        testY = numpy.array([1, 1, -1, 1])
        predY = numpy.array([-1, 1, -1, 1])
        predY2 = numpy.array([-1, -1, -1, 1])
        predY3 = numpy.array([-1, -1, 1, -1])

        self.assertTrue(Evaluator.binaryError(testY, predY) == 0.25)
        self.assertTrue(Evaluator.binaryError(testY, testY) == 0.0)
        self.assertTrue(Evaluator.binaryError(predY, predY) == 0.0)

        self.assertTrue(Evaluator.binaryError(testY, predY2) == 0.5)
        self.assertTrue(Evaluator.binaryError(testY, predY3) == 1.0)
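These assertions are consistent with binaryError being the fraction of positions where the two label vectors disagree. A one-line equivalent (an assumption, not the library source):

import numpy

def binaryError(testY, predY):
    #Fraction of examples whose predicted label differs from the true label
    return numpy.mean(testY != predY)

testY = numpy.array([1, 1, -1, 1])
predY = numpy.array([-1, 1, -1, 1])
print(binaryError(testY, predY))  #0.25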
Example No. 6
    def testMeanAbsError(self):
        testY = numpy.array([1, 2, 1.5])
        predY = numpy.array([2, 1, 0.5])

        self.assertEquals(Evaluator.meanAbsError(testY, predY), 1.0)
        self.assertEquals(Evaluator.meanAbsError(testY, testY), 0.0)

        testY = numpy.random.rand(10)
        predY = numpy.random.rand(10)

        error = numpy.abs(testY - predY).mean()
        self.assertEquals(error, Evaluator.meanAbsError(testY, predY))
Example No. 7
    def testBinaryError(self):
        testY = numpy.array([1, 1, -1, 1])
        predY = numpy.array([-1, 1, -1, 1])
        predY2 = numpy.array([-1, -1, -1, 1])
        predY3 = numpy.array([-1, -1, 1, -1])

        self.assertTrue(Evaluator.binaryError(testY, predY) == 0.25)
        self.assertTrue(Evaluator.binaryError(testY, testY) == 0.0)
        self.assertTrue(Evaluator.binaryError(predY, predY) == 0.0)

        self.assertTrue(Evaluator.binaryError(testY, predY2) == 0.5)
        self.assertTrue(Evaluator.binaryError(testY, predY3) == 1.0)
Example No. 8
    def testWeightedRootMeanSqError(self):

        y = numpy.array([0.1, 0.2, 0.3])
        predY = numpy.array([0.1, 0.2, 0.3])

        self.assertEquals(Evaluator.weightedRootMeanSqError(y, predY), 0.0)

        #Errors on larger ys are weighted more 
        predY = numpy.array([0.0, 0.2, 0.3])
        predY2 = numpy.array([0.1, 0.2, 0.4])

        self.assertTrue(Evaluator.weightedRootMeanSqError(y, predY) < Evaluator.weightedRootMeanSqError(y, predY2))
Example No. 9
 def testMeanAbsError(self): 
     testY = numpy.array([1, 2, 1.5])
     predY = numpy.array([2, 1, 0.5]) 
     
     self.assertEquals(Evaluator.meanAbsError(testY, predY), 1.0)
     self.assertEquals(Evaluator.meanAbsError(testY, testY), 0.0)
     
     testY = numpy.random.rand(10)
     predY = numpy.random.rand(10)
     
     error = numpy.abs(testY - predY).mean()
     self.assertEquals(error, Evaluator.meanAbsError(testY, predY))
Example No. 10
    def testAveragePrecisionFromLists(self):
        predList = [4, 2, 10]
        testList = [4, 2, 15, 16]

        self.assertEquals(
            Evaluator.averagePrecisionFromLists(testList, predList), 0.5)

        predList = [0, 1, 2, 3, 4, 5]
        testList = [0, 3, 4, 5]
        self.assertAlmostEquals(
            Evaluator.averagePrecisionFromLists(testList, predList),
            0.691666666666)
Example No. 11
    def testLearnModel2(self): 
        #We want to make sure the learnt tree with gamma = 0 minimises the
        #empirical risk
        minSplit = 20
        maxDepth = 3
        gamma = 0.01
        learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False) 
        
        #Vary sampleSize
        numpy.random.seed(21)
        learner.setSampleSize(1)           
        learner.learnModel(self.X, self.y)        
        error1 = learner.treeObjective(self.X, self.y)

        numpy.random.seed(21)
        learner.setSampleSize(5)        
        learner.learnModel(self.X, self.y)
        error2 = learner.treeObjective(self.X, self.y)

        numpy.random.seed(21)                
        learner.setSampleSize(10)       
        learner.learnModel(self.X, self.y)
        error3 = learner.treeObjective(self.X, self.y)
        
        self.assertTrue(error1 >= error2)
        self.assertTrue(error2 >= error3)
        
        #Now vary max depth 
        learner.gamma = 0         
        
        numpy.random.seed(21)
        learner.setSampleSize(1) 
        learner.minSplit = 1
        learner.maxDepth = 3 
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error1 = Evaluator.binaryError(self.y, predY)
        
        numpy.random.seed(21)
        learner.maxDepth = 5 
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error2 = Evaluator.binaryError(self.y, predY)
        
        numpy.random.seed(21)
        learner.maxDepth = 10 
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error3 = Evaluator.binaryError(self.y, predY)        
        
        self.assertTrue(error1 >= error2)
        self.assertTrue(error2 >= error3)
Example No. 12
    def testSetC(self):
        rankSVM = RankSVM()
        rankSVM.setC(100.0)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        auc1 = Evaluator.auc(predY, self.y)

        rankSVM.setC(0.1)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        auc2 = Evaluator.auc(predY, self.y)

        self.assertTrue(auc1 != auc2)
Example No. 13
    def testRootMeanSqError(self):
        y = numpy.array([1,2,3])
        predY = numpy.array([1,2,3])

        self.assertEquals(Evaluator.rootMeanSqError(y, predY), 0.0)

        y = numpy.array([1,2,3])
        predY = numpy.array([1,2,2])

        self.assertEquals(Evaluator.rootMeanSqError(y, predY), float(1)/numpy.sqrt(3))

        predY = numpy.array([1,2])
        self.assertRaises(ValueError, Evaluator.rootMeanSqError, y, predY)
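The second expected value follows from the definition of root mean squared error: only one of the three predictions is off, by 1, so the error is sqrt(1/3). A minimal equivalent computation for reference (an assumption, not the library source):

import numpy

def rootMeanSqError(y, predY):
    #Square root of the mean squared difference
    return numpy.sqrt(numpy.mean((y - predY)**2))

y = numpy.array([1, 2, 3])
predY = numpy.array([1, 2, 2])
print(rootMeanSqError(y, predY))   #0.5773... == 1/sqrt(3)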
Example No. 14
    def testRootMeanSqError(self):
        y = numpy.array([1, 2, 3])
        predY = numpy.array([1, 2, 3])

        self.assertEquals(Evaluator.rootMeanSqError(y, predY), 0.0)

        y = numpy.array([1, 2, 3])
        predY = numpy.array([1, 2, 2])

        self.assertEquals(Evaluator.rootMeanSqError(y, predY),
                          float(1) / numpy.sqrt(3))

        predY = numpy.array([1, 2])
        self.assertRaises(ValueError, Evaluator.rootMeanSqError, y, predY)
Example No. 15
    def testWeightedRootMeanSqError(self):

        y = numpy.array([0.1, 0.2, 0.3])
        predY = numpy.array([0.1, 0.2, 0.3])

        self.assertEquals(Evaluator.weightedRootMeanSqError(y, predY), 0.0)

        #Errors on larger ys are weighted more
        predY = numpy.array([0.0, 0.2, 0.3])
        predY2 = numpy.array([0.1, 0.2, 0.4])

        self.assertTrue(
            Evaluator.weightedRootMeanSqError(y, predY) <
            Evaluator.weightedRootMeanSqError(y, predY2))
Example No. 16
    def testBinaryBootstrapError(self):

        testY = numpy.array([-1, -1, 1, 1, 1])
        predY = 1 - testY

        trainY = numpy.array([-1, -1, 1, 1, 1])
        predTrainY = 1 - trainY

        self.assertEquals(Evaluator.binaryBootstrapError(testY, testY, trainY, trainY, 0.5), 0.0)

        self.assertEquals(Evaluator.binaryBootstrapError(testY, testY, trainY, predTrainY, 0.5), 0.5)
        self.assertEquals(Evaluator.binaryBootstrapError(testY, testY, trainY, predTrainY, 0.1), 0.9)

        self.assertEquals(Evaluator.binaryBootstrapError(testY, predY, trainY, trainY, 0.1), 0.1)
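The expected values above are consistent with a bootstrap-style estimate that mixes the test error and the training error, with the final argument weighting the test part (the conventional 0.632 weight appears in Example No. 24 below). A minimal sketch of that combination, inferred from these assertions rather than taken from the library source:

import numpy

def binaryError(y, predY):
    return numpy.mean(y != predY)

def binaryBootstrapError(testY, predTestY, trainY, predTrainY, weight):
    #weight * test error + (1 - weight) * training error
    return weight * binaryError(testY, predTestY) + (1 - weight) * binaryError(trainY, predTrainY)

testY = numpy.array([-1, -1, 1, 1, 1])
trainY = numpy.array([-1, -1, 1, 1, 1])
predTrainY = 1 - trainY
print(binaryBootstrapError(testY, testY, trainY, predTrainY, 0.5))  #0.5
print(binaryBootstrapError(testY, testY, trainY, predTrainY, 0.1))  #0.9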
Example No. 17
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
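The cross_val.StratifiedKFold(Y, folds) iterator above comes from an early scikit-learn API. In current scikit-learn the equivalent split lives in sklearn.model_selection; a minimal sketch of the modern form, assuming a small synthetic X and Y:

import numpy
from sklearn.model_selection import StratifiedKFold

X = numpy.random.rand(20, 3)
Y = numpy.array([0, 1] * 10)

#split yields (trainInds, testInds) index arrays with label proportions preserved
for trainInds, testInds in StratifiedKFold(n_splits=4).split(X, Y):
    trainX, trainY = X[trainInds, :], Y[trainInds]
    testX, testY = X[testInds, :], Y[testInds]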
Example No. 18
    def testAuc(self):
        testY = numpy.array([-1, -1, 1, 1])
        predY = numpy.array([-1, 0, 1, 1])
        predY2 = numpy.array([0.1, 0.2, 0.3, 0.4])

        self.assertEquals(Evaluator.auc(predY, testY), 1.0)
        self.assertEquals(Evaluator.auc(predY2, testY), 1.0)
        self.assertEquals(Evaluator.auc(-predY, testY), 0.0)

        numExamples = 1000
        testY = numpy.array(numpy.random.rand(numExamples)>0.5, numpy.int)
        predY = numpy.random.rand(numExamples)>0.5

        #For a random score the AUC is approximately 0.5 
        self.assertAlmostEquals(Evaluator.auc(predY, testY), 0.5, 1)
Example No. 19
    def testAuc(self):
        testY = numpy.array([-1, -1, 1, 1])
        predY = numpy.array([-1, 0, 1, 1])
        predY2 = numpy.array([0.1, 0.2, 0.3, 0.4])

        self.assertEquals(Evaluator.auc(predY, testY), 1.0)
        self.assertEquals(Evaluator.auc(predY2, testY), 1.0)
        self.assertEquals(Evaluator.auc(-predY, testY), 0.0)

        numExamples = 1000
        testY = numpy.array(numpy.random.rand(numExamples) > 0.5, numpy.int)
        predY = numpy.random.rand(numExamples) > 0.5

        #For a random score the AUC is approximately 0.5
        self.assertAlmostEquals(Evaluator.auc(predY, testY), 0.5, 1)
Example No. 20
    def computeRankMetrics(self, X, Y, indexList, bestLearners, standardiserY, labelIndex):
        #Some code to do ranking using the learner predictors
        i = 0
        rankMetrics = numpy.zeros((len(indexList), self.boundsList[labelIndex].shape[0]-1))
        for idxtr, idxts in indexList:
            logging.info("Iteration " + str(i))

            trainX, testX = X[idxtr, :], X[idxts, :]
            trainY, testY = Y[idxtr], Y[idxts]

            bestLearners[i].learnModel(trainX, trainY)
            predY = bestLearners[i].predict(testX)
            gc.collect()

            #Now output 3 sets of ranked scores
            predY = standardiserY.unstandardiseArray(predY)
            testY = standardiserY.unstandardiseArray(testY)

            YScores = MetabolomicsUtils.scoreLabels(predY, self.boundsList[labelIndex])
            YIndList = MetabolomicsUtils.createIndicatorLabel(testY, self.boundsList[labelIndex])

            for j in range(self.boundsList[labelIndex].shape[0]-1):
                rankMetrics[i, j] = Evaluator.auc(YScores[:, j], YIndList[j])
            i += 1

        logging.debug(rankMetrics)

        return rankMetrics
Example No. 21
    def testAuc(self):
        self.treeRankForest.learnModel(self.X, self.Y)
        scores = self.treeRankForest.predictScores(self.X)

        auc1 = Evaluator.auc(scores, self.Y.ravel())
        auc2 = self.treeRankForest.aucFromROC(self.treeRankForest.predictROC(self.X, self.Y))

        self.assertAlmostEquals(auc1, auc2, places=4)
Example No. 22
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]
        
        y = y*2 - 1 

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]
        
        testY = testY*2-1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50 
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRankForest = TreeRankForest(self.leafRanklearner)
            treeRankForest.setMaxDepth(maxDepth)
            treeRankForest.setMinSplit(minSplit)
            treeRankForest.setNumTrees(numTrees)
            treeRankForest.learnModel(X, y)
            trainScores = treeRankForest.predict(X)
            testScores = treeRankForest.predict(testX)

            print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1
Example No. 23
    def testPrecisionFromIndLists(self): 
        predList  = [4, 2, 10]
        testList = [4, 2]

        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 2.0/3)  
        
        testList = [4, 2, 10]
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 1) 
        
        predList  = [10, 2, 4]
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 1)
        
        testList = [1, 9, 11]
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 0)
        
        predList = [1, 2, 3, 4, 5]
        testList = [1, 9, 11]
        
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 1.0/5)
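These assertions are consistent with the precision being the fraction of predicted items that also appear in the test list, regardless of order. A minimal equivalent computation (an assumption, not the library source):

def precisionFromIndLists(testList, predList):
    #Fraction of predicted items that are present in the test list
    return len(set(predList) & set(testList)) / float(len(predList))

print(precisionFromIndLists([4, 2], [4, 2, 10]))           #0.666...
print(precisionFromIndLists([1, 9, 11], [1, 2, 3, 4, 5]))  #0.2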
Example No. 24
def computeBootstrapError(args):
    """
    Used in conjunction with the parallel model selection. Trains and then tests
    on a separate test set and evaluates the bootstrap error.
    """
    (trainX, trainY, testX, testY, learner) = args
    learner.learnModel(trainX, trainY)
    predTestY = learner.predict(testX)
    predTrainY = learner.predict(trainX)
    weight = 0.632
    return Evaluator.binaryBootstrapError(predTestY, testY, predTrainY, trainY, weight)
Example No. 25
 def testCvPrune(self): 
     numExamples = 500
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().standardiseArray(y)
     
     numTrain = int(numpy.round(numExamples * 0.33))
     numValid = int(numpy.round(numExamples * 0.33))
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     validX = X[numTrain:numTrain+numValid, :]
     validY = y[numTrain:numTrain+numValid]
     testX = X[numTrain+numValid:, :]
     testY = y[numTrain+numValid:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     #print(learner.getTree())
     unprunedTree = learner.tree.copy() 
     learner.setGamma(1000)
     learner.cvPrune(trainX, trainY)
     
     self.assertEquals(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
     learner.setGamma(100)
     learner.cvPrune(trainX, trainY)
     
     #Test if pruned tree is subtree of current: 
     for vertexId in learner.tree.getAllVertexIds(): 
         self.assertTrue(vertexId in unprunedTree.getAllVertexIds())
         
     #The error should be better after pruning 
     learner.learnModel(trainX, trainY)
     #learner.cvPrune(validX, validY, 0.0, 5)
     learner.repPrune(validX, validY)
   
     error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     self.assertTrue(error1 >= error2)
Example No. 26
    def meanAUC(self, predY, testY, labelIndex, standardiserY):
        predY = standardiserY.unstandardiseArray(predY)
        testY = standardiserY.unstandardiseArray(testY)

        YScores = MetabolomicsUtils.scoreLabels(predY, self.boundsList[labelIndex])
        YIndList = MetabolomicsUtils.createIndicatorLabel(testY, self.boundsList[labelIndex])

        rankMetrics = numpy.zeros(self.boundsList[labelIndex].shape[0]-1)

        for j in range(rankMetrics.shape[0]):
            rankMetrics[j] = Evaluator.auc(YScores[:, j], YIndList[j])

        return numpy.mean(rankMetrics)
Example No. 27
    def testBinaryBootstrapError(self):

        testY = numpy.array([-1, -1, 1, 1, 1])
        predY = 1 - testY

        trainY = numpy.array([-1, -1, 1, 1, 1])
        predTrainY = 1 - trainY

        self.assertEquals(
            Evaluator.binaryBootstrapError(testY, testY, trainY, trainY, 0.5),
            0.0)

        self.assertEquals(
            Evaluator.binaryBootstrapError(testY, testY, trainY, predTrainY,
                                           0.5), 0.5)
        self.assertEquals(
            Evaluator.binaryBootstrapError(testY, testY, trainY, predTrainY,
                                           0.1), 0.9)

        self.assertEquals(
            Evaluator.binaryBootstrapError(testY, predY, trainY, trainY, 0.1),
            0.1)
Example No. 28
    def testBayesError(self):
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
        gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

        bestError = 1 

        for C in Cs:
            for gamma in gammas:
                svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
                svm.learnModel(trainX, trainY)
                predY, decisionsY = svm.predict(gridX, True)
                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

                predY, decisionsY = svm.predict(testX, True)
                error2 = Evaluator.binaryError(testY, predY)
                print(error, error2)

                if error < bestError:
                    bestError = error
                    bestC = C
                    bestGamma = gamma

        svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
        svm.learnModel(trainX, trainY)
        predY, decisionsY = svm.predict(gridX, True)
        #Recompute the decision grid for the selected parameters before plotting
        decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

        plt.figure(0)
        plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
        plt.colorbar()

        plt.figure(1)
        plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
        plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
        plt.legend()
        plt.show()
Example No. 29
def computeIdealPenalty(args):
    """
    Find the complete penalty.
    """
    (X, y, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X) = args

    svm = LibSVM('gaussian', gamma, C)
    svm.learnModel(X, y)
    predY = svm.predict(X)
    predFullY, decisionsY = svm.predict(fullX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
    trueError = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
    idealPenalty = trueError - Evaluator.binaryError(predY, y)

    return idealPenalty
Example No. 30
    def testPrecisionFromIndLists(self):
        predList = [4, 2, 10]
        testList = [4, 2]

        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList),
                          2.0 / 3)

        testList = [4, 2, 10]
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList),
                          1)

        predList = [10, 2, 4]
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList),
                          1)

        testList = [1, 9, 11]
        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList),
                          0)

        predList = [1, 2, 3, 4, 5]
        testList = [1, 9, 11]

        self.assertEquals(Evaluator.precisionFromIndLists(testList, predList),
                          1.0 / 5)
Example No. 31
    def saveResult(self, X, Y, learner, paramDict, fileName):
        """
        Save a single result to file, checking if the results have already been computed
        """
        filelock = FileLock(fileName)
        gc.collect()

        if not filelock.isLocked() and not filelock.fileExists(): 
            filelock.lock()
            try: 
                logging.debug("Computing file " + fileName)
                logging.debug("Shape of examples: " + str(X.shape) + ", number of +1: " + str(numpy.sum(Y==1)) + ", -1: " + str(numpy.sum(Y==-1)))
                
                #idxFull = Sampling.crossValidation(self.outerFolds, X.shape[0])
                idxFull = StratifiedKFold(Y, self.outerFolds)
                errors = numpy.zeros(self.outerFolds)
                
                for i, (trainInds, testInds) in enumerate(idxFull): 
                    logging.debug("Outer fold: " + str(i))
                    
                    trainX, trainY = X[trainInds, :], Y[trainInds]
                    testX, testY = X[testInds, :], Y[testInds]
                    #idx = Sampling.crossValidation(self.innerFolds, trainX.shape[0])
                    idx = StratifiedKFold(trainY, self.innerFolds)
                    logging.debug("Initial learner is " + str(learner))
                    bestLearner, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

                    bestLearner = learner.getBestLearner(cvGrid, paramDict, trainX, trainY, idx, best="max")
                    logging.debug("Best learner is " + str(bestLearner))
                    
                    bestLearner.learnModel(trainX, trainY)
                    predY = bestLearner.predict(testX)
                    errors[i] = Evaluator.auc(predY, testY)
                
                logging.debug("Mean auc: " + str(numpy.mean(errors)))
                numpy.save(fileName, errors)
                logging.debug("Saved results as : " + fileName)
            finally: 
                filelock.unlock()
        else:
            logging.debug("File exists, or is locked: " + fileName)
Example No. 32
    def learnModelCut(self, X, Y, folds=4):
        """
        Perform model learning with tree cutting in order to choose a maximal
        depth. The best tree is chosen using cross validation and depths are
        selected from 0 to maxDepth. The best depth corresponds to the maximal
        AUC obtained using cross validation. 

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of binary labels as a 1D array
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds.
        :type folds: :class:`int`
        """

        indexList = cross_val.StratifiedKFold(Y, folds)
        depths = numpy.arange(1, self.maxDepth)
        meanAUCs = numpy.zeros(depths.shape[0])

        for trainInds, testInds in indexList:
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            fullTree = self.tree

            for i in range(fullTree.depth()):
                d = depths[i]
                self.tree = TreeRank.cut(fullTree, d)
                predTestY = self.predict(testX)

                meanAUCs[i] += Evaluator.auc(predTestY, testY)/float(folds)

        bestDepth = depths[numpy.argmax(meanAUCs)]
        self.learnModel(X, Y)
        self.tree = TreeRank.cut(self.tree, bestDepth)
Example No. 33
    def greedyMC2(lists, itemList, trainList, n): 
        """
        A method to greedily select a subset of the input lists such that
        the average precision is maximised
        """
        currentListsInds = list(range(len(lists)))
        newListsInds = []
        currentAvPrecision = 0 
        lastAvPrecision = -0.1
        
        while currentAvPrecision - lastAvPrecision > 0: 
            lastAvPrecision = currentAvPrecision 
            averagePrecisions = numpy.zeros(len(currentListsInds))      
            
            for i, j in enumerate(currentListsInds):
                newListsInds.append(j)

                newLists = []                
                for k in newListsInds: 
                    newLists.append(lists[k])
                
                rankAggregate, scores = RankAggregator.MC2(newLists, itemList)
                averagePrecisions[i] = Evaluator.averagePrecisionFromLists(trainList, rankAggregate[0:n], n)
                newListsInds.remove(j)

            j = numpy.argmax(averagePrecisions)
            currentAvPrecision = averagePrecisions[j]
            
            if currentAvPrecision > lastAvPrecision: 
                newListsInds.append(currentListsInds.pop(j))
            
        return newListsInds
Example No. 34
#Figure out why the penalty is increasing 
X = trainX 
y = trainY 

for i in range(foldsSet.shape[0]): 
    folds = foldsSet[i]
    idx = Sampling.crossValidation(folds, validX.shape[0])
    
    penalty = 0
    fullError = 0 
    trainError = 0     
    
    learner.learnModel(validX, validY)
    predY = learner.predict(X)
    predValidY = learner.predict(validX)
    idealPenalty = Evaluator.rootMeanSqError(predY, y) - Evaluator.rootMeanSqError(predValidY, validY)
    
    for trainInds, testInds in idx:
        trainX = validX[trainInds, :]
        trainY = validY[trainInds]
    
        #learner.setGamma(gamma)
        #learner.setC(C)
        learner.learnModel(trainX, trainY)
        predY = learner.predict(validX)
        predTrainY = learner.predict(trainX)
        fullError += Evaluator.rootMeanSqError(predY, validY)
        trainError += Evaluator.rootMeanSqError(predTrainY, trainY)
        penalty += Evaluator.rootMeanSqError(predY, validY) - Evaluator.rootMeanSqError(predTrainY, trainY)
        
    print((folds-1)*fullError/folds, (folds-1)*trainError/folds, (folds-1)*penalty/folds)
Example No. 35

tau = 1.0
lmbda = 0.1
linearKernel = LinearKernel()
permutationKernel = PermutationGraphKernel(tau, linearKernel)
randomWalkKernel = RandWalkGraphKernel(lmbda)

K1 = numpy.zeros((numGraphs, numGraphs))
K2 = numpy.zeros((numGraphs, numGraphs))

for i in range(0, numGraphs):
    print(("i="+str(i)))
    for j in range(0, numGraphs):
        print(("j="+str(j)))
        K1[i, j] = permutationKernel.evaluate(graphs[i], graphs[j])
        K2[i, j] = randomWalkKernel.evaluate(graphs[i], graphs[j])

D1 = KernelUtils.computeDistanceMatrix(K1)
D2 = KernelUtils.computeDistanceMatrix(K2)

numPairs = numGraphs // 2
windowSize = 3
pairIndices = numpy.array([list(range(numPairs)),  list(range(numPairs))]).T
pairIndices[:, 1] = numPairs + pairIndices[:, 1]

error1 = Evaluator.evaluateWindowError(D1, windowSize, pairIndices)
error2 = Evaluator.evaluateWindowError(D2, windowSize, pairIndices)

print(("Error 1: " + str(error1)))
print(("Error 2: " + str(error2)))
Example No. 36
    def testPredict(self):
        rankBoost = RankBoost()
        rankBoost.learnModel(self.X, self.y)
        predY = rankBoost.predict(self.X)

        self.assertTrue(Evaluator.auc(predY, self.y) <= 1.0 and Evaluator.auc(predY, self.y) >= 0.0)
Example No. 37
 def testGrowTree(self):
     startId = (0, )
     minSplit = 20
     maxDepth = 3
     gamma = 0.01
     learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False) 
     
     trainX = self.X[100:, :]
     trainY = self.y[100:]
     testX = self.X[0:100, :]
     testY = self.y[0:100]    
     
     argsortX = numpy.zeros(trainX.shape, numpy.int)
     for i in range(trainX.shape[1]): 
         argsortX[:, i] = numpy.argsort(trainX[:, i])
         argsortX[:, i] = numpy.argsort(argsortX[:, i])
     
     learner.tree = DictTree()
     rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
     learner.tree.setVertex(startId, rootNode)        
     
     #Note that this matches with the case where we create a new tree each time 
     numpy.random.seed(21)
     bestError = float("inf")        
     
     for i in range(20): 
         learner.tree.pruneVertex(startId)
         learner.growTree(trainX, trainY, argsortX, startId)
         
         predTestY = learner.predict(testX)
         error = Evaluator.binaryError(predTestY, testY)
         #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())
         
         if error < bestError: 
             bestError = error 
             bestTree = learner.tree.copy() 
         
         self.assertTrue(learner.tree.depth() <= maxDepth)
         
         for vertexId in learner.tree.nonLeaves(): 
             self.assertTrue(learner.tree.getVertex(vertexId).getTrainInds().shape[0] >= minSplit)
     
     bestError1 = bestError               
     learner.tree = bestTree    
     
     #Now we test growing a tree from a non-root vertex 
     numpy.random.seed(21)
     for i in range(20): 
         learner.tree.pruneVertex((0, 1)) 
         learner.growTree(trainX, trainY, argsortX, (0, 1))
         
         self.assertTrue(learner.tree.getVertex((0,)) == bestTree.getVertex((0,)))
         self.assertTrue(learner.tree.getVertex((0,0)) == bestTree.getVertex((0,0)))
         
         
         predTestY = learner.predict(testX)
         error = Evaluator.binaryError(predTestY, testY)
         
         if error < bestError: 
             bestError = error 
             bestTree = learner.tree.copy() 
         #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())
     self.assertTrue(bestError1 >= bestError )
Example No. 38
        minAlpha = alpha 
    if alpha > maxAlpha: 
        maxAlpha = alpha 
        
numAlphas = 100
alphas = numpy.linspace(maxAlpha+0.1, minAlpha, numAlphas)
errors = numpy.zeros(numAlphas)

for i in range(alphas.shape[0]): 
    #learner.learnModel(trainX, trainY)
    learner.setAlphaThreshold(alphas[i])
    learner.cvPrune(trainX, trainY)
    #learner.cvPrune(validX, validY, alphas[numpy.argmin(errors)])
    #learner.prune(validX, validY, alphas[i])
    predY = learner.predict(testX)
    errors[i] = Evaluator.rootMeanSqError(predY, testY)
    
plt.figure(3)
plt.scatter(alphas, errors)

#Now plot best tree 
plt.figure(4)
learner.learnModel(trainX, trainY)
#learner.cvPrune(validX, validY, alphas[numpy.argmin(errors)])
learner.setAlphaThreshold(alphas[numpy.argmin(errors)])
learner.cvPrune(trainX, trainY)
rootId = learner.tree.getRootId()
displayTree(learner, rootId, 0, 1, 0, 1, colormap)

plt.show()
    
Example No. 39
    def testModelSelect(self): 
        
        """
        We test the results on some data and compare to SVR. 
        """
        numExamples = 200
        X, y = data.make_regression(numExamples, noise=0.5)  
        
        X = Standardiser().standardiseArray(X)
        y = Standardiser().standardiseArray(y)
        
        trainX = X[0:100, :]
        trainY = y[0:100]
        testX = X[100:, :]
        testY = y[100:]
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
        learner.setPruneCV(8)
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10) 
        paramDict["setPruneCV"] = numpy.arange(6, 11, 2, numpy.int)
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
        
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50) 
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
              
        return 
        #Let's compare to the SVM 
        learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR") 
        
        paramDict = {} 
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float)
        paramDict["setEpsilon"] = learner2.getEpsilons()
        
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestSVM.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
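parallelModelSelect above searches the grid defined by paramDict over the cross validation folds in idx and returns the best learner. As a rough modern analogue (a different library, not the same API), scikit-learn's GridSearchCV performs the same kind of grid search over a decision tree; a minimal sketch:

import numpy
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, noise=0.5)
trainX, trainY = X[0:100, :], y[0:100]
testX, testY = X[100:, :], y[100:]

#Exhaustive search over the parameter grid using 5-fold cross validation
paramGrid = {"max_depth": [5, 10, 20], "min_samples_split": [5, 10]}
search = GridSearchCV(DecisionTreeRegressor(), paramGrid, cv=5)
search.fit(trainX, trainY)
predY = search.best_estimator_.predict(testX)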