Example No. 1
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
        testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAucs[i], places=2)
            self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAucs[i], places=1)
            i += 1
Example No. 2
    def testSetC(self):
        rankSVM = RankSVM()
        rankSVM.setC(100.0)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        auc1 = Evaluator.auc(predY, self.y)

        rankSVM.setC(0.1)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        auc2 = Evaluator.auc(predY, self.y)

        self.assertNotEqual(auc1, auc2)
Example No. 3
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        #Old scikit-learn cross_validation API: StratifiedKFold(y, folds) is directly iterable
        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
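The method returns a triple of per-fold results. A minimal usage sketch, borrowing TreeRank and leafRanklearner from the other examples on this page (their availability here is an assumption, not verified against the library):

    learner = TreeRank(leafRanklearner)
    bestParams, allMetrics, metaDicts = learner.evaluateCvOuter(X, Y, 5, leafRanklearner)
    #allMetrics packs the per-fold metrics in this order
    trainAucs, trainRocs, testAucs, testRocs = allMetrics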
Example No. 4
    def testAuc(self):
        testY = numpy.array([-1, -1, 1, 1])
        predY = numpy.array([-1, 0, 1, 1])
        predY2 = numpy.array([0.1, 0.2, 0.3, 0.4])

        self.assertEqual(Evaluator.auc(predY, testY), 1.0)
        self.assertEqual(Evaluator.auc(predY2, testY), 1.0)
        self.assertEqual(Evaluator.auc(-predY, testY), 0.0)

        numExamples = 1000
        testY = numpy.array(numpy.random.rand(numExamples) > 0.5, int)
        predY = numpy.random.rand(numExamples) > 0.5

        #For a random score the AUC is approximately 0.5
        self.assertAlmostEqual(Evaluator.auc(predY, testY), 0.5, places=1)
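The assertions above pin down the convention used throughout these examples: Evaluator.auc(predY, testY) takes scores first and labels second, with label 1 marking positives, and equals the probability that a random positive outscores a random negative. A minimal reference sketch of that quantity (an illustration, not the library's actual implementation), counting ties as one half:

    import numpy

    def rank_auc(scores, labels):
        #Empirical AUC: fraction of (positive, negative) pairs in which the
        #positive example gets the strictly higher score; ties count as 0.5
        pos = scores[labels == 1]
        neg = scores[labels != 1]
        greater = (pos[:, None] > neg[None, :]).sum()
        ties = (pos[:, None] == neg[None, :]).sum()
        return (greater + 0.5*ties)/float(pos.size*neg.size)

On the fixture above, rank_auc(predY, testY) and rank_auc(predY2, testY) both give 1.0 and rank_auc(-predY, testY) gives 0.0, matching the assertions.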
Example No. 5
    def computeRankMetrics(self, X, Y, indexList, bestLearners, standardiserY, labelIndex):
        #Some code to do ranking using the learner predictors
        i = 0
        rankMetrics = numpy.zeros((len(indexList), self.boundsList[labelIndex].shape[0]-1))
        for idxtr, idxts in indexList:
            logging.info("Iteration " + str(i))

            trainX, testX = X[idxtr, :], X[idxts, :]
            trainY, testY = Y[idxtr], Y[idxts]

            bestLearners[i].learnModel(trainX, trainY)
            predY = bestLearners[i].predict(testX)
            gc.collect()

            #Now output 3 sets of ranked scores
            predY = standardiserY.unstandardiseArray(predY)
            testY = standardiserY.unstandardiseArray(testY)

            YScores = MetabolomicsUtils.scoreLabels(predY, self.boundsList[labelIndex])
            YIndList = MetabolomicsUtils.createIndicatorLabel(testY, self.boundsList[labelIndex])

            for j in range(self.boundsList[labelIndex].shape[0]-1):
                rankMetrics[i, j] = Evaluator.auc(YScores[:, j], YIndList[j])
            i += 1

        logging.debug(rankMetrics)

        return rankMetrics
Example No. 6
    def testAuc(self):
        self.treeRankForest.learnModel(self.X, self.Y)
        scores = self.treeRankForest.predictScores(self.X)

        auc1 = Evaluator.auc(scores, self.Y.ravel())
        auc2 = self.treeRankForest.aucFromROC(self.treeRankForest.predictROC(self.X, self.Y))

        self.assertAlmostEqual(auc1, auc2, places=4)
Example No. 7
    def testLocalAuc(self):
        testY = numpy.array([-1, -1, 1, 1, 1, 1, 1, -1, -1, 1])
        predY = numpy.array([0.987,  0.868,  0.512,  0.114,  0.755,  0.976,  0.05,  0.371, 0.629,  0.819])

        self.assertEqual(Evaluator.localAuc(testY, predY, 1.0), Evaluator.auc(predY, testY))
        self.assertEqual(Evaluator.localAuc(testY, predY, 0.0), 0)

        self.assertEqual(Evaluator.localAuc(testY, testY, 0.2), 1.0)
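Note the flipped argument order: localAuc takes the labels first. The assertions suggest it restricts the AUC to the top-scoring fraction u of the examples, so u = 1.0 recovers the full AUC and u = 0.0 gives zero. One plausible sketch that reproduces the three assertions above (an assumption about the definition, not the library's code):

    import numpy

    def local_auc(testY, predY, u):
        #Count (positive, negative) pairs won by a positive whose score lies
        #in the top u-fraction of all scores; normalise by all pos/neg pairs
        if u <= 0.0:
            return 0.0
        pos = predY[testY == 1]
        neg = predY[testY != 1]
        t = numpy.percentile(predY, 100*(1 - u))
        hits = ((pos[:, None] > neg[None, :]) & (pos[:, None] >= t)).sum()
        return hits/float(pos.size*neg.size)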
Example No. 8
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]
        
        y = y*2 - 1 #Map labels from {0, 1} to {-1, +1}

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]
        
        testY = testY*2 - 1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50 
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRankForest = TreeRankForest(self.leafRanklearner)
            treeRankForest.setMaxDepth(maxDepth)
            treeRankForest.setMinSplit(minSplit)
            treeRankForest.setNumTrees(numTrees)
            treeRankForest.learnModel(X, y)
            trainScores = treeRankForest.predict(X)
            testScores = treeRankForest.predict(testX)

            print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAucs[i], places=1)
            self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAucs[i], places=1)
            i += 1
Example No. 9
    def meanAUC(self, predY, testY, labelIndex, standardiserY):
        predY = standardiserY.unstandardiseArray(predY)
        testY = standardiserY.unstandardiseArray(testY)

        YScores = MetabolomicsUtils.scoreLabels(predY, self.boundsList[labelIndex])
        YIndList = MetabolomicsUtils.createIndicatorLabel(testY, self.boundsList[labelIndex])

        rankMetrics = numpy.zeros(self.boundsList[labelIndex].shape[0]-1)

        for j in range(rankMetrics.shape[0]):
            rankMetrics[j] = Evaluator.auc(YScores[:, j], YIndList[j])

        return numpy.mean(rankMetrics)
Example No. 10
    def saveResult(self, X, Y, learner, paramDict, fileName):
        """
        Save a single result to file, checking if the results have already been computed
        """
        filelock = FileLock(fileName)
        gc.collect()

        if not filelock.isLocked() and not filelock.fileExists(): 
            filelock.lock()
            try: 
                logging.debug("Computing file " + fileName)
                logging.debug("Shape of examples: " + str(X.shape) + ", number of +1: " + str(numpy.sum(Y==1)) + ", -1: " + str(numpy.sum(Y==-1)))
                
                #idxFull = Sampling.crossValidation(self.outerFolds, X.shape[0])
                idxFull = StratifiedKFold(Y, self.outerFolds)
                errors = numpy.zeros(self.outerFolds)
                
                for i, (trainInds, testInds) in enumerate(idxFull): 
                    logging.debug("Outer fold: " + str(i))
                    
                    trainX, trainY = X[trainInds, :], Y[trainInds]
                    testX, testY = X[testInds, :], Y[testInds]
                    #idx = Sampling.crossValidation(self.innerFolds, trainX.shape[0])
                    idx = StratifiedKFold(trainY, self.innerFolds)
                    logging.debug("Initial learner is " + str(learner))
                    bestLearner, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

                    bestLearner = learner.getBestLearner(cvGrid, paramDict, trainX, trainY, idx, best="max")
                    logging.debug("Best learner is " + str(bestLearner))
                    
                    bestLearner.learnModel(trainX, trainY)
                    predY = bestLearner.predict(testX)
                    errors[i] = Evaluator.auc(predY, testY)
                
                logging.debug("Mean auc: " + str(numpy.mean(errors)))
                numpy.save(fileName, errors)
                logging.debug("Saved results as : " + fileName)
            finally: 
                filelock.unlock()
        else:
            logging.debug("File exists, or is locked: " + fileName)
Example No. 11
    def learnModelCut(self, X, Y, folds=4):
        """
        Perform model learning with tree cutting in order to choose a maximal
        depth. The best tree is chosen using cross validation, with candidate
        depths ranging from 1 to maxDepth - 1. The best depth corresponds to
        the maximal AUC obtained using cross validation.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of binary labels as a 1D array
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds.
        :type folds: :class:`int`
        """

        indexList = cross_val.StratifiedKFold(Y, folds)
        depths = numpy.arange(1, self.maxDepth)
        meanAUCs = numpy.zeros(depths.shape[0])

        for trainInds, testInds in indexList:
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            fullTree = self.tree

            for i in range(fullTree.depth()):
                d = depths[i]
                self.tree = TreeRank.cut(fullTree, d)
                predTestY = self.predict(testX)

                meanAUCs[i] += Evaluator.auc(predTestY, testY)/float(folds)

        bestDepth = depths[numpy.argmax(meanAUCs)]
        self.learnModel(X, Y)
        self.tree = TreeRank.cut(self.tree, bestDepth)
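A short usage sketch for the cutting procedure, reusing names from the earlier TreeRank examples (their availability here is an assumption):

    treeRank = TreeRank(leafRanklearner)
    treeRank.setMaxDepth(10)
    #Cross-validate depths 1..maxDepth-1, then refit on all data and cut at the best depth
    treeRank.learnModelCut(X, y, folds=4)
    testScores = treeRank.predict(testX)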
Example No. 12
    def testPredict(self):
        rankBoost = RankBoost()
        rankBoost.learnModel(self.X, self.y)
        predY = rankBoost.predict(self.X)

        auc = Evaluator.auc(predY, self.y)
        self.assertTrue(0.0 <= auc <= 1.0)