def testPredict2(self):
    """
    Learn a TreeRank model on the Gauss2D training file for several maximum
    depths and compare the train and test AUCs with stored reference values.
    """
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    #Train and test examples are standardised independently of each other
    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    #The reference arrays hold 8 values but only the first 7 are compared,
    #one for each depth in maxDepths
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])

    #The results are approximately the same, but not exactly
    for maxDepth, trainAuc, testAuc in zip(maxDepths, trainAucs, testAucs):
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)
        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        #assertAlmostEquals is a deprecated alias that newer unittest versions no longer provide
        self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAuc, 2)
        self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAuc, 1)
def testSetC(self):
    """
    Check that learning a RankSVM with two different values of C gives
    two different AUCs on the training data.
    """
    rankSVM = RankSVM()

    rankSVM.setC(100.0)
    rankSVM.learnModel(self.X, self.y)
    predY = rankSVM.predict(self.X)
    auc1 = Evaluator.auc(predY, self.y)

    rankSVM.setC(0.1)
    rankSVM.learnModel(self.X, self.y)
    predY = rankSVM.predict(self.X)
    auc2 = Evaluator.auc(predY, self.y)

    #assertNotEqual reports both values when the check fails
    self.assertNotEqual(auc1, auc2)
def evaluateCvOuter(self, X, Y, folds, leafRank):
    """
    Cross validate this learner over stratified folds, recording the AUC and
    the ROC curve on both the training and the test part of every fold.

    :param X: The examples, one per row
    :type X: :class:`ndarray`

    :param Y: The labels as a 1D array
    :type Y: :class:`ndarray`

    :param folds: The number of folds, validated with Parameter.checkInt using the bounds 2 and infinity

    :param leafRank: Passed to setLeafRank before any model is learnt

    :return: A tuple (bestParams, allMetrics, bestMetaDicts). bestParams is an empty list, allMetrics is the list [train AUCs, train ROCs, test AUCs, test ROCs] and bestMetaDicts contains one empty dict per fold.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    foldIndices = cross_val.StratifiedKFold(Y, folds)
    self.setLeafRank(leafRank)

    #Nothing is ever added to this list, it is returned empty
    params = []
    trainAUCs = numpy.zeros(folds)
    trainROCs = []
    testAUCs = numpy.zeros(folds)
    testROCs = []
    metaDicts = []

    for foldNum, (trainInds, testInds) in enumerate(foldIndices):
        Util.printIteration(foldNum, 1, folds)

        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
        logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

        self.learnModel(trainX, trainY)
        trainScores = self.predict(trainX)
        testScores = self.predict(testX)

        trainAUCs[foldNum] = Evaluator.auc(trainScores, trainY)
        testAUCs[foldNum] = Evaluator.auc(testScores, testY)

        #Note that roc takes the labels first whereas auc takes the scores first
        trainROCs.append(Evaluator.roc(trainY, trainScores))
        testROCs.append(Evaluator.roc(testY, testScores))

        #A placeholder meta dict for this fold
        metaDicts.append({})

    logging.debug("Mean test AUC = " + str(numpy.mean(testAUCs)))
    logging.debug("Std test AUC = " + str(numpy.std(testAUCs)))

    return (params, [trainAUCs, trainROCs, testAUCs, testROCs], metaDicts)
def testAuc(self):
    """
    Check Evaluator.auc on small hand-made examples with known AUCs and on
    random binary predictions, for which the AUC should be close to 0.5.
    """
    testY = numpy.array([-1, -1, 1, 1])
    predY = numpy.array([-1, 0, 1, 1])
    predY2 = numpy.array([0.1, 0.2, 0.3, 0.4])

    #assertEquals is a deprecated alias that newer unittest versions no longer provide
    self.assertEqual(Evaluator.auc(predY, testY), 1.0)
    self.assertEqual(Evaluator.auc(predY2, testY), 1.0)
    self.assertEqual(Evaluator.auc(-predY, testY), 0.0)

    numExamples = 1000
    #numpy.int was only an alias of the builtin int and no longer exists in recent numpy
    testY = numpy.array(numpy.random.rand(numExamples)>0.5, int)
    predY = numpy.random.rand(numExamples)>0.5

    #For a random score the AUC is approximately 0.5
    self.assertAlmostEqual(Evaluator.auc(predY, testY), 0.5, 1)
def testAuc(self):
    """
    Check Evaluator.auc on small hand-made examples with known AUCs and on
    random binary predictions, for which the AUC should be close to 0.5.
    """
    testY = numpy.array([-1, -1, 1, 1])
    predY = numpy.array([-1, 0, 1, 1])
    predY2 = numpy.array([0.1, 0.2, 0.3, 0.4])

    #assertEquals is a deprecated alias that newer unittest versions no longer provide
    self.assertEqual(Evaluator.auc(predY, testY), 1.0)
    self.assertEqual(Evaluator.auc(predY2, testY), 1.0)
    self.assertEqual(Evaluator.auc(-predY, testY), 0.0)

    numExamples = 1000
    #numpy.int was only an alias of the builtin int and no longer exists in recent numpy
    testY = numpy.array(numpy.random.rand(numExamples) > 0.5, int)
    predY = numpy.random.rand(numExamples) > 0.5

    #For a random score the AUC is approximately 0.5
    self.assertAlmostEqual(Evaluator.auc(predY, testY), 0.5, 1)
def computeRankMetrics(self, X, Y, indexList, bestLearners, standardiserY, labelIndex):
    """
    For each (train, test) split of indexList, learn the learner at the same
    position in bestLearners on the training part, predict the test part and
    compute one AUC per column of the scores derived from the predictions.

    :param X: The examples, one per row

    :param Y: The labels, indexed like the rows of X

    :param indexList: A sequence of (train indices, test indices) pairs

    :param bestLearners: One learner per split

    :param standardiserY: Used to unstandardise the predicted and the test labels

    :param labelIndex: The index into self.boundsList of the bounds to use

    :return: An array with one row per split and self.boundsList[labelIndex].shape[0]-1 columns of AUCs
    """
    #Some code to do ranking using the learner predictors
    numSplits = len(indexList)
    bounds = self.boundsList[labelIndex]
    numRanks = bounds.shape[0]-1
    rankMetrics = numpy.zeros((numSplits, numRanks))

    for i, (idxtr, idxts) in enumerate(indexList):
        logging.info("Iteration " + str(i))

        trainX = X[idxtr, :]
        testX = X[idxts, :]
        trainY = Y[idxtr]
        testY = Y[idxts]

        learner = bestLearners[i]
        learner.learnModel(trainX, trainY)
        predY = learner.predict(testX)
        gc.collect()

        #Score on the original scale of the labels
        predY = standardiserY.unstandardiseArray(predY)
        testY = standardiserY.unstandardiseArray(testY)

        YScores = MetabolomicsUtils.scoreLabels(predY, bounds)
        YIndList = MetabolomicsUtils.createIndicatorLabel(testY, bounds)

        for j in range(numRanks):
            rankMetrics[i, j] = Evaluator.auc(YScores[:, j], YIndList[j])

    logging.debug(rankMetrics)

    return rankMetrics
def testAuc(self):
    """
    Check that the AUC computed from the predicted scores agrees, to 4
    decimal places, with the AUC computed from the predicted ROC curve.
    """
    self.treeRankForest.learnModel(self.X, self.Y)
    scores = self.treeRankForest.predictScores(self.X)

    auc1 = Evaluator.auc(scores, self.Y.ravel())
    auc2 = self.treeRankForest.aucFromROC(self.treeRankForest.predictROC(self.X, self.Y))

    #assertAlmostEquals is a deprecated alias that newer unittest versions no longer provide
    self.assertAlmostEqual(auc1, auc2, places=4)
def testLocalAuc(self):
    """
    Check Evaluator.localAuc: with a third argument of 1.0 it must equal
    Evaluator.auc, with 0.0 it must be 0, and scoring the labels with
    themselves must give 1.0.
    """
    testY = numpy.array([-1, -1, 1, 1, 1, 1, 1, -1, -1, 1])
    predY = numpy.array([0.987, 0.868, 0.512, 0.114, 0.755, 0.976, 0.05, 0.371, 0.629, 0.819])

    #assertEquals is a deprecated alias that newer unittest versions no longer provide
    #Note that localAuc takes the labels first whereas auc takes the scores first
    self.assertEqual(Evaluator.localAuc(testY, predY, 1.0), Evaluator.auc(predY, testY))
    self.assertEqual(Evaluator.localAuc(testY, predY, 0.0), 0)
    self.assertEqual(Evaluator.localAuc(testY, testY, 0.2), 1.0)
def testPredict2(self):
    """
    Learn a TreeRankForest on the Gauss2D training file for several maximum
    depths and compare the train and test AUCs with stored reference values.
    """
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]
    #Rescale the labels with y -> 2y - 1
    y = y*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]
    testY = testY*2-1

    #Train and test examples are standardised independently of each other
    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    maxDepths = range(3, 10)
    #The reference arrays hold 8 values but only the first 7 are compared,
    #one for each depth in maxDepths
    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

    #The results are approximately the same, but not exactly
    for maxDepth, expectedTrainAuc, expectedTestAuc in zip(maxDepths, trainAucs, testAucs):
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)
        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)

        #Compute each AUC once and use it for both the output and the check
        trainAuc = Evaluator.auc(trainScores, y)
        testAuc = Evaluator.auc(testScores, testY)
        print(trainAuc, testAuc)

        #assertAlmostEquals is a deprecated alias that newer unittest versions no longer provide
        self.assertAlmostEqual(trainAuc, expectedTrainAuc, 1)
        self.assertAlmostEqual(testAuc, expectedTestAuc, 1)
def testLocalAuc(self):
    """
    Check Evaluator.localAuc: with a third argument of 1.0 it must equal
    Evaluator.auc, with 0.0 it must be 0, and scoring the labels with
    themselves must give 1.0.
    """
    testY = numpy.array([-1, -1, 1, 1, 1, 1, 1, -1, -1, 1])
    predY = numpy.array([
        0.987, 0.868, 0.512, 0.114, 0.755, 0.976, 0.05, 0.371, 0.629, 0.819
    ])

    #assertEquals is a deprecated alias that newer unittest versions no longer provide
    #Note that localAuc takes the labels first whereas auc takes the scores first
    self.assertEqual(Evaluator.localAuc(testY, predY, 1.0), Evaluator.auc(predY, testY))
    self.assertEqual(Evaluator.localAuc(testY, predY, 0.0), 0)
    self.assertEqual(Evaluator.localAuc(testY, testY, 0.2), 1.0)
def meanAUC(self, predY, testY, labelIndex, standardiserY):
    """
    Unstandardise the predicted and true labels, convert them to scores and
    indicator labels using the bounds at labelIndex, and return the mean of
    the AUCs computed for each score column.

    :param predY: The predicted labels, in standardised form

    :param testY: The true labels, in standardised form

    :param labelIndex: The index into self.boundsList of the bounds to use

    :param standardiserY: Used to unstandardise predY and testY

    :return: The mean over self.boundsList[labelIndex].shape[0]-1 AUC values
    """
    bounds = self.boundsList[labelIndex]

    rawPredY = standardiserY.unstandardiseArray(predY)
    rawTestY = standardiserY.unstandardiseArray(testY)

    scores = MetabolomicsUtils.scoreLabels(rawPredY, bounds)
    indicators = MetabolomicsUtils.createIndicatorLabel(rawTestY, bounds)

    numRanks = bounds.shape[0]-1
    aucs = numpy.zeros(numRanks)

    #Column j of the scores is evaluated against the j-th indicator label
    for j in range(numRanks):
        aucs[j] = Evaluator.auc(scores[:, j], indicators[j])

    return numpy.mean(aucs)
def saveResult(self, X, Y, learner, paramDict, fileName):
    """
    Compute the AUC of a learner on each outer stratified fold, selecting its
    parameters on inner stratified folds of the training part, and save the
    array of AUCs to fileName. Nothing is computed if the file lock reports
    that the file is locked or already exists.

    :param X: The examples, one per row

    :param Y: The labels; the counts of +1 and -1 labels are logged

    :param learner: The learner used for model selection over paramDict

    :param paramDict: Passed to the parallelModelSelect and getBestLearner methods of the learner

    :param fileName: The file to save the results to, which is also the file that is locked
    """
    filelock = FileLock(fileName)
    gc.collect()

    #Another process is working on this file, or the results are already there
    if filelock.isLocked() or filelock.fileExists():
        logging.debug("File exists, or is locked: " + fileName)
        return

    filelock.lock()
    try:
        logging.debug("Computing file " + fileName)
        shapeStr = str(X.shape)
        numPositive = str(numpy.sum(Y==1))
        numNegative = str(numpy.sum(Y==-1))
        logging.debug("Shape of examples: " + shapeStr + ", number of +1: " + numPositive + ", -1: " + numNegative)

        outerSplits = StratifiedKFold(Y, self.outerFolds)
        #Despite the name these are AUCs, one per outer fold
        errors = numpy.zeros(self.outerFolds)

        for i, (trainInds, testInds) in enumerate(outerSplits):
            logging.debug("Outer fold: " + str(i))

            trainX = X[trainInds, :]
            trainY = Y[trainInds]
            testX = X[testInds, :]
            testY = Y[testInds]

            innerSplits = StratifiedKFold(trainY, self.innerFolds)
            logging.debug("Initial learner is " + str(learner))

            #Only the grid of results is kept, the best learner is then picked with best="max"
            _, cvGrid = learner.parallelModelSelect(trainX, trainY, innerSplits, paramDict)
            bestLearner = learner.getBestLearner(cvGrid, paramDict, trainX, trainY, innerSplits, best="max")
            logging.debug("Best learner is " + str(bestLearner))

            bestLearner.learnModel(trainX, trainY)
            predY = bestLearner.predict(testX)
            errors[i] = Evaluator.auc(predY, testY)

        logging.debug("Mean auc: " + str(numpy.mean(errors)))
        numpy.save(fileName, errors)
        logging.debug("Saved results as : " + fileName)
    finally:
        #Always release the lock, even when the computation fails
        filelock.unlock()
def learnModelCut(self, X, Y, folds=4):
    """
    Learn a tree and then cut it at a depth chosen by cross validation. For
    each stratified fold a full tree is learnt on the training part, cut at
    the candidate depths 1, ..., maxDepth-1 and scored by its AUC on the test
    part. The model is then learnt on all of the examples and cut at the
    candidate depth with the largest accumulated AUC.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of binary labels as a 1D array
    :type Y: :class:`ndarray`

    :param folds: The number of cross validation folds.
    :type folds: :class:`int`
    """
    splits = cross_val.StratifiedKFold(Y, folds)
    candidateDepths = numpy.arange(1, self.maxDepth)
    cvAUCs = numpy.zeros(candidateDepths.shape[0])

    for trainInds, testInds in splits:
        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        self.learnModel(trainX, trainY)
        uncutTree = self.tree

        #NOTE(review): the number of cuts is driven by the depth of the learnt
        #tree, so candidateDepths[i] looks like it would be out of range if
        #that depth exceeds maxDepth-1 - confirm what depth() returns
        for i in range(uncutTree.depth()):
            cutDepth = candidateDepths[i]
            self.tree = TreeRank.cut(uncutTree, cutDepth)
            foldAUC = Evaluator.auc(self.predict(testX), testY)
            #Each fold contributes 1/folds of its AUC
            cvAUCs[i] += foldAUC/float(folds)

    bestDepth = candidateDepths[numpy.argmax(cvAUCs)]

    #Relearn using all of the examples before making the final cut
    self.learnModel(X, Y)
    self.tree = TreeRank.cut(self.tree, bestDepth)
def testPredict(self):
    """
    Check that the AUC of the RankBoost predictions on the training
    examples lies between 0 and 1 inclusive.
    """
    rankBoost = RankBoost()
    rankBoost.learnModel(self.X, self.y)
    predY = rankBoost.predict(self.X)

    #Compute the AUC once; the dedicated assertions report the value on failure
    auc = Evaluator.auc(predY, self.y)
    self.assertLessEqual(auc, 1.0)
    self.assertGreaterEqual(auc, 0.0)