def testPredict2(self):
    """
    Train TreeRank on the Gauss2D learning set for several maximum depths and
    compare the train and test AUCs with hard-coded expected values.
    """
    # Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    # Train and test examples are each passed through their own Standardiser
    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array(
        [0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508]
    )
    testAucs = numpy.array(
        [0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400]
    )

    # The results are approximately the same, but not exactly.
    # zip stops at the shortest sequence, so the expected values beyond
    # len(maxDepths) are not checked (as in the index-based version).
    for maxDepth, trainAuc, testAuc in zip(maxDepths, trainAucs, testAucs):
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)
        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias
        self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAuc, places=2)
        self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAuc, places=1)
def testPredict(self):
    """
    Check that RankBoost predictions on the training examples give an AUC
    inside the closed interval [0, 1].
    """
    rankBoost = RankBoost()
    rankBoost.learnModel(self.X, self.y)
    predY = rankBoost.predict(self.X)

    # Evaluate the AUC once and reuse it for both bounds
    auc = Evaluator.auc(predY, self.y)
    self.assertTrue(0.0 <= auc <= 1.0, "AUC out of range: " + str(auc))
def evaluateCvOuter(self, X, Y, folds):
    """
    Evaluate this learner using stratified cross validation, recording the AUC
    and the ROC curve on the train and test part of every fold. In this case
    Y is a 1D array.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of labels
    :type Y: :class:`ndarray`

    :param folds: The number of cross validation folds
    :type folds: :class:`int`

    :return: A tuple (bestParams, allMetrics, bestMetaDicts). bestParams is an empty list here, allMetrics is [train AUCs, train ROCs, test AUCs, test ROCs] and bestMetaDicts holds one empty dict per fold.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    cvSplits = cross_val.StratifiedKFold(Y, folds)

    # No parameters are selected in this method, so this list stays empty
    bestParams = []
    bestMetaDicts = []
    bestTrainROCs = []
    bestTestROCs = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTestAUCs = numpy.zeros(folds)

    for foldInd, (trainInds, testInds) in enumerate(cvSplits):
        Util.printIteration(foldInd, 1, folds, "Outer CV: ")
        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        self.learnModel(trainX, trainY)
        #self.learnModelCut(trainX, trainY)

        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)
        bestTrainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[foldInd] = Evaluator.auc(predTestY, testY)

        #Store the ROC curves and a (here empty) meta dictionary for the fold
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))
        bestMetaDicts.append({})

    meanTestAUC = numpy.mean(bestTestAUCs)
    stdTestAUC = numpy.std(bestTestAUCs)
    logging.debug("Mean test AUC = " + str(meanTestAUC))
    logging.debug("Std test AUC = " + str(stdTestAUC))

    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]
    return (bestParams, allMetrics, bestMetaDicts)
def testSetC(self):
    """
    Check that training RankSVM with two different values of C gives two
    different AUCs on the training examples.
    """
    rankSVM = RankSVM()

    aucs = []
    for C in (100.0, 0.1):
        rankSVM.setC(C)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        aucs.append(Evaluator.auc(predY, self.y))

    # assertNotEqual reports both values when the check fails
    self.assertNotEqual(aucs[0], aucs[1])
def evaluateCvOuter(self, X, Y, folds, leafRank):
    """
    Evaluate this learner with the given leaf rank using stratified cross
    validation, recording the AUC and the ROC curve on the train and test part
    of every fold. In this case Y is a 1D array.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of labels
    :type Y: :class:`ndarray`

    :param folds: The number of cross validation folds
    :type folds: :class:`int`

    :param leafRank: The leaf rank object passed to setLeafRank before learning

    :return: A tuple (bestParams, allMetrics, bestMetaDicts). bestParams is an empty list here, allMetrics is [train AUCs, train ROCs, test AUCs, test ROCs] and bestMetaDicts holds one empty dict per fold.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    cvSplits = cross_val.StratifiedKFold(Y, folds)
    self.setLeafRank(leafRank)

    # No parameters are selected in this method, so this list stays empty
    bestParams = []
    bestMetaDicts = []
    bestTrainROCs = []
    bestTestROCs = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTestAUCs = numpy.zeros(folds)

    for foldInd, (trainInds, testInds) in enumerate(cvSplits):
        Util.printIteration(foldInd, 1, folds)
        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        trainCounts = numpy.bincount(trainY)
        logging.debug("Distribution of labels in train: " + str(trainCounts))
        testCounts = numpy.bincount(testY)
        logging.debug("Distribution of labels in test: " + str(testCounts))

        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)
        bestTrainAUCs[foldInd] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[foldInd] = Evaluator.auc(predTestY, testY)

        #Store the ROC curves and a (here empty) meta dictionary for the fold
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))
        bestMetaDicts.append({})

    meanTestAUC = numpy.mean(bestTestAUCs)
    stdTestAUC = numpy.std(bestTestAUCs)
    logging.debug("Mean test AUC = " + str(meanTestAUC))
    logging.debug("Std test AUC = " + str(stdTestAUC))

    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]
    return (bestParams, allMetrics, bestMetaDicts)
def testPredict2(self):
    """
    Train TreeRank on the Gauss2D learning set for several maximum depths and
    compare the train and test AUCs with hard-coded expected values.
    """
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    #Train and test examples are each passed through their own Standardiser
    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([
        0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508
    ])
    testAucs = numpy.array([
        0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400
    ])

    #The results are approximately the same, but not exactly.
    #zip stops at the shortest sequence, so the expected values beyond
    #len(maxDepths) are not checked (as in the index-based version).
    for maxDepth, trainAuc, testAuc in zip(maxDepths, trainAucs, testAucs):
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)
        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        #assertAlmostEqual replaces the deprecated assertAlmostEquals alias
        self.assertAlmostEqual(Evaluator.auc(trainScores, y), trainAuc, places=2)
        self.assertAlmostEqual(Evaluator.auc(testScores, testY), testAuc, places=1)
def testAuc(self):
    """
    Check that the AUC computed from the predicted scores agrees, to 4 decimal
    places, with the AUC obtained from the ROC curve given by predictROC.
    """
    self.treeRank.learnModel(self.X, self.Y)
    scores = self.treeRank.predictScores(self.X)

    # Evaluator.auc is given the labels flattened to 1D
    auc1 = Evaluator.auc(scores, self.Y.ravel())
    auc2 = self.treeRank.aucFromROC(self.treeRank.predictROC(self.X, self.Y))

    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias
    self.assertAlmostEqual(auc1, auc2, places=4)
def testPredict2(self):
    """
    Train TreeRankForest on the Gauss2D learning set for several maximum
    depths and compare the train and test AUCs with hard-coded expected values.
    """
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]
    #Rescale labels; presumably they are in {0, 1} and become {-1, +1} - TODO confirm
    y = y*2 - 1

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]
    testY = testY*2 - 1

    #Train and test examples are each passed through their own Standardiser
    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    numTrees = 5
    minSplit = 50
    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
    testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

    #The results are approximately the same, but not exactly.
    #zip stops at the shortest sequence, so the expected values beyond
    #len(maxDepths) are not checked (as in the index-based version).
    for maxDepth, expectedTrainAuc, expectedTestAuc in zip(maxDepths, trainAucs, testAucs):
        treeRankForest = TreeRankForest(self.leafRanklearner)
        treeRankForest.setMaxDepth(maxDepth)
        treeRankForest.setMinSplit(minSplit)
        treeRankForest.setNumTrees(numTrees)
        treeRankForest.learnModel(X, y)
        trainScores = treeRankForest.predict(X)
        testScores = treeRankForest.predict(testX)

        #Compute each AUC once and reuse it for the printout and the assertion
        trainAuc = Evaluator.auc(trainScores, y)
        testAuc = Evaluator.auc(testScores, testY)
        print(trainAuc, testAuc)

        #assertAlmostEqual replaces the deprecated assertAlmostEquals alias
        self.assertAlmostEqual(trainAuc, expectedTrainAuc, places=1)
        self.assertAlmostEqual(testAuc, expectedTestAuc, places=1)
def testPredict(self):
    """
    Check that the metric method of a learner generated by SVMLeafRank returns
    the same value as Evaluator.auc on the training predictions.
    """
    generator = SVMLeafRank(self.paramDict, self.folds)
    learner = generator.generateLearner(self.X, self.y)

    predY = learner.predict(self.X)

    #Seems to work
    auc = learner.getMetricMethod()(predY, self.y)
    auc2 = Evaluator.auc(predY, self.y)

    #assertEqual replaces the deprecated assertEquals alias
    self.assertEqual(auc, auc2)
def learnModelCut(self, X, Y, folds=4):
    """
    Learn a tree and then cut it at a depth chosen by cross validation. For
    every fold a tree is learnt on the training part and cut at each candidate
    depth, and the AUC on the held-out part is averaged over the folds. The
    candidate depths are 1, ..., maxDepth-1 and the one with the largest mean
    AUC is used to cut a final tree learnt on all of the examples.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of binary labels as a 1D array
    :type Y: :class:`ndarray`

    :param folds: The number of cross validation folds.
    :type folds: :class:`int`
    """
    cvSplits = cross_val.StratifiedKFold(Y, folds)
    depths = numpy.arange(1, self.maxDepth)
    meanAUCs = numpy.zeros(depths.shape[0])

    for trainInds, testInds in cvSplits:
        trainX = X[trainInds, :]
        trainY = Y[trainInds]
        testX = X[testInds, :]
        testY = Y[testInds]

        self.learnModel(trainX, trainY)
        # Keep the uncut tree since self.tree is overwritten by each cut below
        fullTree = self.tree

        # NOTE(review): depths[depthInd] assumes fullTree.depth() is at most
        # len(depths); depths the tree does not reach add nothing to meanAUCs
        # for this fold - confirm this is intended
        for depthInd in range(fullTree.depth()):
            cutDepth = depths[depthInd]
            self.tree = TreeRank.cut(fullTree, cutDepth)
            foldScores = self.predict(testX)
            foldAUC = Evaluator.auc(foldScores, testY)
            meanAUCs[depthInd] += foldAUC/float(folds)

    bestDepth = depths[numpy.argmax(meanAUCs)]

    # Learn on all examples and cut the resulting tree at the selected depth
    self.learnModel(X, Y)
    self.tree = TreeRank.cut(self.tree, bestDepth)
def testPredict(self):
    """
    Check that RankBoost predictions on the training examples give an AUC
    inside the closed interval [0, 1].
    """
    rankBoost = RankBoost()
    rankBoost.learnModel(self.X, self.y)
    predY = rankBoost.predict(self.X)

    # Evaluate the AUC once and reuse it for both bounds
    auc = Evaluator.auc(predY, self.y)
    self.assertTrue(0.0 <= auc <= 1.0, "AUC out of range: " + str(auc))
def evaluateCvOuter(self, X, Y, folds, leafRank, innerFolds=3):
    """
    Run model selection and output some ROC curves. In this case Y is a 1D array.

    For each outer fold, an inner stratified cross validation over the training
    part scores every (maxDepth, varSplit, nfcv, minSplit) combination by its
    mean AUC. The best combination is then used to learn a model on the whole
    training part, which is evaluated on the train and test parts of the fold.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of labels
    :type Y: :class:`ndarray`

    :param folds: The number of outer cross validation folds
    :type folds: :class:`int`

    :param leafRank: The leaf rank passed to setLeafRank; it is compared with getTreeRankLib().LRforest to choose the varSplit grid

    :param innerFolds: The number of inner cross validation folds used for model selection
    :type innerFolds: :class:`int`

    :return: A tuple (bestParams, allMetrics, bestMetaDicts). bestParams holds the selected (maxDepth, varSplit, nfcv, minSplit) per fold, allMetrics is [train AUCs, train ROCs, test AUCs, test ROCs] and bestMetaDicts holds a dict with keys "size" and "depth" per fold.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    indexList = cross_val.StratifiedKFold(Y, folds)
    #Candidate depths in decreasing order: 11, 10, ..., 1
    maxDepths = numpy.flipud(numpy.arange(1, 12, 1))
    if leafRank == self.getTreeRankLib().LRforest:
        #Approximately 0.6, 0.8 and 1.0
        varSplits = numpy.arange(0.6, 1.01, 0.2)
    else:
        varSplits = numpy.array([1])
    #According to Nicolas nfcv>1 doesn't help
    nfcvs = [1]
    #This is tied in with depth
    #NOTE(review): mincrit is not used anywhere else in this method
    mincrit = 0.00
    #If minsplit is too low sometimes get a node with no positive labels
    minSplits = numpy.array([50])

    self.setLeafRank(leafRank)

    bestParams = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTrainROCs = []
    bestTestAUCs = numpy.zeros(folds)
    bestTestROCs = []
    bestMetaDicts = []
    i = 0

    for trainInds, testInds in indexList:
        trainX, trainY = X[trainInds, :], Y[trainInds]
        testX, testY = X[testInds, :], Y[testInds]

        #Mean inner AUC and the matching parameter tuple, kept in the same order
        meanParamAUCs = []
        paramList = []

        logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
        logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

        for varSplit in varSplits:
            for nfcv in nfcvs:
                for minSplit in minSplits:
                    #Learn with the largest candidate depth; smaller depths are
                    #obtained below from the learnt tree
                    self.setMaxDepth(maxDepths[0])
                    self.setVarSplit(varSplit)
                    self.setNfcv(nfcv)
                    self.setMinSplit(minSplit)
                    logging.debug(self)
                    idx = cross_val.StratifiedKFold(trainY, innerFolds)

                    j = 0
                    #metrics[j, k] is the AUC of inner fold j at depth maxDepths[k]
                    metrics = numpy.zeros((len(idx), maxDepths.shape[0]))

                    for idxtr, idxts in idx:
                        Util.printIteration(j, 1, innerFolds)

                        innerTrainX, innerTestX = trainX[idxtr, :], trainX[idxts, :]
                        innerTrainY, innerTestY = trainY[idxtr], trainY[idxts]

                        self.learnModel(innerTrainX, innerTrainY)

                        for k in range(maxDepths.shape[0]):
                            maxDepth = maxDepths[k]

                            #Select, on the R side, the nodes whose depth is at
                            #least maxDepth and hand them to subTreeRank.
                            #Presumably this prunes the tree to maxDepth; since
                            #self.tree is overwritten and the depths decrease,
                            #each step works on the previous result - TODO confirm
                            robjects.globalenv["maxDepth"] = maxDepth
                            robjects.globalenv["tree"] = self.tree
                            nodeList = robjects.r('tree$nodes[tree$depth>=maxDepth]')
                            self.tree = self.treeRankLib.subTreeRank(self.tree, nodeList)

                            predY = self.predict(innerTestX)
                            gc.collect()

                            metrics[j, k] = Evaluator.auc(predY, innerTestY)

                        j += 1

                    #Average over the inner folds, giving one value per depth
                    meanAUC = numpy.mean(metrics, 0)
                    varAUC = numpy.var(metrics, 0)

                    logging.warn(self.baseLib.warnings())
                    logging.debug("Mean AUCs and variances at each depth " + str((meanAUC, varAUC)))

                    for k in range(maxDepths.shape[0]):
                        maxDepth = maxDepths[k]
                        meanParamAUCs.append(meanAUC[k])
                        paramList.append((maxDepth, varSplit, nfcv, minSplit))

                    #Try to get some memory back
                    gc.collect()
                    robjects.r('gc(verbose=TRUE)')
                    robjects.r('memory.profile()')

                    #print(self.hp.heap())

        #Now choose best params
        bestInd = numpy.argmax(numpy.array(meanParamAUCs))

        self.setMaxDepth(paramList[bestInd][0])
        self.setVarSplit(paramList[bestInd][1])
        self.setNfcv(paramList[bestInd][2])
        self.setMinSplit(paramList[bestInd][3])

        #Learn with the selected parameters on the whole training part of the fold
        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)

        bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

        #Store the parameters and ROC curves
        bestParams.append(paramList[bestInd])
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))

        metaDict = {}
        metaDict["size"] = self.getTreeSize()
        metaDict["depth"] = self.getTreeDepth()
        bestMetaDicts.append(metaDict)

        i += 1

    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

    return (bestParams, allMetrics, bestMetaDicts)