def testParallelPenaltyGridRbf(self):
    """Check parallelPenaltyGridRbf against a sequentially computed penalty grid.

    The penalty for each (C, gamma) cell is the gap between full-set and
    training-set binary error; the parallel result is compared (transposed)
    against the sequential grid.
    """
    svm = self.svm
    svm.setKernel("gaussian")
    trainX = self.X[0:40, :]
    trainY = self.y[0:40]
    idealPenalties = svm.parallelPenaltyGridRbf(trainX, trainY, self.X, self.y)

    gridShape = (svm.Cs.shape[0], svm.gammas.shape[0])
    sequentialPenalties = numpy.zeros(gridShape)
    functionPenalties = numpy.zeros(gridShape)

    for i, C in enumerate(svm.Cs):
        for j, gamma in enumerate(svm.gammas):
            svm.setGamma(gamma)
            svm.setC(C)
            svm.learnModel(trainX, trainY)
            predY = svm.predict(self.X)
            predTrainY = svm.predict(trainX)
            sequentialPenalties[i, j] = (Evaluator.binaryError(predY, self.y)
                                         - Evaluator.binaryError(predTrainY, trainY))

            args = (trainX, trainY, self.X, self.y, svm)
            functionPenalties[i, j] = computeIdealPenalty(args)

    tol = 10**-6
    self.assertTrue(numpy.linalg.norm(sequentialPenalties.T - idealPenalties) < tol)
def testParallelPenaltyGrid(self):
    """Check parallelPenaltyGrid against a sequentially computed penalty grid.

    The parameter grid is passed via a dict of setter names; the penalty per
    (C, gamma) cell is the gap between full-set and training-set binary error.
    """
    svm = self.svm
    svm.setKernel("gaussian")
    trainX = self.X[0:40, :]
    trainY = self.y[0:40]

    paramDict = {"setC": svm.getCs(), "setGamma": svm.getGammas()}
    idealPenalties = svm.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)

    gridShape = (svm.Cs.shape[0], svm.gammas.shape[0])
    sequentialPenalties = numpy.zeros(gridShape)
    functionPenalties = numpy.zeros(gridShape)

    for i, C in enumerate(svm.Cs):
        for j, gamma in enumerate(svm.gammas):
            svm.setGamma(gamma)
            svm.setC(C)
            svm.learnModel(trainX, trainY)
            predY = svm.predict(self.X)
            predTrainY = svm.predict(trainX)
            sequentialPenalties[i, j] = (Evaluator.binaryError(predY, self.y)
                                         - Evaluator.binaryError(predTrainY, trainY))

            args = (trainX, trainY, self.X, self.y, svm)
            functionPenalties[i, j] = computeIdealPenalty(args)

    tol = 10**-6
    self.assertTrue(numpy.linalg.norm(sequentialPenalties.T - idealPenalties) < tol)
def testParallelPen(self):
    """Verify parallelPen against a sequentially computed penalised error grid.

    For each (C, gamma) the penalty is the cross-validated gap between test
    and train binary error, scaled by Cv[0]/folds; the mean error is the
    resubstitution error plus that penalty.  The best (C, gamma) found
    sequentially must match the model returned by parallelPen.
    """
    folds = 3
    Cv = numpy.array([4.0])
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")
    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()
    resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

    tol = 10**-6
    bestError = 1
    trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            penalty = 0
            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]
                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(self.X)
                predTrainY = svm.predict(trainX)
                penalty += Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

            penalty = penalty * Cv[0] / len(idx)
            svm.learnModel(self.X, self.y)
            predY = svm.predict(self.X)
            trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
            penalties2[i, j] = penalty
            meanErrors2[i, j] = Evaluator.binaryError(predY, self.y) + penalty

            if meanErrors2[i, j] < bestError:
                bestC = C
                bestGamma = gamma
                bestError = meanErrors2[i, j]

    bestSVM, trainErrors, currentPenalties = resultsList[0]
    meanErrors = trainErrors + currentPenalties

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(bestC, bestSVM.getC())
    self.assertEqual(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
    self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
    self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
def testParallelPen(self):
    """Verify parallelPen against a sequentially computed penalised error grid.

    For each (C, gamma) the penalty is the cross-validated gap between test
    and train binary error, scaled by Cv[0]/folds; the mean error is the
    resubstitution error plus that penalty.  The best (C, gamma) found
    sequentially must match the model returned by parallelPen.
    """
    folds = 3
    Cv = numpy.array([4.0])
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")
    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()
    resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

    tol = 10**-6
    bestError = 1
    trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            penalty = 0
            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]
                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(self.X)
                predTrainY = svm.predict(trainX)
                penalty += Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

            penalty = penalty * Cv[0] / len(idx)
            svm.learnModel(self.X, self.y)
            predY = svm.predict(self.X)
            trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
            penalties2[i, j] = penalty
            meanErrors2[i, j] = Evaluator.binaryError(predY, self.y) + penalty

            if meanErrors2[i, j] < bestError:
                bestC = C
                bestGamma = gamma
                bestError = meanErrors2[i, j]

    bestSVM, trainErrors, currentPenalties = resultsList[0]
    meanErrors = trainErrors + currentPenalties

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(bestC, bestSVM.getC())
    self.assertEqual(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
    self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
    self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
def testLearnModel2(self):
    """The tree objective should not worsen as the sample size grows, and
    (with gamma = 0) the empirical error should not worsen as depth grows."""
    minSplit = 20
    maxDepth = 3
    gamma = 0.01
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    def objectiveForSampleSize(sampleSize):
        # Re-seed so every run sees an identical random stream
        numpy.random.seed(21)
        learner.setSampleSize(sampleSize)
        learner.learnModel(self.X, self.y)
        return learner.treeObjective(self.X, self.y)

    #Vary sampleSize
    error1 = objectiveForSampleSize(1)
    error2 = objectiveForSampleSize(5)
    error3 = objectiveForSampleSize(10)

    self.assertTrue(error1 >= error2)
    self.assertTrue(error2 >= error3)

    #Now vary max depth with no complexity penalty
    learner.gamma = 0
    learner.setSampleSize(1)
    learner.minSplit = 1

    def errorForMaxDepth(depth):
        numpy.random.seed(21)
        learner.maxDepth = depth
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        return Evaluator.binaryError(self.y, predY)

    error1 = errorForMaxDepth(3)
    error2 = errorForMaxDepth(5)
    error3 = errorForMaxDepth(10)

    self.assertTrue(error1 >= error2)
    self.assertTrue(error2 >= error3)
def testClassify(self):
    """Binary error should be invariant under a consistent permutation of
    the examples and their labels."""
    try:
        import sklearn
    except ImportError:
        # sklearn backend unavailable; nothing to test
        return

    self.svm.learnModel(self.X, self.y)
    predY = self.svm.classify(self.X)
    y = self.y
    e = Evaluator.binaryError(y, predY)

    #Now, permute examples
    perm = numpy.random.permutation(self.X.shape[0])
    predY = self.svm.classify(self.X[perm, :])
    y = y[perm]
    e2 = Evaluator.binaryError(y, predY)

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(e, e2)
def testBayesError(self):
    """Grid-search an RBF SVM, comparing its Bayes error (computed on a
    density grid) with its empirical test error, then plot the decision
    surface of the best model.

    Bug fixes vs the original:
    - ``error = bestError`` was inverted, so bestError never updated and
      bestC/bestGamma tracked every iteration; now ``bestError = error``.
    - The plot used the decisionGrid left over from the last grid-search
      iteration; it is now recomputed for the best model.
    - The scatter legend labels were swapped relative to the class masks.
    - ``numpy.float`` is a removed alias of the builtin float.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    Cs = 2**numpy.arange(-5, 5, dtype=float)
    gammas = 2**numpy.arange(-5, 5, dtype=float)

    bestError = 1

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)
            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            if error < bestError:
                bestError = error
                bestC = C
                bestGamma = gamma

    # Retrain the best model and recompute its decision grid for plotting
    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
def computeIdealPenalty(args):
    """
    Find the complete penalty: the Bayes (true) error of the learnt SVM on
    the density grid minus its training error.
    """
    (X, y, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X) = args

    model = LibSVM('gaussian', gamma, C)
    model.learnModel(X, y)
    trainPredY = model.predict(X)
    _, decisionsY = model.predict(fullX, True)

    numPoints = gridPoints.shape[0]
    decisionGrid = numpy.reshape(decisionsY, (numPoints, numPoints), order="F")
    trueError = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

    return trueError - Evaluator.binaryError(trainPredY, y)
def testComputeTestError(self):
    """computeTestError should equal the error of an SVM trained and
    evaluated directly on the same train/test split."""
    C = 10.0
    gamma = 0.5

    # Slice indices must be integers in Python 3 (a float here raises TypeError)
    numTrainExamples = int(self.X.shape[0] * 0.5)
    trainX, trainY = self.X[0:numTrainExamples, :], self.y[0:numTrainExamples]
    testX, testY = self.X[numTrainExamples:, :], self.y[numTrainExamples:]

    svm = LibSVM('gaussian', gamma, C)
    args = (trainX, trainY, testX, testY, svm)
    error = computeTestError(args)

    svm = LibSVM('gaussian', gamma, C)
    svm.learnModel(trainX, trainY)
    predY = svm.predict(testX)

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(Evaluator.binaryError(predY, testY), error)
def testParallelModelSelect(self):
    """parallelModelSelect should return the same best parameters and mean
    cross-validation error grid as an explicit sequential grid search."""
    folds = 3
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")
    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()
    bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx, paramDict)

    tol = 10**-6
    bestError = 1
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    print("Computing real grid")

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            error = 0
            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]
                testX = self.X[testInds, :]
                testY = self.y[testInds]
                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(testX)
                error += Evaluator.binaryError(predY, testY)

            meanErrors2[i, j] = error / len(idx)

            if error < bestError:
                bestC = C
                bestGamma = gamma
                bestError = error

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(bestC, bestSVM.getC())
    self.assertEqual(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
def testComputeTestError(self):
    """computeTestError should equal the error of an SVM trained and
    evaluated directly on the same train/test split."""
    C = 10.0
    gamma = 0.5

    # Slice indices must be integers in Python 3 (a float here raises TypeError)
    numTrainExamples = int(self.X.shape[0] * 0.5)
    trainX, trainY = self.X[0:numTrainExamples, :], self.y[0:numTrainExamples]
    testX, testY = self.X[numTrainExamples:, :], self.y[numTrainExamples:]

    svm = LibSVM('gaussian', gamma, C)
    args = (trainX, trainY, testX, testY, svm)
    error = computeTestError(args)

    svm = LibSVM('gaussian', gamma, C)
    svm.learnModel(trainX, trainY)
    predY = svm.predict(testX)

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(Evaluator.binaryError(predY, testY), error)
def testSetSvmType(self):
    """Switching svmType between Epsilon_SVR and C_SVC should support both
    regression and classification on the same data."""
    try:
        import sklearn
    except ImportError:
        # sklearn backend unavailable; nothing to test
        return

    numExamples = 100
    numFeatures = 10
    X = numpy.random.randn(numExamples, numFeatures)
    X = Standardiser().standardiseArray(X)
    c = numpy.random.randn(numFeatures)
    y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
    y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

    svm = LibSVM()
    svm.setSvmType("Epsilon_SVR")
    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(svm.getType(), "Epsilon_SVR")

    #Try to get a good error
    # numpy.float is a removed alias of the builtin float
    Cs = 2**numpy.arange(-6, 4, dtype=float)
    epsilons = 2**numpy.arange(-6, 4, dtype=float)

    bestError = 10
    for C in Cs:
        for epsilon in epsilons:
            svm.setEpsilon(epsilon)
            svm.setC(C)
            svm.learnModel(X, y)
            yp = svm.predict(X)
            if Evaluator.rootMeanSqError(y, yp) < bestError:
                bestError = Evaluator.rootMeanSqError(y, yp)

    # Regression should beat the trivial all-zeros predictor
    self.assertTrue(bestError < Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0])))

    svm.setSvmType("C_SVC")
    svm.learnModel(X, y2)
    yp2 = svm.predict(X)
    self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
def testParallelModelSelect(self):
    """parallelModelSelect should return the same best parameters and mean
    cross-validation error grid as an explicit sequential grid search."""
    folds = 3
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")
    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()
    bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx, paramDict)

    tol = 10**-6
    bestError = 1
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    print("Computing real grid")

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            error = 0
            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]
                testX = self.X[testInds, :]
                testY = self.y[testInds]
                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(testX)
                error += Evaluator.binaryError(predY, testY)

            meanErrors2[i, j] = error / len(idx)

            if error < bestError:
                bestC = C
                bestGamma = gamma
                bestError = error

    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(bestC, bestSVM.getC())
    self.assertEqual(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
def testSetSvmType(self):
    """Switching svmType between Epsilon_SVR and C_SVC should support both
    regression and classification on the same data."""
    try:
        import sklearn
    except ImportError:
        # sklearn backend unavailable; nothing to test
        return

    numExamples = 100
    numFeatures = 10
    X = numpy.random.randn(numExamples, numFeatures)
    X = Standardiser().standardiseArray(X)
    c = numpy.random.randn(numFeatures)
    y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
    y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

    svm = LibSVM()
    svm.setSvmType("Epsilon_SVR")
    # assertEquals is a deprecated alias removed in Python 3.12; use assertEqual
    self.assertEqual(svm.getType(), "Epsilon_SVR")

    #Try to get a good error
    # numpy.float is a removed alias of the builtin float
    Cs = 2**numpy.arange(-6, 4, dtype=float)
    epsilons = 2**numpy.arange(-6, 4, dtype=float)

    bestError = 10
    for C in Cs:
        for epsilon in epsilons:
            svm.setEpsilon(epsilon)
            svm.setC(C)
            svm.learnModel(X, y)
            yp = svm.predict(X)
            if Evaluator.rootMeanSqError(y, yp) < bestError:
                bestError = Evaluator.rootMeanSqError(y, yp)

    # Regression should beat the trivial all-zeros predictor
    self.assertTrue(bestError < Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0])))

    svm.setSvmType("C_SVC")
    svm.learnModel(X, y2)
    yp2 = svm.predict(X)
    self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
def testLearnModel(self):
    """learnModel should fit a classifier, reject invalid labels, and accept
    real-valued targets in regression (Epsilon_SVR) mode.

    Bug fix: the regression branch built real-valued targets y but then
    trained on the class labels self.y; it now trains on y.
    """
    try:
        import sklearn
    except ImportError:
        # sklearn backend unavailable; nothing to test
        return

    self.svm.learnModel(self.X, self.y)
    predY = self.svm.classify(self.X)
    e = Evaluator.binaryError(self.y, predY)

    #Test for wrong labels: binary classification requires labels in {-1, +1}
    # numpy.float is a removed alias of the builtin float
    X = numpy.array([[-3], [-2], [-1], [1], [2], [3]], float)
    y = numpy.array([[-1], [-1], [-1], [1], [1], [5]])
    self.assertRaises(ValueError, self.svm.learnModel, X, y)

    #Try the regression SVM with real-valued targets
    svm = LibSVM(type="Epsilon_SVR")
    y = numpy.random.rand(self.X.shape[0])
    svm.learnModel(self.X, y)
def testLearnModel(self):
    """learnModel should fit a classifier, reject invalid labels, and accept
    real-valued targets in regression (Epsilon_SVR) mode.

    Bug fix: the regression branch built real-valued targets y but then
    trained on the class labels self.y; it now trains on y.
    """
    try:
        import sklearn
    except ImportError:
        # sklearn backend unavailable; nothing to test
        return

    self.svm.learnModel(self.X, self.y)
    predY = self.svm.classify(self.X)
    e = Evaluator.binaryError(self.y, predY)

    #Test for wrong labels: binary classification requires labels in {-1, +1}
    # numpy.float is a removed alias of the builtin float
    X = numpy.array([[-3], [-2], [-1], [1], [2], [3]], float)
    y = numpy.array([[-1], [-1], [-1], [1], [1], [5]])
    self.assertRaises(ValueError, self.svm.learnModel, X, y)

    #Try the regression SVM with real-valued targets
    svm = LibSVM(type="Epsilon_SVR")
    y = numpy.random.rand(self.X.shape[0])
    svm.learnModel(self.X, y)
# Fragment: splits (X, y) defined earlier into train/test sets, fits an
# unpruned PenaltyDecisionTree, reports training error, and plots both sets.
print(numpy.sum(y==2), numpy.sum(y==0))

trainSplit = 0.3
# Slice indices must be integers in Python 3 (a float here raises TypeError)
numTrainExamples = int(numExamples*trainSplit)

trainX = X[0:numTrainExamples, :]
trainY = y[0:numTrainExamples]
testX = X[numTrainExamples:, :]
testY = y[numTrainExamples:]

learner = PenaltyDecisionTree(minSplit=1, maxDepth=50, pruning=False)
learner.learnModel(trainX, trainY)
predY = learner.predict(trainX)
print(Evaluator.binaryError(predY, trainY))
print(learner.getTree())

plt.figure(0)
plt.scatter(testX[:, 0], testX[:, 1], c=testY, s=50, vmin=0, vmax=2)
plt.title("Test set")
plt.colorbar()

plt.figure(1)
plt.scatter(trainX[:, 0], trainX[:, 1], c=trainY, s=50, vmin=0, vmax=2)
plt.title("Training set")
plt.colorbar()

colormap = matplotlib.cm.get_cmap()
def testGrowTree(self):
    """Repeatedly regrown trees must respect maxDepth/minSplit, and regrowing
    only the (0, 1) subtree of the best tree must not worsen the best error.

    Bug fix: ``numpy.int`` is a removed alias of the builtin int (NumPy 1.24);
    the argsort buffer now uses ``int`` directly.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 3
    gamma = 0.01
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    trainX = self.X[100:, :]
    trainY = self.y[100:]
    testX = self.X[0:100, :]
    testY = self.y[0:100]

    # Double argsort yields the rank of each element within its column
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)

    #Note that this matches with the case where we create a new tree each time
    numpy.random.seed(21)
    bestError = float("inf")

    for i in range(20):
        learner.tree.pruneVertex(startId)
        learner.growTree(trainX, trainY, argsortX, startId)

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

        self.assertTrue(learner.tree.depth() <= maxDepth)
        for vertexId in learner.tree.nonLeaves():
            self.assertTrue(learner.tree.getVertex(vertexId).getTrainInds().shape[0] >= minSplit)

    bestError1 = bestError
    learner.tree = bestTree

    #Now we test growing a tree from a non-root vertex
    numpy.random.seed(21)
    for i in range(20):
        learner.tree.pruneVertex((0, 1))
        learner.growTree(trainX, trainY, argsortX, (0, 1))

        # The untouched root and (0, 0) subtree must be preserved
        self.assertTrue(learner.tree.getVertex((0, )) == bestTree.getVertex((0, )))
        self.assertTrue(learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

    self.assertTrue(bestError1 >= bestError)
def testGrowTree(self):
    """Repeatedly regrown trees must respect maxDepth/minSplit, and regrowing
    only the (0, 1) subtree of the best tree must not worsen the best error.

    Bug fix: ``numpy.int`` is a removed alias of the builtin int (NumPy 1.24);
    the argsort buffer now uses ``int`` directly.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 3
    gamma = 0.01
    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    trainX = self.X[100:, :]
    trainY = self.y[100:]
    testX = self.X[0:100, :]
    testY = self.y[0:100]

    # Double argsort yields the rank of each element within its column
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)

    #Note that this matches with the case where we create a new tree each time
    numpy.random.seed(21)
    bestError = float("inf")

    for i in range(20):
        learner.tree.pruneVertex(startId)
        learner.growTree(trainX, trainY, argsortX, startId)

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

        self.assertTrue(learner.tree.depth() <= maxDepth)
        for vertexId in learner.tree.nonLeaves():
            self.assertTrue(learner.tree.getVertex(vertexId).getTrainInds().shape[0] >= minSplit)

    bestError1 = bestError
    learner.tree = bestTree

    #Now we test growing a tree from a non-root vertex
    numpy.random.seed(21)
    for i in range(20):
        learner.tree.pruneVertex((0, 1))
        learner.growTree(trainX, trainY, argsortX, (0, 1))

        # The untouched root and (0, 0) subtree must be preserved
        self.assertTrue(learner.tree.getVertex((0, )) == bestTree.getVertex((0, )))
        self.assertTrue(learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

    self.assertTrue(bestError1 >= bestError)