treeLeaveSizes = numpy.zeros(numParams)

for j in range(numRealisations):
    print("")
    logging.debug("j=" + str(j))

    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)
    logging.debug("Loaded dataset with " + str(trainX.shape) + " train and " + str(testX.shape) + " test examples")

    # Subsample the training set for this realisation
    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
    trainX = trainX[trainInds, :]
    trainY = trainY[trainInds]
    #logging.debug("Training set size: " + str(trainX.shape))

    # Method 0: standard cross-validation model selection
    methodInd = 0
    idx = sampleMethod(folds, trainX.shape[0])
    bestLearner, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)
    predY = bestLearner.predict(testX)

    meanCvGrid[methodInd, :] += cvGrid
    meanErrors[methodInd] += bestLearner.getMetricMethod()(testY, predY)
    meanDepths[methodInd] += bestLearner.tree.depth()
    meanSizes[methodInd] += bestLearner.tree.getNumVertices()

    Cvs = [-5, (folds-1)*alpha, beta[j, sampleSizeInd, :]]

    # Method 1: now try penalisation
    methodInd = 1
    resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
    bestLearner, trainErrors, currentPenalties = resultsList[1]

    meanCvGrid[methodInd, :] += trainErrors + currentPenalties
    meanPenalties += currentPenalties
    meanTrainError += trainErrors
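# --------------------------------------------------------------------------
# Hedged illustration, not from the source: the penalisation branch above
# scores each parameter setting by its training error plus a penalty term
# (trainErrors + currentPenalties) instead of by V-fold cross-validation
# error. A minimal, self-contained sketch of that selection rule, using
# made-up illustrative numbers rather than values from the experiment:
import numpy

exampleTrainErrors = numpy.array([0.40, 0.25, 0.18, 0.17, 0.21])  # hypothetical, one per parameter setting
examplePenalties = numpy.array([0.01, 0.03, 0.06, 0.10, 0.16])    # hypothetical, growing with tree size

penalisedScores = exampleTrainErrors + examplePenalties
bestParamInd = numpy.argmin(penalisedScores)
print(bestParamInd, penalisedScores[bestParamInd])
# --------------------------------------------------------------------------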
def testModelSelect(self):
    """
    We test the results on some data and compare to SVR.
    """
    numExamples = 200
    X, y = data.make_regression(numExamples, noise=0.5)

    X = Standardiser().standardiseArray(X)
    y = Standardiser().standardiseArray(y)

    trainX = X[0:100, :]
    trainY = y[0:100]
    testX = X[100:, :]
    testY = y[100:]

    # Decision tree with reduced error pruning via cross-validation (REP-CV)
    learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
    learner.setPruneCV(8)

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10)
    paramDict["setPruneCV"] = numpy.arange(6, 11, 2, dtype=numpy.int_)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)
    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    # Decision tree with CART pruning
    learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")
    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)
    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    # The SVM comparison below is currently skipped by this early return
    return

    # Let's compare to the SVM
    learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR")

    paramDict = {}
    paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float64)
    paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float64)
    paramDict["setEpsilon"] = learner2.getEpsilons()

    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)
    predY = bestSVM.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)
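# --------------------------------------------------------------------------
# Hedged, self-contained sketch, not from the source: the same pattern of
# cross-validated model selection for a decision tree versus an epsilon-SVR,
# written with scikit-learn only. The estimators, parameter grids, and
# preprocessing below are illustrative assumptions, not the APIs used in the
# test above (DecisionTreeLearner, LibSVM, Sampling, Evaluator).
import numpy
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=200, noise=0.5)
X = StandardScaler().fit_transform(X)
y = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()
trainX, trainY, testX, testY = X[:100], y[:100], X[100:], y[100:]

# Decision tree: select depth and cost-complexity pruning strength by 5-fold CV
treeGrid = GridSearchCV(DecisionTreeRegressor(),
                        {"max_depth": [5, 10, 20],
                         "ccp_alpha": numpy.linspace(0.0, 0.1, 5)},
                        cv=5)
treeGrid.fit(trainX, trainY)
print(numpy.sqrt(mean_squared_error(testY, treeGrid.predict(testX))))

# Epsilon-SVR with an RBF kernel: select C and gamma by 5-fold CV
svrGrid = GridSearchCV(SVR(kernel="rbf"),
                       {"C": 2.0**numpy.arange(-10, 14, 2, dtype=float),
                        "gamma": 2.0**numpy.arange(-10, 4, 2, dtype=float)},
                       cv=5)
svrGrid.fit(trainX, trainY)
print(numpy.sqrt(mean_squared_error(testY, svrGrid.predict(testX))))
# --------------------------------------------------------------------------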