예제 #1
0
 treeLeaveSizes = numpy.zeros(numParams)
 
 for j in range(numRealisations):
     print("")
     logging.debug("j=" + str(j))
     trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)
     logging.debug("Loaded dataset with " + str(trainX.shape) +  " train and " + str(testX.shape) + " test examples")
     
     trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
     trainX = trainX[trainInds,:]
     trainY = trainY[trainInds]
     
     #logging.debug("Training set size: " + str(trainX.shape))
     methodInd = 0 
     idx = sampleMethod(folds, trainX.shape[0])
     bestLearner, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)
     predY = bestLearner.predict(testX)
     meanCvGrid[methodInd, :] += cvGrid     
     meanErrors[methodInd] += bestLearner.getMetricMethod()(testY, predY)
     meanDepths[methodInd] += bestLearner.tree.depth()
     meanSizes[methodInd] += bestLearner.tree.getNumVertices()
 
     Cvs = [-5, (folds-1)*alpha, beta[j, sampleSizeInd, :]]    
 
     #Now try penalisation
     methodInd = 1
     resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
     bestLearner, trainErrors, currentPenalties = resultsList[1]
     meanCvGrid[methodInd, :] += trainErrors + currentPenalties
     meanPenalties += currentPenalties
     meanTrainError += trainErrors
    def testModelSelect(self):
        """
        Run model selection for two pruned decision trees and print test RMSE.

        A 200-example regression problem is generated, standardised, and split
        into the first 100 examples for training and the remainder for testing.
        ``parallelModelSelect`` is then run with 5-fold cross validation for a
        "REP-CV" pruned tree and for a "CART" pruned tree, and the root mean
        squared error of each selected tree on the test half is printed.

        No assertions are made; this only exercises the model selection path
        end to end. The comparison against an epsilon-SVR at the bottom is
        currently disabled by an early return.
        """
        numExamples = 200
        folds = 5
        X, y = data.make_regression(numExamples, noise=0.5)

        X = Standardiser().standardiseArray(X)
        y = Standardiser().standardiseArray(y)

        # First half is used for training, second half for testing.
        trainX = X[0:100, :]
        trainY = y[0:100]
        testX = X[100:, :]
        testY = y[100:]

        # --- Tree pruned with "REP-CV" ---
        learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
        learner.setPruneCV(8)

        # The numpy.int alias was removed in NumPy 1.24; it was the builtin int.
        paramDict = {}
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10)
        paramDict["setPruneCV"] = numpy.arange(6, 11, 2, dtype=int)

        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)

        # --- Tree pruned with "CART", searched over a finer gamma grid ---
        learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")

        paramDict = {}
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50)

        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)

        # NOTE(review): the SVM comparison below is deliberately skipped by this
        # early return (reason not recorded here); remove the return to re-enable.
        return
        #Let's compare to the SVM
        learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR")

        # The numpy.float alias was removed in NumPy 1.24; it was the builtin float.
        paramDict = {}
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=float)
        paramDict["setEpsilon"] = learner2.getEpsilons()

        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestSVM.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)