Example #1
    def testCrossValidation(self):
        numExamples = 10
        folds = 2

        indices = Sampling.crossValidation(folds, numExamples)

        self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]))
        self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]))

        indices = Sampling.crossValidation(3, numExamples)

        self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([3, 4, 5, 6, 7, 8, 9], [0, 1, 2]))
        self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 2, 6, 7, 8, 9], [3, 4, 5]))
        self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 2, 3, 4, 5], [6, 7, 8, 9]))

        indices = Sampling.crossValidation(4, numExamples)

        self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([2, 3, 4, 5, 6, 7, 8, 9], [0, 1]))
        self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 5, 6, 7, 8, 9], [2, 3, 4]))
        self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 2, 3, 4, 7, 8, 9], [5, 6]))
        self.assertEquals((list(indices[3][0]), list(indices[3][1])), ([0, 1, 2, 3, 4, 5, 6], [7, 8, 9]))

        indices = Sampling.crossValidation(numExamples, numExamples)
        self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0]))
        self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 2, 3, 4, 5, 6, 7, 8, 9], [1]))
        self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 3, 4, 5, 6, 7, 8, 9], [2]))
        self.assertEquals((list(indices[3][0]), list(indices[3][1])), ([0, 1, 2, 4, 5, 6, 7, 8, 9], [3]))
        self.assertEquals((list(indices[4][0]), list(indices[4][1])), ([0, 1, 2, 3, 5, 6, 7, 8, 9], [4]))

        self.assertRaises(ValueError, Sampling.crossValidation, numExamples+1, numExamples)
        self.assertRaises(ValueError, Sampling.crossValidation, 0, numExamples)
        self.assertRaises(ValueError, Sampling.crossValidation, -1, numExamples)
        self.assertRaises(ValueError, Sampling.crossValidation, folds, 1)
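The index pairs returned by Sampling.crossValidation are consumed in the later examples by slicing the data fold by fold. Below is a minimal, hedged sketch of that pattern; the import path and the trivial mean predictor are assumptions for illustration, not part of the source.

import numpy
from sandbox.util.Sampling import Sampling  # assumed import path

numExamples = 10
folds = 5
X = numpy.random.rand(numExamples, 3)
y = numpy.random.rand(numExamples)

foldErrors = numpy.zeros(folds)
for j, (trainInds, testInds) in enumerate(Sampling.crossValidation(folds, numExamples)):
    # Fit a trivial predictor on the training fold (predict the training mean)
    # and score it on the held-out fold.
    meanY = y[trainInds].mean()
    foldErrors[j] = numpy.mean((y[testInds] - meanY)**2)

print(foldErrors.mean())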
Example #2
    def testSampleUsers(self): 
        m = 10
        n = 15
        r = 5 
        u = 0.3
        w = 1-u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), r, w, csarray=True, verbose=True, indsPerRow=200)

        k = 50
        X2, userInds = Sampling.sampleUsers(X, k)

        nptst.assert_array_equal(X.toarray(), X2.toarray())
        
        numRuns = 50
        for i in range(numRuns): 
            m = numpy.random.randint(10, 100)
            n = numpy.random.randint(10, 100)
            k = numpy.random.randint(10, 100)

            X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), r, w, csarray=True, verbose=True, indsPerRow=200)

            X2, userInds = Sampling.sampleUsers(X, k)
            
            self.assertEquals(X2.shape[0], min(k, m))
            self.assertTrue((X.dot(X.T)!=numpy.zeros((m, m)).all()))
            self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
            self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
Example #3
    def testRepCrossValidation(self): 
        numExamples = 10
        folds = 3
        repetitions = 1

        indices = Sampling.repCrossValidation(folds, numExamples, repetitions)
        
        for i in range(folds):
            self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
        
        repetitions = 2
        indices = Sampling.repCrossValidation(folds, numExamples, repetitions)
        
        for i in range(folds):
            self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
Example #4
    def testParallelVfPenRbf2(self):
        #Test support vector regression
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        resultsList = svm.parallelVfPenRbf(self.X,
                                           self.y,
                                           idx,
                                           Cv,
                                           type="Epsilon_SVR")

        tol = 10**-6
        bestError = 100
        meanErrors2 = numpy.zeros(
            (svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                for k in range(svm.epsilons.shape[0]):
                    epsilon = svm.epsilons[k]

                    penalty = 0
                    for trainInds, testInds in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(self.X)
                        predTrainY = svm.predict(trainX)
                        penalty += svm.getMetricMethod()(
                            predY, self.y) - svm.getMetricMethod()(predTrainY,
                                                                   trainY)

                    penalty = penalty * Cv[0] / len(idx)
                    svm.learnModel(self.X, self.y)
                    predY = svm.predict(self.X)
                    meanErrors2[j, k, i] = svm.getMetricMethod()(
                        predY, self.y) + penalty

                    if meanErrors2[j, k, i] < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestEpsilon = epsilon
                        bestError = meanErrors2[j, k, i]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertEquals(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
Example #5
 def testParallelPen(self): 
     #Check if penalisation == inf when treeSize < gamma 
     numExamples = 100
     X, y = data.make_regression(numExamples) 
     learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)
     
     paramDict = {} 
     paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5)-1), dtype=numpy.int)
     
     folds = 3
     alpha = 1.0
     Cvs = numpy.array([(folds-1)*alpha])
     
     idx = Sampling.crossValidation(folds, X.shape[0])
     
     resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)
     
     learner, trainErrors, currentPenalties = resultsList[0]
     
     learner.setGamma(2**10)
     treeSize = 0
     #Let's work out the size of the unpruned tree 
     for trainInds, testInds in idx: 
         trainX = X[trainInds, :]
         trainY = y[trainInds]
         
         learner.learnModel(trainX, trainY)
         treeSize += learner.tree.size 
     
     treeSize /= float(folds)         
     
     self.assertTrue(numpy.isinf(currentPenalties[paramDict["setGamma"]>treeSize]).all())      
     self.assertTrue(not numpy.isinf(currentPenalties[paramDict["setGamma"]<treeSize]).all())
Example #6
 def profileModelSelect(self):
     lmbdas = numpy.linspace(1.0, 0.01, 5)
     softImpute = IterativeSoftImpute(k=500)
     
     folds = 5
     cvInds = Sampling.randCrossValidation(folds, self.X.nnz)
     ProfileUtils.profile('softImpute.modelSelect(self.X, lmbdas, cvInds)', globals(), locals())
Example #7
    def testAverageRocCurve(self):
        m = 50
        n = 20
        k = 8
        u = 20.0 / m
        w = 1 - u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(
            (m, n), k, w, csarray=True, verbose=True, indsPerRow=200
        )

        fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

        import matplotlib

        matplotlib.use("GTK3Agg")
        import matplotlib.pyplot as plt

        # plt.plot(fpr, tpr)
        # plt.show()

        # Now try case where we have a training set
        folds = 1
        testSize = 5
        trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
        trainX, testX = trainTestXs[0]

        fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
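The commented-out plotting lines above hint at how the returned arrays are used; here is a minimal, hedged sketch (the headless backend is an illustrative choice, not from the source).

import matplotlib
matplotlib.use("Agg")  # illustrative headless backend
import matplotlib.pyplot as plt

# fpr, tpr as returned by MCEvaluator.averageRocCurve above
plt.plot(fpr, tpr)
plt.xlabel("false positive rate")
plt.ylabel("true positive rate")
plt.savefig("averageRoc.png")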
Example #8
    def testAverageRocCurve(self):
        m = 50
        n = 20
        k = 8
        u = 20.0 / m
        w = 1 - u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n),
                                                                k,
                                                                w,
                                                                csarray=True,
                                                                verbose=True,
                                                                indsPerRow=200)

        fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

        import matplotlib
        matplotlib.use("GTK3Agg")
        import matplotlib.pyplot as plt
        #plt.plot(fpr, tpr)
        #plt.show()

        #Now try case where we have a training set
        folds = 1
        testSize = 5
        trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
        trainX, testX = trainTestXs[0]

        fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
Example #9
    def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
        """
        ParamList is a list of lists of parameters and paramFunc
        is a list of the corresponding functions to call with the parameters
        as arguments. Note that a parameter can also be a tuple which is expanded
        out before the function is called. 

        e.g.
        paramList = [[1, 2], [2, 1], [12, 1]]
        paramFunc = [predictor.setC, predictor.setD]
        """

        inds = Sampling.crossValidation(folds, graph.getNumEdges())
        errors = numpy.zeros((len(paramList), folds))
        allEdges = graph.getAllEdges()

        for i in range(len(paramList)):
            paramSet = paramList[i]
            logging.debug("Using paramSet=" + str(paramSet))

            for j in range(len(paramSet)):
                if type(paramSet[j]) == tuple:
                    paramFunc[j](*paramSet[j])
                else:
                    paramFunc[j](paramSet[j])

            predY = numpy.zeros(0)
            y = numpy.zeros(0)
            j = 0

            for (trainInds, testInds) in inds:
                trainEdges = allEdges[trainInds, :]
                testEdges = allEdges[testInds, :]

                trainGraph = SparseGraph(graph.getVertexList(),
                                         graph.isUndirected())
                trainGraph.addEdges(trainEdges,
                                    graph.getEdgeValues(trainEdges))

                testGraph = SparseGraph(graph.getVertexList(),
                                        graph.isUndirected())
                testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

                self.learnModel(trainGraph)

                predY = self.predictEdges(testGraph, testGraph.getAllEdges())
                y = testGraph.getEdgeValues(testGraph.getAllEdges())
                #Note that the order of the edges is different in testGraph as
                #opposed to graph when calling getAllEdges()

                errors[i, j] = errorFunc(y, predY)
                j = j + 1

            logging.info("Error of current fold: " +
                         str(numpy.mean(errors[i, :])))

        meanErrors = numpy.mean(errors, 1)
        strErrors = numpy.std(errors, 1)

        return meanErrors, strErrors
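Following the docstring's own example, a hedged sketch of calling cvModelSelection; the graph, the predictor and its setC/setD setters are assumed to exist and are illustrative only.

import numpy

# Each row of paramList is applied via the corresponding setter in paramFunc
# before the folds are evaluated with errorFunc.
def squaredError(y, predY):
    return numpy.mean((y - predY)**2)

paramList = [[1, 2], [2, 1], [12, 1]]
paramFunc = [predictor.setC, predictor.setD]
meanErrors, stdErrors = predictor.cvModelSelection(graph, paramList, paramFunc, 3, squaredError)
bestParams = paramList[numpy.argmin(meanErrors)]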
Example #10
    def testParallelPen(self):
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

        tol = 10**-6
        bestError = 1
        trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                penalty = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += Evaluator.binaryError(
                        predY, self.y) - Evaluator.binaryError(
                            predTrainY, trainY)

                penalty = penalty * Cv[0] / len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
                penalties2[i, j] = penalty
                meanErrors2[i,
                            j] = Evaluator.binaryError(predY, self.y) + penalty

                if meanErrors2[i, j] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = meanErrors2[i, j]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
        self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
        self.assertTrue(
            numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
Example #11
    def parallelGridSearch(self,
                           X,
                           paramDict,
                           evaluationMethod,
                           testX=None,
                           minVal=True):
        """
        Perform parallel model selection using any learner. 
        """
        logging.debug("Parallel grid search with params: " + str(paramDict))

        m, n = X.shape
        if testX == None:
            trainTestXs = Sampling.shuffleSplitRows(X, self.folds,
                                                    self.validationSize)
        else:
            trainTestXs = [[X, testX]]

        gridSize = []
        gridInds = []
        for key in paramDict.keys():
            gridSize.append(paramDict[key].shape[0])
            gridInds.append(numpy.arange(paramDict[key].shape[0]))

        meanMetrics = numpy.zeros(tuple(gridSize))
        paramList = []

        for icv, (trainX, testX) in enumerate(trainTestXs):
            indexIter = itertools.product(*gridInds)

            for inds in indexIter:
                learner = self.copy()

                for i, (key, val) in enumerate(paramDict.items()):
                    setattr(learner, key, val[inds[i]])

                paramList.append((trainX, testX, learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList,
                                        self.chunkSize)
        else:
            resultsIterator = itertools.imap(evaluationMethod, paramList)

        for icv, (trainX, testX) in enumerate(trainTestXs):
            indexIter = itertools.product(*gridInds)
            for inds in indexIter:
                metric = resultsIterator.next()
                meanMetrics[inds] += metric / float(self.folds)

        if self.numProcesses != 1:
            pool.terminate()

        resultDict, bestMetric = self.setBestLearner(meanMetrics, paramDict,
                                                     minVal)

        return meanMetrics
Example #12
    def profileModelSelect(self):
        lmbdas = numpy.linspace(1.0, 0.01, 5)
        softImpute = IterativeSoftImpute(k=500)

        folds = 5
        cvInds = Sampling.randCrossValidation(folds, self.X.nnz)
        ProfileUtils.profile('softImpute.modelSelect(self.X, lmbdas, cvInds)',
                             globals(), locals())
Example #13
    def testShuffleSplit(self):
        numExamples = 10
        folds = 5

        indices = Sampling.shuffleSplit(folds, numExamples)
        
        for i in range(folds):
            self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
        
        indices = Sampling.shuffleSplit(folds, numExamples, 0.5)
        trainSize = numExamples*0.5

        for i in range(folds):
            self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
            self.assertTrue(indices[i][0].shape[0] == trainSize)

        indices = Sampling.shuffleSplit(folds, numExamples, 0.55)
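The assertions above characterise shuffleSplit: each split partitions range(numExamples), and an optional proportion fixes the training size. A short hedged sketch of the same behaviour, with an assumed import path:

import numpy
from sandbox.util.Sampling import Sampling  # assumed import path

numExamples = 10
for trainInds, testInds in Sampling.shuffleSplit(5, numExamples, 0.5):
    # Train and test indices partition range(numExamples); the training
    # part holds numExamples*0.5 indices, as asserted in the test above.
    assert numpy.union1d(trainInds, testInds).shape[0] == numExamples
    print(trainInds.shape[0], testInds.shape[0])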
Example #14
    def testSampleUsers2(self): 
        m = 10
        n = 15
        r = 5 
        u = 0.3
        w = 1-u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), r, w, csarray=True, verbose=True, indsPerRow=200)

        k = X.nnz+100
        X2, userInds = Sampling.sampleUsers2(X, k)

        nptst.assert_array_equal(X.toarray(), X2.toarray())
        
        #Test pruning of cols 
        k = 500
        m = 100
        n = 500
        u = 0.1
        w = 1 - u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), r, w, csarray=True, verbose=True, indsPerRow=200)
        numpy.random.seed(21)
        X2, userInds = Sampling.sampleUsers2(X, k, prune=True)
        nnz1 = X2.nnz
        self.assertTrue((X2.sum(0)!=0).all())

        numpy.random.seed(21)
        X2, userInds = Sampling.sampleUsers2(X, k, prune=False)
        nnz2 = X2.nnz
        self.assertEquals(nnz1, nnz2)

        numRuns = 50
        for i in range(numRuns): 
            m = numpy.random.randint(10, 100)
            n = numpy.random.randint(10, 100)
            k = 500

            X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), r, w, csarray=True, verbose=True, indsPerRow=200)

            X2, userInds = Sampling.sampleUsers2(X, k)
            

            self.assertTrue((X.dot(X.T)!=numpy.zeros((m, m)).all()))
            self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
            self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
Example #15
 def cvPrune(self, validX, validY): 
     """
     We do something like reduced error pruning but we use cross validation 
     to decide which nodes to prune. 
     """
     
     #First set the value of the vertices using the training set. 
     #Reset all alphas to zero 
     inds = Sampling.crossValidation(self.folds, validX.shape[0])
     
     for i in self.tree.getAllVertexIds(): 
         self.tree.getVertex(i).setAlpha(0.0)
         self.tree.getVertex(i).setTestError(0.0)
     
     for trainInds, testInds in inds:             
         rootId = (0,)
         root = self.tree.getVertex(rootId)
         root.setTrainInds(trainInds)
         root.setTestInds(testInds)
         root.tempValue = numpy.mean(validY[trainInds])
         
         nodeStack = [(rootId, root.tempValue)]
         
         while len(nodeStack) != 0: 
             (nodeId, value) = nodeStack.pop()
             node = self.tree.getVertex(nodeId)
             tempTrainInds = node.getTrainInds()
             tempTestInds = node.getTestInds()
             node.setTestError(numpy.sum((validY[tempTestInds] - node.tempValue)**2) + node.getTestError())
             childIds = [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)]
             
             for childId in childIds:                 
                 if self.tree.vertexExists(childId): 
                     child = self.tree.getVertex(childId)
                     
                     if childId[-1] == 0: 
                         childInds = validX[tempTrainInds, node.getFeatureInd()] < node.getThreshold()
                     else: 
                         childInds = validX[tempTrainInds, node.getFeatureInd()] >= node.getThreshold()
                     
                     if childInds.sum() !=0:   
                         value = numpy.mean(validY[tempTrainInds[childInds]])
                         
                     child.tempValue = value 
                     child.setTrainInds(tempTrainInds[childInds])
                     nodeStack.append((childId, value))
                     
                     if childId[-1] == 0: 
                         childInds = validX[tempTestInds, node.getFeatureInd()] < node.getThreshold() 
                     else: 
                         childInds = validX[tempTestInds, node.getFeatureInd()] >= node.getThreshold()  
                      
                     child.setTestInds(tempTestInds[childInds])
     
     self.computeAlphas()
     self.prune()
Example #16
    def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
        """
        ParamList is a list of lists of parameters and paramFunc
        is a list of the corresponding functions to call with the parameters
        as arguments. Note that a parameter can also be a tuple which is expanded
        out before the function is called. 

        e.g.
        paramList = [[1, 2], [2, 1], [12, 1]]
        paramFunc = [predictor.setC, predictor.setD]
        """

        inds = Sampling.crossValidation(folds, graph.getNumEdges())
        errors = numpy.zeros((len(paramList), folds))
        allEdges = graph.getAllEdges()

        for i in range(len(paramList)):
            paramSet = paramList[i]
            logging.debug("Using paramSet=" + str(paramSet))

            for j in range(len(paramSet)):
                if type(paramSet[j]) == tuple:
                    paramFunc[j](*paramSet[j])
                else: 
                    paramFunc[j](paramSet[j])

            predY = numpy.zeros(0)
            y = numpy.zeros(0)
            j = 0 

            for (trainInds, testInds) in inds:
                trainEdges = allEdges[trainInds, :]
                testEdges = allEdges[testInds, :]

                trainGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
                trainGraph.addEdges(trainEdges, graph.getEdgeValues(trainEdges))

                testGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
                testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

                self.learnModel(trainGraph)

                predY = self.predictEdges(testGraph, testGraph.getAllEdges())
                y = testGraph.getEdgeValues(testGraph.getAllEdges())
                #Note that the order of the edges is different in testGraph as
                #opposed to graph when calling getAllEdges()

                errors[i, j] = errorFunc(y, predY)
                j = j+1 

            logging.info("Error of current fold: " + str(numpy.mean(errors[i, :])))

        meanErrors = numpy.mean(errors, 1)
        strErrors = numpy.std(errors, 1)

        return meanErrors, strErrors
Example #17
    def modelSelect(self, X, colProbs=None): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
        testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))
        
        if self.metric == "mrr":
            evaluationMethod = computeTestMRR
        elif self.metric == "f1": 
            evaluationMethod = computeTestF1
        else: 
            raise ValueError("Invalid metric: " + self.metric)        
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for i, k in enumerate(self.ks): 
            for j, lmbda in enumerate(self.lmbdas): 
                for icv, (trainX, testX) in enumerate(trainTestXs):                
                    learner = self.copy()
                    learner.k = k
                    learner.lmbda = lmbda 
                
                    paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))
            
        if self.numProcesses != 1: 
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
        else: 
            import itertools
            resultsIterator = itertools.imap(evaluationMethod, paramList)
        
        for i, k in enumerate(self.ks):
            for j, lmbda in enumerate(self.lmbdas):
                for icv in range(len(trainTestXs)):             
                    testMetrics[i, j, icv] = resultsIterator.next()
        
        if self.numProcesses != 1: 
            pool.terminate()
            
        meanTestMetrics= numpy.mean(testMetrics, 2)
        stdTestMetrics = numpy.std(testMetrics, 2)
        
        logging.debug("ks=" + str(self.ks)) 
        logging.debug("lmbdas=" + str(self.lmbdas)) 
        logging.debug("Mean metrics=" + str(meanTestMetrics))
        
        self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[0]]
        self.lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))
         
        return meanTestMetrics, stdTestMetrics
Example #18
    def testBootstrap2(self):
        numExamples = 10
        folds = 2

        indices = Sampling.bootstrap2(folds, numExamples)

        for i in range(folds):
            self.assertEquals(indices[i][0].shape[0], numExamples)
            self.assertTrue(indices[i][1].shape[0] < numExamples)
            self.assertTrue((numpy.union1d(indices[0][0], indices[0][1]) == numpy.arange(numExamples)).all())
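The assertions suggest bootstrap2 draws a training sample of size numExamples with replacement and keeps the remaining out-of-bag indices as the test set; a small hedged sketch, again with an assumed import path:

import numpy
from sandbox.util.Sampling import Sampling  # assumed import path

numExamples = 10
for trainInds, testInds in Sampling.bootstrap2(2, numExamples):
    # Training indices may repeat (bootstrap); the test indices are
    # those examples never drawn into the training sample.
    print(numpy.unique(trainInds).shape[0], testInds.shape[0])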
Example #19
    def testParallelPen(self): 
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {} 
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()            
        
        resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)
        
        tol = 10**-6
        bestError = 1
        trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                penalty = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

                penalty = penalty*Cv[0]/len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
                penalties2[i, j] = penalty
                meanErrors2[i, j] = Evaluator.binaryError(predY, self.y) + penalty

                if meanErrors2[i, j] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = meanErrors2[i, j]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
        self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
        self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
Example #20
    def modelSelect(self, X):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        localAucs = numpy.zeros(
            (self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

        logging.debug("Performing model selection")
        paramList = []

        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            testOmegaList = SparseUtils.getOmegaList(testX)

            for i, k in enumerate(self.ks):
                maxLocalAuc = self.copy()
                maxLocalAuc.k = k
                paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

        pool = multiprocessing.Pool(processes=self.numProcesses,
                                    maxtasksperchild=100)
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
        #import itertools
        #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

        for icv, (trainInds, testInds) in enumerate(cvInds):
            for i, k in enumerate(self.ks):
                tempAucs = resultsIterator.next()
                localAucs[i, :, icv] = tempAucs

        pool.terminate()

        meanLocalAucs = numpy.mean(localAucs, 2)
        stdLocalAucs = numpy.std(localAucs, 2)

        logging.debug(meanLocalAucs)

        k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                        meanLocalAucs.shape)[0]]
        lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs),
                                                meanLocalAucs.shape)[1]]

        logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

        self.k = k
        self.lmbda = lmbda

        return meanLocalAucs, stdLocalAucs
Example #21
    def generateLearner(self, X, y):
        """
        Train using the given examples and labels, and use model selection to
        find the best parameters.
        """
        if numpy.unique(y).shape[0] != 2:
            print(y)
            raise ValueError("Can only operate on binary data")

        #Do model selection first 
        if self.sampleSize == None: 
            idx = Sampling.crossValidation(self.folds, X.shape[0])
            learner, meanErrors = self.parallelModelSelect(X, y, idx, self.paramDict)
        else: 
            idx = Sampling.crossValidation(self.folds, self.sampleSize)
            inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
            learner, meanErrors = self.parallelModelSelect(X[inds, :], y[inds], idx, self.paramDict)
            learner = self.getBestLearner(meanErrors, self.paramDict, X, y)
        
        return learner
Example #22
 def getDataset(dataset, nnz=20000): 
     """
     Return a dataset by name
     """        
     
     if dataset == "synthetic": 
         X, U, V = DatasetUtils.syntheticDataset1()
     elif dataset == "synthetic2": 
         X = DatasetUtils.syntheticDataset2()
     elif dataset == "movielens": 
         X = DatasetUtils.movieLens()
     elif dataset == "epinions": 
         X = DatasetUtils.epinions()
         X, userInds = Sampling.sampleUsers2(X, nnz, prune=True)    
     elif dataset == "flixster": 
         X = DatasetUtils.flixster()
         X, userInds = Sampling.sampleUsers2(X, nnz, prune=True)
     else: 
         raise ValueError("Unknown dataset: " + dataset)
         
     return X
Example #23
    def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
        """
        Compute the cross validation according to a given metric. 
        """
        Parameter.checkInt(folds, 2, float('inf'))
        idx = Sampling.crossValidation(folds, y.shape[0])
        metrics = AbstractPredictor.evaluateLearn(X, y, idx, self.learnModel, self.predict, metricMethod)

        mean = numpy.mean(metrics, 0)
        var = numpy.var(metrics, 0)

        return (mean, var)
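A hedged usage sketch of the method above; predictor stands for any learner exposing the learnModel/predict interface that evaluateLearn expects, and is an illustrative name rather than a class from the source.

# Illustrative: 5-fold cross-validated binary error of an assumed predictor.
mean, var = predictor.evaluateCv(X, y, 5)
print("CV error: %.3f (variance %.3f)" % (mean, var))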
Example #24
    def testParallelVfPenRbf2(self):
        #Test support vector regression 
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        resultsList = svm.parallelVfPenRbf(self.X, self.y, idx, Cv, type="Epsilon_SVR")

        tol = 10**-6 
        bestError = 100
        meanErrors2 = numpy.zeros((svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                for k in range(svm.epsilons.shape[0]):
                    epsilon = svm.epsilons[k]
                    
                    penalty = 0
                    for trainInds, testInds in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(self.X)
                        predTrainY = svm.predict(trainX)
                        penalty += svm.getMetricMethod()(predY, self.y) - svm.getMetricMethod()(predTrainY, trainY)

                    penalty = penalty*Cv[0]/len(idx)
                    svm.learnModel(self.X, self.y)
                    predY = svm.predict(self.X)
                    meanErrors2[j, k, i] = svm.getMetricMethod()(predY, self.y) + penalty

                    if meanErrors2[j, k, i] < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestEpsilon = epsilon 
                        bestError = meanErrors2[j, k, i]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertEquals(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
Example #25
    def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
        """
        Compute the cross validation according to a given metric. 
        """
        Parameter.checkInt(folds, 2, float('inf'))
        idx = Sampling.crossValidation(folds, y.shape[0])
        metrics = AbstractPredictor.evaluateLearn(X, y, idx, self.learnModel,
                                                  self.predict, metricMethod)

        mean = numpy.mean(metrics, 0)
        var = numpy.var(metrics, 0)

        return (mean, var)
Example #26
    def testParallelPenaltyGrid(self): 
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        randomForest = RandomForest()
        
        trainX = self.X[0:40, :]
        trainY = self.y[0:40]
        
        paramDict = {} 
        paramDict["setMinSplit"] = randomForest.getMinSplits()
        paramDict["setMaxDepth"] = randomForest.getMaxDepths()      

        idealPenalties = randomForest.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
Example #27
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext
    
    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]
    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
Example #28
    def generateLearner(self, X, y):
        """
        Train using the given examples and labels, and use model selection to
        find the best parameters.
        """
        if numpy.unique(y).shape[0] != 2:
            print(y)
            raise ValueError("Can only operate on binary data")

        #Do model selection first
        if self.sampleSize == None:
            idx = Sampling.crossValidation(self.folds, X.shape[0])
            learner, meanErrors = self.parallelModelSelect(
                X, y, idx, self.paramDict)
        else:
            idx = Sampling.crossValidation(self.folds, self.sampleSize)
            inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
            learner, meanErrors = self.parallelModelSelect(
                X[inds, :], y[inds], idx, self.paramDict)
            learner = self.getBestLearner(meanErrors, self.paramDict, X, y)

        return learner
Example #29
    def testParallelPenaltyGrid(self):
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        randomForest = RandomForest()

        trainX = self.X[0:40, :]
        trainY = self.y[0:40]

        paramDict = {}
        paramDict["setMinSplit"] = randomForest.getMinSplits()
        paramDict["setMaxDepth"] = randomForest.getMaxDepths()

        idealPenalties = randomForest.parallelPenaltyGrid(
            trainX, trainY, self.X, self.y, paramDict)
Example #30
    def parallelGridSearch(self, X, paramDict, evaluationMethod, testX=None, minVal=True):
        """
        Perform parallel model selection using any learner. 
        """
        logging.debug("Parallel grid search with params: " + str(paramDict))

        m, n = X.shape
        if testX == None:
            trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize)
        else:
            trainTestXs = [[X, testX]]

        gridSize = []
        gridInds = []
        for key in paramDict.keys():
            gridSize.append(paramDict[key].shape[0])
            gridInds.append(numpy.arange(paramDict[key].shape[0]))

        meanMetrics = numpy.zeros(tuple(gridSize))
        paramList = []

        for icv, (trainX, testX) in enumerate(trainTestXs):
            indexIter = itertools.product(*gridInds)

            for inds in indexIter:
                learner = self.copy()

                for i, (key, val) in enumerate(paramDict.items()):
                    setattr(learner, key, val[inds[i]])

                paramList.append((trainX, testX, learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
        else:
            resultsIterator = itertools.imap(evaluationMethod, paramList)

        for icv, (trainX, testX) in enumerate(trainTestXs):
            indexIter = itertools.product(*gridInds)
            for inds in indexIter:
                metric = resultsIterator.next()
                meanMetrics[inds] += metric / float(self.folds)

        if self.numProcesses != 1:
            pool.terminate()

        resultDict, bestMetric = self.setBestLearner(meanMetrics, paramDict, minVal)

        return meanMetrics
Example #31
    def testParallelVfcvRbf2(self):
        #In this test we try SVM regression
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        bestSVM, meanErrors = svm.parallelVfcvRbf(self.X,
                                                  self.y,
                                                  idx,
                                                  type="Epsilon_SVR")

        tol = 10**-6
        bestError = 100
        meanErrors2 = numpy.zeros(
            (svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                for k in range(svm.epsilons.shape[0]):
                    epsilon = svm.epsilons[k]

                    error = 0
                    for trainInds, testInds in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]
                        testX = self.X[testInds, :]
                        testY = self.y[testInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(testX)
                        error += svm.getMetricMethod()(predY, testY)

                    meanErrors2[j, k, i] = error / len(idx)

                    if error < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestError = error
                        bestEpsilon = epsilon

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertEquals(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
Example #32
    def testParallelModelSelect(self):
        X = scipy.sparse.rand(10, 10, 0.5)
        X = X.tocsr()

        numExamples = X.getnnz()
        paramDict = {}
        paramDict["setRank"] = numpy.array([5, 10, 20])
        folds = 3
        idx = Sampling.randCrossValidation(folds, numExamples)

        method = "lsnmf"
        nimfaFactorise = NimfaFactorise(method)
        learner, meanErrors = nimfaFactorise.parallelModelSelect(
            X, idx, paramDict)
Example #33
 def testParallelModelSelect(self): 
     X = scipy.sparse.rand(10, 10, 0.5)
     X = X.tocsr()
       
     numExamples = X.getnnz()
     paramDict = {}
     paramDict["setRank"] = numpy.array([5, 10, 20])
     folds = 3 
     idx = Sampling.randCrossValidation(folds, numExamples)
             
     
     method = "lsnmf"
     nimfaFactorise = NimfaFactorise(method)
     learner, meanErrors = nimfaFactorise.parallelModelSelect(X, idx, paramDict)
Example #34
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                maxLocalAuc = self.copy()
                maxLocalAuc.k = k
                paramList.append((trainX, testX, testOmegaList, maxLocalAuc))
                    
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
        #import itertools
        #resultsIterator = itertools.imap(localAucsLmbdas, paramList)
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempAucs = resultsIterator.next()
                localAucs[i, :, icv] = tempAucs
        
        pool.terminate()
        
        meanLocalAucs = numpy.mean(localAucs, 2)
        stdLocalAucs = numpy.std(localAucs, 2)
        
        logging.debug(meanLocalAucs)
        
        k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]]
        lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]]
        
        logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))
        
        self.k = k 
        self.lmbda = lmbda 
        
        return meanLocalAucs, stdLocalAucs
Example #35
    def testParallelPenaltyGrid(self): 
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        decisionTree = DecisionTree()
        bestLearner, meanErrors = decisionTree.parallelVfcv(self.X, self.y, idx)
        
        trainX = self.X[0:40, :]
        trainY = self.y[0:40]
        
        paramDict = {} 
        paramDict["setMinSplit"] = decisionTree.getMinSplits()
        paramDict["setMaxDepth"] = decisionTree.getMaxDepths()      

        idealPenalties = decisionTree.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
Example #36
    def modelSelect(self, X): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, self.folds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)
            
            testOmegaList = SparseUtils.getOmegaList(testX)
            
            for i, k in enumerate(self.ks): 
                learner = self.copy()
                learner.k = k
                paramList.append((trainX, testX, testOmegaList, learner))
                    
        #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
        import itertools
        resultsIterator = itertools.imap(computePrecision, paramList)
        
        for icv, (trainInds, testInds) in enumerate(cvInds):        
            for i, k in enumerate(self.ks): 
                tempPrecision = resultsIterator.next()
                precisions[i, icv] = tempPrecision
        
        #pool.terminate()
        
        meanPrecisions = numpy.mean(precisions, 1)
        stdPrecisions = numpy.std(precisions, 1)
        
        logging.debug(meanPrecisions)
        
        k = self.ks[numpy.argmax(meanPrecisions)]

        
        logging.debug("Model parameters: k=" + str(k)) 
        
        self.k = k 
        
        return meanPrecisions, stdPrecisions
Example #37
    def testModelSelect(self):
        lmbda = 0.1
        shape = (20, 20) 
        r = 20 
        numInds = 100
        noise = 0.2
        X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)
        
        U, s, V = numpy.linalg.svd(X.todense())

        k = 15

        iterativeSoftImpute = IterativeSoftImpute(lmbda, k=None, svdAlg="propack", updateAlg="zero")
        iterativeSoftImpute.numProcesses = 1
        rhos = numpy.linspace(0.5, 0.001, 20)
        ks = numpy.array([k], numpy.int)
        folds = 3
        cvInds = Sampling.randCrossValidation(folds, X.nnz)
        meanTestErrors, meanTrainErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)

        #Now do model selection manually 
        (rowInds, colInds) = X.nonzero()
        trainErrors = numpy.zeros((rhos.shape[0], len(cvInds)))
        testErrors = numpy.zeros((rhos.shape[0], len(cvInds)))
        
        for i, rho in enumerate(rhos): 
            for j, (trainInds, testInds) in enumerate(cvInds): 
                trainX = scipy.sparse.csc_matrix(X.shape)
                testX = scipy.sparse.csc_matrix(X.shape)
                
                for p in trainInds: 
                    trainX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]
                    
                for p in testInds: 
                    testX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]
                                 
                softImpute = SoftImpute(numpy.array([rho]), k=ks[0]) 
                ZList = [softImpute.learnModel(trainX, fullMatrices=False)]
                
                predTrainX = softImpute.predict(ZList, trainX.nonzero())[0]
                predX = softImpute.predict(ZList, testX.nonzero())[0]

                testErrors[i, j] = MCEvaluator.rootMeanSqError(testX, predX)
                trainErrors[i, j] = MCEvaluator.rootMeanSqError(trainX, predTrainX)
        
        meanTestErrors2 = testErrors.mean(1)   
        meanTrainErrors2 = trainErrors.mean(1)  
        
        nptst.assert_array_almost_equal(meanTestErrors.ravel(), meanTestErrors2, 1) 
Example #38
    def testParallelVfcvRbf2(self):
        #In this test we try SVM regression 
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        bestSVM, meanErrors = svm.parallelVfcvRbf(self.X, self.y, idx, type="Epsilon_SVR")

        tol = 10**-6
        bestError = 100
        meanErrors2 = numpy.zeros((svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                for k in range(svm.epsilons.shape[0]):
                    epsilon = svm.epsilons[k]

                    error = 0
                    for trainInds, testInds in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]
                        testX = self.X[testInds, :]
                        testY = self.y[testInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(testX)
                        error += svm.getMetricMethod()(predY, testY)

                    meanErrors2[j, k, i] = error/len(idx)

                    if error < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestError = error
                        bestEpsilon = epsilon

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertEquals(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
Example #39
    def testParallelModelSelect(self):
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx,
                                                      paramDict)

        tol = 10**-6
        bestError = 1
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        print("Computing real grid")

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += Evaluator.binaryError(predY, testY)

                meanErrors2[i, j] = error / len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
Example #40
    def testParallelModelSelect(self):
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {} 
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()    
        
        bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx, paramDict)
        
        tol = 10**-6 
        bestError = 1
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0])) 
        print("Computing real grid")

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += Evaluator.binaryError(predY, testY)

                meanErrors2[i, j] = error/len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error
            
        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
Example #41
    def testGetBestLearner(self):
        svm = self.svm
        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        errors = numpy.random.rand(svm.getCs().shape[0],
                                   svm.getGammas().shape[0])

        folds = 5
        idx = Sampling.crossValidation(folds, self.X.shape[0])

        svm.normModelSelect = True
        svm.setKernel("gaussian")
        learner = svm.getBestLearner(errors, paramDict, self.X, self.y, idx)

        bestC = learner.getC()

        #Find the best norm
        bestInds = numpy.unravel_index(numpy.argmin(errors), errors.shape)
        learner.setC(svm.getCs()[bestInds[0]])
        learner.setGamma(svm.getGammas()[bestInds[1]])

        norms = []
        for trainInds, testInds in idx:
            validX = self.X[trainInds, :]
            validY = self.y[trainInds]
            learner.learnModel(validX, validY)

            norms.append(learner.weightNorm())

        bestNorm = numpy.array(norms).mean()

        norms = numpy.zeros(paramDict["setC"].shape[0])
        for i, C in enumerate(paramDict["setC"]):
            learner.setC(C)
            learner.learnModel(self.X, self.y)
            norms[i] = learner.weightNorm()

        bestC2 = paramDict["setC"][numpy.abs(norms - bestNorm).argmin()]

        self.assertEquals(bestC, bestC2)
Example #42
    def testGetBestLearner(self): 
        svm = self.svm
        paramDict = {} 
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()      

        errors = numpy.random.rand(svm.getCs().shape[0], svm.getGammas().shape[0])

        folds = 5 
        idx = Sampling.crossValidation(folds, self.X.shape[0])

        svm.normModelSelect = True 
        svm.setKernel("gaussian")
        learner = svm.getBestLearner(errors, paramDict, self.X, self.y, idx)
        
        bestC = learner.getC()
        
        #Find the best norm 
        bestInds = numpy.unravel_index(numpy.argmin(errors), errors.shape)
        learner.setC(svm.getCs()[bestInds[0]])
        learner.setGamma(svm.getGammas()[bestInds[1]])              
        
        norms = []
        for trainInds, testInds in idx: 
            validX = self.X[trainInds, :]
            validY = self.y[trainInds]
            learner.learnModel(validX, validY)
            
            norms.append(learner.weightNorm())  
        
        bestNorm = numpy.array(norms).mean()
        
        norms = numpy.zeros(paramDict["setC"].shape[0]) 
        for i, C in enumerate(paramDict["setC"]): 
            learner.setC(C)
            learner.learnModel(self.X, self.y)
            norms[i] = learner.weightNorm()            
            
        bestC2 = paramDict["setC"][numpy.abs(norms-bestNorm).argmin()]
        
        self.assertEquals(bestC, bestC2)
Example #43
    def profileLearnModel2(self):
        #Profile stochastic case
        #X = DatasetUtils.flixster()
        #X = Sampling.sampleUsers(X, 1000)
        X, U, V = DatasetUtils.syntheticDataset1(u=0.001, m=10000, n=1000)

        rho = 0.00
        u = 0.2
        w = 1 - u
        eps = 10**-6
        alpha = 0.5
        k = self.k
        maxLocalAuc = MaxLocalAUC(k, w, alpha=alpha, eps=eps, stochastic=True)
        maxLocalAuc.numRowSamples = 2
        maxLocalAuc.numAucSamples = 10
        maxLocalAuc.maxIterations = 1
        maxLocalAuc.numRecordAucSamples = 100
        maxLocalAuc.recordStep = 10
        maxLocalAuc.initialAlg = "rand"
        maxLocalAuc.rate = "optimal"
        #maxLocalAuc.parallelSGD = True

        trainTestX = Sampling.shuffleSplitRows(X, maxLocalAuc.folds, 5)
        trainX, testX = trainTestX[0]

        def run():
            U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(
                trainX, True)
            #logging.debug("Train Precision@5=" + str(MCEvaluator.precisionAtK(trainX, U, V, 5)))
            #logging.debug("Train Precision@10=" + str(MCEvaluator.precisionAtK(trainX, U, V, 10)))
            #logging.debug("Train Precision@20=" + str(MCEvaluator.precisionAtK(trainX, U, V, 20)))
            #logging.debug("Train Precision@50=" + str(MCEvaluator.precisionAtK(trainX, U, V, 50)))

            #logging.debug("Test Precision@5=" + str(MCEvaluator.precisionAtK(testX, U, V, 5)))
            #logging.debug("Test Precision@10=" + str(MCEvaluator.precisionAtK(testX, U, V, 10)))
            #logging.debug("Test Precision@20=" + str(MCEvaluator.precisionAtK(testX, U, V, 20)))
            #logging.debug("Test Precision@50=" + str(MCEvaluator.precisionAtK(testX, U, V, 50)))

        ProfileUtils.profile('run()', globals(), locals())
Example #44
def processParkinsonsDataset(name, numRealisations):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"
    

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]
    #We don't keep whole collections of patients
    split = 0.5

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)
    
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
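A rough sketch of the split assumed above: shuffleSplit(numRealisations, n, split) is consumed as a sequence of (trainInds, testInds) pairs, which could be produced along these lines (illustrative only, not the library's implementation):

import numpy

def shuffleSplitSketch(numRealisations, n, split):
    # One random permutation per realisation, with a `split` fraction of the
    # indices used for training and the remainder held out.
    idx = []
    for _ in range(numRealisations):
        perm = numpy.random.permutation(n)
        cut = int(n * split)
        idx.append((perm[:cut], perm[cut:]))
    return idx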
Example #45
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data" 
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    X[data[:, 0]-1, data[:, 1]-1] = numpy.array(data[:, 2]>3, numpy.int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))
    
    u = 0.1 
    w = 1-u
    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
Example #47
    def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
        """
        Choose parameters based on a single matrix X. We do cross validation
        within, and set parameters according to the mean squared error.
        Return nothing.
        """
        logging.debug("Performing model selection")

        # useful preliminaries: work on the COO representation and free memory
        X = X.tocoo()
        gc.collect()
        nK = len(ks)
        nLmbda = len(lmbdas)
        nGamma = len(gammas)
        nLG = nLmbda * nGamma
        errors = scipy.zeros((nK, nLmbda, nGamma, nFolds))

        # generate cross validation sets
        cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

        # compute error for each fold / setting
        for icv, (trainInds, testInds) in enumerate(cvInds):
            Util.printIteration(icv, 1, nFolds, "Fold: ")

            trainX = SparseUtils.submatrix(X, trainInds)
            testX = SparseUtils.submatrix(X, testInds)

            assert trainX.nnz == trainInds.shape[0]
            assert testX.nnz == testInds.shape[0]
            nptst.assert_array_almost_equal((testX + trainX).data, X.data)

            paramList = []

            for ik, k in enumerate(ks):
                for ilmbda, lmbda in enumerate(lmbdas):
                    for igamma, gamma in enumerate(gammas):
                        paramList.append(
                            (trainX, testX, k, lmbda, gamma, maxNTry))

            # ! Remark !
            # The runs over parameter settings can easily be parallelised;
            # parallelising over the cv-folds is not done as it is much more
            # memory-consuming.

            # parallel version (copied from IterativeSoftImpute, but not tested)
            #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
            #results = pool.imap(self.learnPredict, paramList)
            #pool.terminate()

            # non-parallel version
            results = scipy.array(
                list(itertools.starmap(self.learnPredict, paramList)))

            errors[:, :, :, icv] = scipy.array(results).reshape(
                (nK, nLmbda, nGamma))

        # compute cross validation error for each setting
        errors[errors == float("inf")] = errors[errors != float("inf")].max()
        errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(
            numpy.isnan(errors))])
        meanErrors = errors.mean(3)
        stdErrors = errors.std(3)
        logging.debug("Mean errors given (k, lambda, gamma):")
        logging.debug(meanErrors)
        logging.debug("... with standard deviation:")
        logging.debug(stdErrors)

        # keep the best
        iMin = meanErrors.argmin()
        kMin = ks[int(scipy.floor(iMin / (nLG)))]
        lmbdaMin = lmbdas[int(scipy.floor((iMin % nLG) / nGamma))]
        gammaMin = gammas[int(scipy.floor(iMin % nGamma))]
        logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " +
                      str(lmbdaMin) + ", " + str(gammaMin) + ")")
        logging.debug("min = " +
                      str(meanErrors[int(scipy.floor(iMin / (nLG))),
                                     int(scipy.floor((iMin % nLG) / nGamma)),
                                     int(scipy.floor(iMin % nGamma))]))

        self.baseLearner.k = kMin
        self.baseLearner.lmbda = lmbdaMin
        self.baseLearner.gamma = gammaMin

        return
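The floor/modulo arithmetic that recovers (kMin, lmbdaMin, gammaMin) from the flat argmin above is equivalent to numpy.unravel_index, which later examples use directly; a small self-contained check with made-up grid sizes:

import numpy

meanErrors = numpy.random.rand(3, 4, 5)      # shape (nK, nLmbda, nGamma)
nK, nLmbda, nGamma = meanErrors.shape
nLG = nLmbda * nGamma

iMin = meanErrors.argmin()
kInd = iMin // nLG                            # floor/modulo decoding as above
lmbdaInd = (iMin % nLG) // nGamma
gammaInd = iMin % nGamma

assert (kInd, lmbdaInd, gammaInd) == numpy.unravel_index(iMin, meanErrors.shape)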
Example #48
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                colProbs=colProbs)
        testMetrics = numpy.zeros(
            (self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))

        if self.metric == "mrr":
            evaluationMethod = computeTestMRR
        elif self.metric == "f1":
            evaluationMethod = computeTestF1
        else:
            raise ValueError("Invalid metric: " + self.metric)

        logging.debug("Performing model selection")
        paramList = []

        for i, k in enumerate(self.ks):
            for j, lmbda in enumerate(self.lmbdas):
                for icv, (trainX, testX) in enumerate(trainTestXs):
                    learner = self.copy()
                    learner.k = k
                    learner.lmbda = lmbda

                    paramList.append(
                        (trainX.toScipyCsr(), testX.toScipyCsr(), learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList,
                                        self.chunkSize)
        else:
            import itertools
            resultsIterator = itertools.imap(evaluationMethod, paramList)

        for i, k in enumerate(self.ks):
            for j, lmbda in enumerate(self.lmbdas):
                for icv in range(len(trainTestXs)):
                    testMetrics[i, j, icv] = resultsIterator.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanTestMetrics = numpy.mean(testMetrics, 2)
        stdTestMetrics = numpy.std(testMetrics, 2)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdas=" + str(self.lmbdas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics),
                                             meanTestMetrics.shape)[0]]
        self.lmbda = self.lmbdas[numpy.unravel_index(
            numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" +
                      str(self.lmbda))

        return meanTestMetrics, stdTestMetrics
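Note that itertools.imap and resultsIterator.next() above are Python 2 idioms. Under Python 3 the sequential branch would rely on the built-in lazy map and next(); a stand-alone sketch, with evaluationMethodStub as a made-up stand-in for computeTestMRR/computeTestF1:

def evaluationMethodStub(args):
    # Stand-in for computeTestMRR / computeTestF1: unpack and return a dummy score.
    trainX, testX, learner = args
    return 0.0

paramList = [(None, None, None) for _ in range(3)]
resultsIterator = map(evaluationMethodStub, paramList)   # lazy in Python 3
results = [next(resultsIterator) for _ in paramList]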
Example #49
    def learningRate(self, X, y, foldsSet, paramDict):
        """
        Find a matrix beta which has the same dimensions as the parameter grid. 
        Each value in the grid represents the learning rate with respect to 
        those particular parameters.         
        
        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels 
        :type y: :class:`numpy.ndarray`

        :param foldsSet: A list of folds to try. 

        :param paramDict: A dictionary indexed by the method name and with value as an array of values
        :type paramDict: :class:`dict`
        """
        try:
            from sklearn import linear_model
        except ImportError:
            raise

        gridSize = []
        gridInds = []
        for key in paramDict.keys():
            gridSize.append(paramDict[key].shape[0])
            gridInds.append(numpy.arange(paramDict[key].shape[0]))

        betaGrid = numpy.ones(tuple(gridSize))

        gridSize.insert(0, foldsSet.shape[0])
        penalties = numpy.zeros(tuple(gridSize))
        Cvs = numpy.array([1])

        for i in range(foldsSet.shape[0]):
            folds = foldsSet[i]
            logging.debug("Folds " + str(folds))

            idx = Sampling.crossValidation(folds, X.shape[0])
            resultsList = self.parallelPen(X, y, idx, paramDict, Cvs)
            bestLearner, trainErrors, currentPenalties = resultsList[0]
            penalties[i, :] = currentPenalties

        indexIter = itertools.product(*gridInds)

        for inds in indexIter:
            inds2 = [slice(0, penalties.shape[0])]
            inds2.extend(inds)
            inds2 = tuple(inds2)
            tempPenalties = penalties[inds2]

            penInds = numpy.logical_and(numpy.isfinite(tempPenalties),
                                        tempPenalties > 0)
            penInds = numpy.squeeze(penInds)
            tempPenalties = tempPenalties[penInds].flatten()
            tempfoldsSet = numpy.array(foldsSet, numpy.float)[penInds]

            if tempPenalties.shape[0] > 1:
                xp = numpy.log((tempfoldsSet - 1) / tempfoldsSet * X.shape[0])
                yp = numpy.log(tempPenalties) + numpy.log(tempfoldsSet)

                clf = linear_model.LinearRegression()
                clf.fit(numpy.array([xp]).T, yp)
                betaGrid[inds] = clf.coef_[0]

        return -betaGrid
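Read directly off the code above, the regression in learningRate fits log(penalty * folds) against log(n * (folds - 1) / folds) for each grid cell and stores the negated slope as the learning rate. A self-contained illustration with made-up penalty values:

import numpy
from sklearn import linear_model

foldsSet = numpy.array([2.0, 4.0, 8.0])
penalties = numpy.array([0.30, 0.18, 0.11])   # hypothetical penalty estimates
n = 100                                       # hypothetical number of examples

xp = numpy.log((foldsSet - 1) / foldsSet * n)
yp = numpy.log(penalties) + numpy.log(foldsSet)

clf = linear_model.LinearRegression()
clf.fit(xp[:, None], yp)                      # same fit as in learningRate
beta = -clf.coef_[0]
print(beta)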
Example #50
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                csarray=True,
                                                colProbs=colProbs)
        testMetrics = numpy.zeros(
            (self.ks.shape[0], self.lmbdaUsers.shape[0],
             self.lmbdaItems.shape[0], self.gammas.shape[0], len(trainTestXs)))

        logging.debug(
            "Performing model selection with test leave out per row of " +
            str(self.validationSize))
        paramList = []

        for i, k in enumerate(self.ks):
            for j, lmbdaUser in enumerate(self.lmbdaUsers):
                for s, lmbdaItem in enumerate(self.lmbdaItems):
                    for t, gamma in enumerate(self.gammas):
                        for icv, (trainX, testX) in enumerate(trainTestXs):
                            learner = self.copy()
                            learner.k = k
                            learner.lmbdaUser = lmbdaUser
                            learner.lmbdaPos = lmbdaItem
                            learner.lmbdaNeg = lmbdaItem
                            learner.gamma = gamma

                            paramList.append((trainX, testX, learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(computeTestF1, paramList,
                                        self.chunkSize)
        else:
            import itertools
            resultsIterator = itertools.imap(computeTestF1, paramList)

        for i, k in enumerate(self.ks):
            for j, lmbdaUser in enumerate(self.lmbdaUsers):
                for s, lmbdaPos in enumerate(self.lmbdaItems):
                    for t, gamma in enumerate(self.gammas):
                        for icv, (trainX, testX) in enumerate(trainTestXs):
                            testMetrics[i, j, s, t,
                                        icv] = resultsIterator.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanTestMetrics = numpy.mean(testMetrics, 4)
        stdTestMetrics = numpy.std(testMetrics, 4)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdaUsers=" + str(self.lmbdaUsers))
        logging.debug("lmbdaItems=" + str(self.lmbdaItems))
        logging.debug("gammas=" + str(self.gammas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        indK, indLmbdaUser, indLmbdaItem, indGamma = numpy.unravel_index(
            meanTestMetrics.argmax(), meanTestMetrics.shape)
        self.k = self.ks[indK]
        self.lmbdaUser = self.lmbdaUsers[indLmbdaUser]
        self.lmbdaPos = self.lmbdaItems[indLmbdaItem]
        self.lmbdaNeg = self.lmbdaItems[indLmbdaItem]
        self.gamma = self.gammas[indGamma]

        logging.debug("Model parameters: " + str(self))

        return meanTestMetrics, stdTestMetrics
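Unlike crossValidation, which returns index arrays, shuffleSplitRows is consumed throughout these examples as a list of (trainX, testX) sparse-matrix pairs with validationSize observed entries per row held out. A rough sketch under that assumption (not the library's implementation):

import numpy
import scipy.sparse

def shuffleSplitRowsSketch(X, folds, validationSize):
    # Hold out up to `validationSize` observed entries per row in each split.
    X = scipy.sparse.csr_matrix(X)
    splits = []
    for _ in range(folds):
        trainX = X.tolil(copy=True)
        testX = scipy.sparse.lil_matrix(X.shape)
        for i in range(X.shape[0]):
            cols = X[i].indices
            if cols.shape[0] == 0:
                continue
            held = numpy.random.choice(cols, min(validationSize, cols.shape[0]),
                                       replace=False)
            for j in held:
                testX[i, j] = X[i, j]
                trainX[i, j] = 0
        splits.append((trainX.tocsr(), testX.tocsr()))
    return splits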