def testCrossValidation(self):
    numExamples = 10
    folds = 2

    indices = Sampling.crossValidation(folds, numExamples)

    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]))

    indices = Sampling.crossValidation(3, numExamples)

    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([3, 4, 5, 6, 7, 8, 9], [0, 1, 2]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 2, 6, 7, 8, 9], [3, 4, 5]))
    self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 2, 3, 4, 5], [6, 7, 8, 9]))

    indices = Sampling.crossValidation(4, numExamples)

    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([2, 3, 4, 5, 6, 7, 8, 9], [0, 1]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 1, 5, 6, 7, 8, 9], [2, 3, 4]))
    self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 2, 3, 4, 7, 8, 9], [5, 6]))
    self.assertEquals((list(indices[3][0]), list(indices[3][1])), ([0, 1, 2, 3, 4, 5, 6], [7, 8, 9]))

    indices = Sampling.crossValidation(numExamples, numExamples)

    self.assertEquals((list(indices[0][0]), list(indices[0][1])), ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0]))
    self.assertEquals((list(indices[1][0]), list(indices[1][1])), ([0, 2, 3, 4, 5, 6, 7, 8, 9], [1]))
    self.assertEquals((list(indices[2][0]), list(indices[2][1])), ([0, 1, 3, 4, 5, 6, 7, 8, 9], [2]))
    self.assertEquals((list(indices[3][0]), list(indices[3][1])), ([0, 1, 2, 4, 5, 6, 7, 8, 9], [3]))
    self.assertEquals((list(indices[4][0]), list(indices[4][1])), ([0, 1, 2, 3, 5, 6, 7, 8, 9], [4]))

    self.assertRaises(ValueError, Sampling.crossValidation, numExamples + 1, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, 0, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, -1, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, folds, 1)
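# A minimal reference sketch of the split the assertions above imply: the i-th test fold is the
# contiguous block [floor(i*n/folds), floor((i+1)*n/folds)) and the training set is its
# complement. This is inferred from the expected outputs and is not necessarily how
# Sampling.crossValidation is actually implemented.
import numpy

def crossValidationSketch(folds, numExamples):
    if folds < 1 or numExamples < folds:
        raise ValueError("Bad folds/numExamples: " + str((folds, numExamples)))

    allInds = numpy.arange(numExamples)
    bounds = (numpy.arange(folds + 1) * numExamples) // folds
    indexList = []

    for i in range(folds):
        testInds = allInds[bounds[i]:bounds[i + 1]]
        trainInds = numpy.setdiff1d(allInds, testInds)
        indexList.append((trainInds, testInds))

    return indexList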
def testSampleUsers(self):
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    k = 50
    X2, userInds = Sampling.sampleUsers(X, k)

    nptst.assert_array_equal(X.toarray(), X2.toarray())

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = numpy.random.randint(10, 100)
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

        X2, userInds = Sampling.sampleUsers(X, k)

        self.assertEquals(X2.shape[0], min(k, m))
        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m)).all()))
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
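# A sketch of the behaviour the assertions above imply for Sampling.sampleUsers (an assumption,
# not the real implementation): keep every row when k >= m, otherwise keep k rows drawn without
# replacement, returning the reduced matrix and the chosen row indices. X is assumed to support
# numpy-style row indexing.
import numpy

def sampleUsersSketch(X, k):
    m = X.shape[0]

    if k >= m:
        userInds = numpy.arange(m)
    else:
        userInds = numpy.random.choice(m, k, replace=False)

    return X[userInds, :], userInds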
def testRepCrossValidation(self):
    numExamples = 10
    folds = 3
    repetitions = 1

    indices = Sampling.repCrossValidation(folds, numExamples, repetitions)

    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())

    repetitions = 2
    indices = Sampling.repCrossValidation(folds, numExamples, repetitions)

    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
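# A sketch of repeated cross validation consistent with the union check above (again an
# assumption about the real code): each repetition shuffles the example indices and then applies
# a contiguous fold split, so every (train, test) pair still partitions arange(numExamples).
import numpy

def repCrossValidationSketch(folds, numExamples, repetitions):
    indexList = []

    for r in range(repetitions):
        perm = numpy.random.permutation(numExamples)
        bounds = (numpy.arange(folds + 1) * numExamples) // folds

        for i in range(folds):
            testInds = perm[bounds[i]:bounds[i + 1]]
            trainInds = numpy.setdiff1d(numpy.arange(numExamples), testInds)
            indexList.append((trainInds, testInds))

    return indexList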
def testParallelVfPenRbf2(self):
    #Test support vector regression
    folds = 3
    Cv = numpy.array([4.0])
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")
    svm.setSvmType("Epsilon_SVR")
    resultsList = svm.parallelVfPenRbf(self.X, self.y, idx, Cv, type="Epsilon_SVR")

    tol = 10**-6
    bestError = 100
    meanErrors2 = numpy.zeros((svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            for k in range(svm.epsilons.shape[0]):
                epsilon = svm.epsilons[k]

                penalty = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.setEpsilon(epsilon)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += svm.getMetricMethod()(predY, self.y) - svm.getMetricMethod()(predTrainY, trainY)

                penalty = penalty * Cv[0] / len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                meanErrors2[j, k, i] = svm.getMetricMethod()(predY, self.y) + penalty

                if meanErrors2[j, k, i] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestEpsilon = epsilon
                    bestError = meanErrors2[j, k, i]

    bestSVM, trainErrors, currentPenalties = resultsList[0]
    meanErrors = trainErrors + currentPenalties

    self.assertEquals(bestC, bestSVM.getC())
    self.assertEquals(bestGamma, bestSVM.getGamma())
    self.assertEquals(bestEpsilon, bestSVM.getEpsilon())
    self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
def testParallelPen(self):
    #Check if penalisation == inf when treeSize < gamma
    numExamples = 100
    X, y = data.make_regression(numExamples)
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)

    paramDict = {}
    paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5) - 1), dtype=numpy.int)

    folds = 3
    alpha = 1.0
    Cvs = numpy.array([(folds - 1) * alpha])
    idx = Sampling.crossValidation(folds, X.shape[0])

    resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)
    learner, trainErrors, currentPenalties = resultsList[0]

    learner.setGamma(2**10)
    treeSize = 0

    #Let's work out the size of the unpruned tree
    for trainInds, testInds in idx:
        trainX = X[trainInds, :]
        trainY = y[trainInds]
        learner.learnModel(trainX, trainY)
        treeSize += learner.tree.size

    treeSize /= float(folds)

    self.assertTrue(numpy.isinf(currentPenalties[paramDict["setGamma"] > treeSize]).all())
    self.assertTrue(not numpy.isinf(currentPenalties[paramDict["setGamma"] < treeSize]).all())
def profileModelSelect(self):
    lmbdas = numpy.linspace(1.0, 0.01, 5)
    softImpute = IterativeSoftImpute(k=500)

    folds = 5
    cvInds = Sampling.randCrossValidation(folds, self.X.nnz)
    ProfileUtils.profile('softImpute.modelSelect(self.X, lmbdas, cvInds)', globals(), locals())
def testAverageRocCurve(self):
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    matplotlib.use("GTK3Agg")
    import matplotlib.pyplot as plt
    #plt.plot(fpr, tpr)
    #plt.show()

    #Now try case where we have a training set
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]

    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
    """
    ParamList is a list of lists of parameters and paramFunc is a list of the
    corresponding functions to call with the parameters as arguments. Note that
    a parameter can also be a tuple which is expanded out before the function
    is called.

    e.g.
    paramList = [[1, 2], [2, 1], [12, 1]]
    paramFunc = [predictor.setC, predictor.setD]
    """
    inds = Sampling.crossValidation(folds, graph.getNumEdges())
    errors = numpy.zeros((len(paramList), folds))
    allEdges = graph.getAllEdges()

    for i in range(len(paramList)):
        paramSet = paramList[i]
        logging.debug("Using paramSet=" + str(paramSet))

        for j in range(len(paramSet)):
            if type(paramSet[j]) == tuple:
                paramFunc[j](*paramSet[j])
            else:
                paramFunc[j](paramSet[j])

        predY = numpy.zeros(0)
        y = numpy.zeros(0)
        j = 0

        for (trainInds, testInds) in inds:
            trainEdges = allEdges[trainInds, :]
            testEdges = allEdges[testInds, :]

            trainGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
            trainGraph.addEdges(trainEdges, graph.getEdgeValues(trainEdges))

            testGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
            testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

            self.learnModel(trainGraph)

            predY = self.predictEdges(testGraph, testGraph.getAllEdges())
            y = testGraph.getEdgeValues(testGraph.getAllEdges())
            #Note that the order of the edges is different in testGraph as
            #opposed to graph when calling getAllEdges()

            errors[i, j] = errorFunc(y, predY)
            j = j + 1

        logging.info("Error of current fold: " + str(numpy.mean(errors[i, :])))

    meanErrors = numpy.mean(errors, 1)
    strErrors = numpy.std(errors, 1)

    return meanErrors, strErrors
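# A hypothetical usage sketch of cvModelSelection, following its docstring: each entry of
# paramList supplies one value per setter in paramFunc, and the setting with the smallest mean
# error wins. The predictor and graph objects and the error function below are placeholders.
paramList = [[0.1, 1.0], [1.0, 1.0], [10.0, 0.5]]
paramFunc = [predictor.setC, predictor.setD]
errorFunc = lambda y, predY: numpy.abs(y - predY).mean()

meanErrors, stdErrors = predictor.cvModelSelection(graph, paramList, paramFunc, 5, errorFunc)
bestParams = paramList[numpy.argmin(meanErrors)]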
def testParallelPen(self):
    folds = 3
    Cv = numpy.array([4.0])
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")

    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()

    resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

    tol = 10**-6
    bestError = 1
    trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]

            penalty = 0
            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]

                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(self.X)
                predTrainY = svm.predict(trainX)
                penalty += Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

            penalty = penalty * Cv[0] / len(idx)
            svm.learnModel(self.X, self.y)
            predY = svm.predict(self.X)
            trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
            penalties2[i, j] = penalty
            meanErrors2[i, j] = Evaluator.binaryError(predY, self.y) + penalty

            if meanErrors2[i, j] < bestError:
                bestC = C
                bestGamma = gamma
                bestError = meanErrors2[i, j]

    bestSVM, trainErrors, currentPenalties = resultsList[0]
    meanErrors = trainErrors + currentPenalties

    self.assertEquals(bestC, bestSVM.getC())
    self.assertEquals(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
    self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
    self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
def parallelGridSearch(self, X, paramDict, evaluationMethod, testX=None, minVal=True):
    """
    Perform parallel model selection using any learner.
    """
    logging.debug("Parallel grid search with params: " + str(paramDict))
    m, n = X.shape

    if testX == None:
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize)
    else:
        trainTestXs = [[X, testX]]

    gridSize = []
    gridInds = []
    for key in paramDict.keys():
        gridSize.append(paramDict[key].shape[0])
        gridInds.append(numpy.arange(paramDict[key].shape[0]))

    meanMetrics = numpy.zeros(tuple(gridSize))
    paramList = []

    for icv, (trainX, testX) in enumerate(trainTestXs):
        indexIter = itertools.product(*gridInds)

        for inds in indexIter:
            learner = self.copy()

            for i, (key, val) in enumerate(paramDict.items()):
                setattr(learner, key, val[inds[i]])

            paramList.append((trainX, testX, learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        resultsIterator = itertools.imap(evaluationMethod, paramList)

    for icv, (trainX, testX) in enumerate(trainTestXs):
        indexIter = itertools.product(*gridInds)
        for inds in indexIter:
            metric = resultsIterator.next()
            meanMetrics[inds] += metric / float(self.folds)

    if self.numProcesses != 1:
        pool.terminate()

    resultDict, bestMetric = self.setBestLearner(meanMetrics, paramDict, minVal)

    return meanMetrics
def testShuffleSplit(self):
    numExamples = 10
    folds = 5

    indices = Sampling.shuffleSplit(folds, numExamples)

    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())

    indices = Sampling.shuffleSplit(folds, numExamples, 0.5)
    trainSize = numExamples * 0.5

    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
        self.assertTrue(indices[i][0].shape[0] == trainSize)

    indices = Sampling.shuffleSplit(folds, numExamples, 0.55)
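# A sketch consistent with the assertions above (assumed, not the library code): each of the
# folds splits permutes the examples independently and cuts the permutation so that the training
# part holds int(numExamples * trainProportion) indices; the default proportion here is a
# placeholder.
import numpy

def shuffleSplitSketch(folds, numExamples, trainProportion=0.8):
    trainSize = int(numExamples * trainProportion)
    indexList = []

    for i in range(folds):
        perm = numpy.random.permutation(numExamples)
        indexList.append((numpy.sort(perm[0:trainSize]), numpy.sort(perm[trainSize:])))

    return indexList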
def testSampleUsers2(self):
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    k = X.nnz + 100
    X2, userInds = Sampling.sampleUsers2(X, k)

    nptst.assert_array_equal(X.toarray(), X2.toarray())

    #Test pruning of cols
    k = 500
    m = 100
    n = 500
    u = 0.1
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=True)
    nnz1 = X2.nnz
    self.assertTrue((X2.sum(0) != 0).all())

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=False)
    nnz2 = X2.nnz
    self.assertEquals(nnz1, nnz2)

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = 500
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

        X2, userInds = Sampling.sampleUsers2(X, k)

        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m)).all()))
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
def cvPrune(self, validX, validY):
    """
    We do something like reduced error pruning but we use cross validation
    to decide which nodes to prune.
    """
    #First set the value of the vertices using the training set.
    #Reset all alphas to zero
    inds = Sampling.crossValidation(self.folds, validX.shape[0])

    for i in self.tree.getAllVertexIds():
        self.tree.getVertex(i).setAlpha(0.0)
        self.tree.getVertex(i).setTestError(0.0)

    for trainInds, testInds in inds:
        rootId = (0,)
        root = self.tree.getVertex(rootId)
        root.setTrainInds(trainInds)
        root.setTestInds(testInds)
        root.tempValue = numpy.mean(validY[trainInds])

        nodeStack = [(rootId, root.tempValue)]

        while len(nodeStack) != 0:
            (nodeId, value) = nodeStack.pop()
            node = self.tree.getVertex(nodeId)
            tempTrainInds = node.getTrainInds()
            tempTestInds = node.getTestInds()
            node.setTestError(numpy.sum((validY[tempTestInds] - node.tempValue)**2) + node.getTestError())
            childIds = [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)]

            for childId in childIds:
                if self.tree.vertexExists(childId):
                    child = self.tree.getVertex(childId)

                    if childId[-1] == 0:
                        childInds = validX[tempTrainInds, node.getFeatureInd()] < node.getThreshold()
                    else:
                        childInds = validX[tempTrainInds, node.getFeatureInd()] >= node.getThreshold()

                    if childInds.sum() != 0:
                        value = numpy.mean(validY[tempTrainInds[childInds]])

                    child.tempValue = value
                    child.setTrainInds(tempTrainInds[childInds])
                    nodeStack.append((childId, value))

                    if childId[-1] == 0:
                        childInds = validX[tempTestInds, node.getFeatureInd()] < node.getThreshold()
                    else:
                        childInds = validX[tempTestInds, node.getFeatureInd()] >= node.getThreshold()

                    child.setTestInds(tempTestInds[childInds])

    self.computeAlphas()
    self.prune()
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))

    if self.metric == "mrr":
        evaluationMethod = computeTestMRR
    elif self.metric == "f1":
        evaluationMethod = computeTestF1
    else:
        raise ValueError("Invalid metric: " + self.metric)

    logging.debug("Performing model selection")
    paramList = []

    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv, (trainX, testX) in enumerate(trainTestXs):
                learner = self.copy()
                learner.k = k
                learner.lmbda = lmbda

                paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        import itertools
        resultsIterator = itertools.imap(evaluationMethod, paramList)

    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv in range(len(trainTestXs)):
                testMetrics[i, j, icv] = resultsIterator.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 2)
    stdTestMetrics = numpy.std(testMetrics, 2)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[0]]
    self.lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))

    return meanTestMetrics, stdTestMetrics
def testBootstrap2(self):
    numExamples = 10
    folds = 2

    indices = Sampling.bootstrap2(folds, numExamples)

    for i in range(folds):
        self.assertEquals(indices[i][0].shape[0], numExamples)
        self.assertTrue(indices[i][1].shape[0] < numExamples)

    self.assertTrue((numpy.union1d(indices[0][0], indices[0][1]) == numpy.arange(numExamples)).all())
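# A sketch consistent with the assertions above (an assumption about the implementation): the
# training part of each split is a bootstrap sample of size numExamples drawn with replacement,
# and the test part is every index that was not drawn, so their union is always the full index
# set while the test part stays strictly smaller than numExamples.
import numpy

def bootstrap2Sketch(folds, numExamples):
    indexList = []

    for i in range(folds):
        trainInds = numpy.random.randint(0, numExamples, numExamples)
        testInds = numpy.setdiff1d(numpy.arange(numExamples), trainInds)
        indexList.append((trainInds, testInds))

    return indexList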
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
    #import itertools
    #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempAucs = resultsIterator.next()
            localAucs[i, :, icv] = tempAucs

    pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]]
    lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def generateLearner(self, X, y):
    """
    Train using the given examples and labels, and use model selection to
    find the best parameters.
    """
    if numpy.unique(y).shape[0] != 2:
        print(y)
        raise ValueError("Can only operate on binary data")

    #Do model selection first
    if self.sampleSize == None:
        idx = Sampling.crossValidation(self.folds, X.shape[0])
        learner, meanErrors = self.parallelModelSelect(X, y, idx, self.paramDict)
    else:
        idx = Sampling.crossValidation(self.folds, self.sampleSize)
        inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
        learner, meanErrors = self.parallelModelSelect(X[inds, :], y[inds], idx, self.paramDict)

    learner = self.getBestLearner(meanErrors, self.paramDict, X, y)

    return learner
def getDataset(dataset, nnz=20000):
    """
    Return a dataset by name
    """
    if dataset == "synthetic":
        X, U, V = DatasetUtils.syntheticDataset1()
    elif dataset == "synthetic2":
        X = DatasetUtils.syntheticDataset2()
    elif dataset == "movielens":
        X = DatasetUtils.movieLens()
    elif dataset == "epinions":
        X = DatasetUtils.epinions()
        X, userInds = Sampling.sampleUsers2(X, nnz, prune=True)
    elif dataset == "flixster":
        X = DatasetUtils.flixster()
        X, userInds = Sampling.sampleUsers2(X, nnz, prune=True)
    else:
        raise ValueError("Unknown dataset: " + dataset)

    return X
def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
    """
    Compute the cross validation according to a given metric.
    """
    Parameter.checkInt(folds, 2, float('inf'))
    idx = Sampling.crossValidation(folds, y.shape[0])
    metrics = AbstractPredictor.evaluateLearn(X, y, idx, self.learnModel, self.predict, metricMethod)

    mean = numpy.mean(metrics, 0)
    var = numpy.var(metrics, 0)

    return (mean, var)
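# A hypothetical usage sketch of evaluateCv: any predictor in this codebase exposing
# learnModel/predict can be scored with k-fold cross validation. The DecisionTree class and
# Evaluator.binaryError are names used elsewhere in this code; the generated data and the
# assumption that DecisionTree provides evaluateCv are purely illustrative.
import numpy
from sklearn import datasets

X, y = datasets.make_classification(n_samples=100, random_state=21)
y = numpy.array(2 * y - 1, numpy.int)   # map the 0/1 labels to -1/+1

learner = DecisionTree()
mean, var = learner.evaluateCv(X, y, folds=5, metricMethod=Evaluator.binaryError)
print(mean, var)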
def testParallelPenaltyGrid(self):
    folds = 3
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    randomForest = RandomForest()

    trainX = self.X[0:40, :]
    trainY = self.y[0:40]

    paramDict = {}
    paramDict["setMinSplit"] = randomForest.getMinSplits()
    paramDict["setMaxDepth"] = randomForest.getMaxDepths()

    idealPenalties = randomForest.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ext

    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    XY = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    X = XY[:, :-1]
    y = XY[:, -1]

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)
    preprocessSave(X, y, outputDir, idx)
def testParallelVfcvRbf2(self):
    #In this test we try SVM regression
    folds = 3
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")
    svm.setSvmType("Epsilon_SVR")
    bestSVM, meanErrors = svm.parallelVfcvRbf(self.X, self.y, idx, type="Epsilon_SVR")

    tol = 10**-6
    bestError = 100
    meanErrors2 = numpy.zeros((svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]
            for k in range(svm.epsilons.shape[0]):
                epsilon = svm.epsilons[k]

                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.setEpsilon(epsilon)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += svm.getMetricMethod()(predY, testY)

                meanErrors2[j, k, i] = error / len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error
                    bestEpsilon = epsilon

    self.assertEquals(bestC, bestSVM.getC())
    self.assertEquals(bestGamma, bestSVM.getGamma())
    self.assertEquals(bestEpsilon, bestSVM.getEpsilon())
    self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
def testParallelModelSelect(self):
    X = scipy.sparse.rand(10, 10, 0.5)
    X = X.tocsr()

    numExamples = X.getnnz()
    paramDict = {}
    paramDict["setRank"] = numpy.array([5, 10, 20])
    folds = 3
    idx = Sampling.randCrossValidation(folds, numExamples)

    method = "lsnmf"
    nimfaFactorise = NimfaFactorise(method)
    learner, meanErrors = nimfaFactorise.parallelModelSelect(X, idx, paramDict)
def testParallelPenaltyGrid(self):
    folds = 3
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    decisionTree = DecisionTree()
    bestLearner, meanErrors = decisionTree.parallelVfcv(self.X, self.y, idx)

    trainX = self.X[0:40, :]
    trainY = self.y[0:40]

    paramDict = {}
    paramDict["setMinSplit"] = decisionTree.getMinSplits()
    paramDict["setMaxDepth"] = decisionTree.getMaxDepths()

    idealPenalties = decisionTree.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            learner = self.copy()
            learner.k = k
            paramList.append((trainX, testX, testOmegaList, learner))

    #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
    import itertools
    resultsIterator = itertools.imap(computePrecision, paramList)

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempPrecision = resultsIterator.next()
            precisions[i, icv] = tempPrecision

    #pool.terminate()

    meanPrecisions = numpy.mean(precisions, 1)
    stdPrecisions = numpy.std(precisions, 1)

    logging.debug(meanPrecisions)

    k = self.ks[numpy.argmax(meanPrecisions)]

    logging.debug("Model parameters: k=" + str(k))

    self.k = k

    return meanPrecisions, stdPrecisions
def testModelSelect(self):
    lmbda = 0.1
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)

    U, s, V = numpy.linalg.svd(X.todense())

    k = 15
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=None, svdAlg="propack", updateAlg="zero")
    iterativeSoftImpute.numProcesses = 1
    rhos = numpy.linspace(0.5, 0.001, 20)
    ks = numpy.array([k], numpy.int)
    folds = 3
    cvInds = Sampling.randCrossValidation(folds, X.nnz)
    meanTestErrors, meanTrainErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)

    #Now do model selection manually
    (rowInds, colInds) = X.nonzero()
    trainErrors = numpy.zeros((rhos.shape[0], len(cvInds)))
    testErrors = numpy.zeros((rhos.shape[0], len(cvInds)))

    for i, rho in enumerate(rhos):
        for j, (trainInds, testInds) in enumerate(cvInds):
            trainX = scipy.sparse.csc_matrix(X.shape)
            testX = scipy.sparse.csc_matrix(X.shape)

            for p in trainInds:
                trainX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]

            for p in testInds:
                testX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]

            softImpute = SoftImpute(numpy.array([rho]), k=ks[0])
            ZList = [softImpute.learnModel(trainX, fullMatrices=False)]

            predTrainX = softImpute.predict(ZList, trainX.nonzero())[0]
            predX = softImpute.predict(ZList, testX.nonzero())[0]

            testErrors[i, j] = MCEvaluator.rootMeanSqError(testX, predX)
            trainErrors[i, j] = MCEvaluator.rootMeanSqError(trainX, predTrainX)

    meanTestErrors2 = testErrors.mean(1)
    meanTrainErrors2 = trainErrors.mean(1)

    nptst.assert_array_almost_equal(meanTestErrors.ravel(), meanTestErrors2, 1)
def testParallelModelSelect(self):
    folds = 3
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm = self.svm
    svm.setKernel("gaussian")

    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()

    bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx, paramDict)

    tol = 10**-6
    bestError = 1
    meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

    print("Computing real grid")
    for i in range(svm.Cs.shape[0]):
        C = svm.Cs[i]
        for j in range(svm.gammas.shape[0]):
            gamma = svm.gammas[j]

            error = 0
            for trainInds, testInds in idx:
                trainX = self.X[trainInds, :]
                trainY = self.y[trainInds]
                testX = self.X[testInds, :]
                testY = self.y[testInds]

                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(testX)
                error += Evaluator.binaryError(predY, testY)

            meanErrors2[i, j] = error / len(idx)

            if error < bestError:
                bestC = C
                bestGamma = gamma
                bestError = error

    self.assertEquals(bestC, bestSVM.getC())
    self.assertEquals(bestGamma, bestSVM.getGamma())
    self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
def testGetBestLearner(self):
    svm = self.svm
    paramDict = {}
    paramDict["setC"] = svm.getCs()
    paramDict["setGamma"] = svm.getGammas()

    errors = numpy.random.rand(svm.getCs().shape[0], svm.getGammas().shape[0])

    folds = 5
    idx = Sampling.crossValidation(folds, self.X.shape[0])
    svm.normModelSelect = True
    svm.setKernel("gaussian")

    learner = svm.getBestLearner(errors, paramDict, self.X, self.y, idx)
    bestC = learner.getC()

    #Find the best norm
    bestInds = numpy.unravel_index(numpy.argmin(errors), errors.shape)
    learner.setC(svm.getCs()[bestInds[0]])
    learner.setGamma(svm.getGammas()[bestInds[1]])

    norms = []
    for trainInds, testInds in idx:
        validX = self.X[trainInds, :]
        validY = self.y[trainInds]
        learner.learnModel(validX, validY)
        norms.append(learner.weightNorm())

    bestNorm = numpy.array(norms).mean()

    norms = numpy.zeros(paramDict["setC"].shape[0])
    for i, C in enumerate(paramDict["setC"]):
        learner.setC(C)
        learner.learnModel(self.X, self.y)
        norms[i] = learner.weightNorm()

    bestC2 = paramDict["setC"][numpy.abs(norms - bestNorm).argmin()]

    self.assertEquals(bestC, bestC2)
def profileLearnModel2(self):
    #Profile stochastic case
    #X = DatasetUtils.flixster()
    #X = Sampling.sampleUsers(X, 1000)
    X, U, V = DatasetUtils.syntheticDataset1(u=0.001, m=10000, n=1000)

    rho = 0.00
    u = 0.2
    w = 1 - u
    eps = 10**-6
    alpha = 0.5
    k = self.k
    maxLocalAuc = MaxLocalAUC(k, w, alpha=alpha, eps=eps, stochastic=True)
    maxLocalAuc.numRowSamples = 2
    maxLocalAuc.numAucSamples = 10
    maxLocalAuc.maxIterations = 1
    maxLocalAuc.numRecordAucSamples = 100
    maxLocalAuc.recordStep = 10
    maxLocalAuc.initialAlg = "rand"
    maxLocalAuc.rate = "optimal"
    #maxLocalAuc.parallelSGD = True

    trainTestX = Sampling.shuffleSplitRows(X, maxLocalAuc.folds, 5)
    trainX, testX = trainTestX[0]

    def run():
        U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(trainX, True)
        #logging.debug("Train Precision@5=" + str(MCEvaluator.precisionAtK(trainX, U, V, 5)))
        #logging.debug("Train Precision@10=" + str(MCEvaluator.precisionAtK(trainX, U, V, 10)))
        #logging.debug("Train Precision@20=" + str(MCEvaluator.precisionAtK(trainX, U, V, 20)))
        #logging.debug("Train Precision@50=" + str(MCEvaluator.precisionAtK(trainX, U, V, 50)))
        #logging.debug("Test Precision@5=" + str(MCEvaluator.precisionAtK(testX, U, V, 5)))
        #logging.debug("Test Precision@10=" + str(MCEvaluator.precisionAtK(testX, U, V, 10)))
        #logging.debug("Test Precision@20=" + str(MCEvaluator.precisionAtK(testX, U, V, 20)))
        #logging.debug("Test Precision@50=" + str(MCEvaluator.precisionAtK(testX, U, V, 50)))

    ProfileUtils.profile('run()', globals(), locals())
def processParkinsonsDataset(name, numRealisations):
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    inds = list(set(range(XY.shape[1])) - set([5, 6]))
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]
    #We don't keep whole collections of patients
    split = 0.5

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    X[data[:, 0] - 1, data[:, 1] - 1] = numpy.array(data[:, 2] > 3, numpy.int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    u = 0.1
    w = 1 - u

    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
    """
    Choose parameters based on a single matrix X. We do cross validation within,
    and set parameters according to the mean squared error. Return nothing.
    """
    logging.debug("Performing model selection")

    # useful
    X = X.tocoo()
    gc.collect()
    nK = len(ks)
    nLmbda = len(lmbdas)
    nGamma = len(gammas)
    nLG = nLmbda * nGamma
    errors = scipy.zeros((nK, nLmbda, nGamma, nFolds))

    # generate cross validation sets
    cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

    # compute error for each fold / setting
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, nFolds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        nptst.assert_array_almost_equal((testX + trainX).data, X.data)

        paramList = []
        for ik, k in enumerate(ks):
            for ilmbda, lmbda in enumerate(lmbdas):
                for igamma, gamma in enumerate(gammas):
                    paramList.append((trainX, testX, k, lmbda, gamma, maxNTry))

        # ! Remark !
        # we can easily parallelise the run over parameters; parallelising over
        # cv-folds is not done as it is much more memory-consuming

        # parallel version (copied from IterativeSoftImpute, but not tested)
        #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
        #results = pool.imap(self.learnPredict, paramList)
        #pool.terminate()

        # non-parallel version
        results = scipy.array(list(itertools.starmap(self.learnPredict, paramList)))

        errors[:, :, :, icv] = scipy.array(results).reshape((nK, nLmbda, nGamma))

    # compute cross validation error for each setting
    errors[errors == float("inf")] = errors[errors != float("inf")].max()
    errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
    meanErrors = errors.mean(3)
    stdErrors = errors.std(3)

    logging.debug("Mean errors given (k, lambda, gamma):")
    logging.debug(meanErrors)
    logging.debug("... with standard deviation:")
    logging.debug(stdErrors)

    # keep the best
    iMin = meanErrors.argmin()
    kMin = ks[int(scipy.floor(iMin / (nLG)))]
    lmbdaMin = lmbdas[int(scipy.floor((iMin % nLG) / nGamma))]
    gammaMin = gammas[int(scipy.floor(iMin % nGamma))]

    logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " + str(lmbdaMin) + ", " + str(gammaMin) + ")")
    logging.debug("min = " + str(meanErrors[int(scipy.floor(iMin / (nLG))), int(scipy.floor((iMin % nLG) / nGamma)), int(scipy.floor(iMin % nGamma))]))

    self.baseLearner.k = kMin
    self.baseLearner.lmbda = lmbdaMin
    self.baseLearner.gamma = gammaMin

    return
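# The manual floor/modulo decoding of iMin above is, for a C-ordered (nK, nLmbda, nGamma) grid,
# the same computation numpy.unravel_index performs; a small self-contained check:
import numpy

meanErrors = numpy.random.rand(3, 4, 5)    # shape (nK, nLmbda, nGamma)
nK, nLmbda, nGamma = meanErrors.shape
iMin = meanErrors.argmin()

ik, ilmbda, igamma = numpy.unravel_index(iMin, meanErrors.shape)
assert ik == iMin // (nLmbda * nGamma)
assert ilmbda == (iMin % (nLmbda * nGamma)) // nGamma
assert igamma == iMin % nGamma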
def learningRate(self, X, y, foldsSet, paramDict):
    """
    Find a matrix beta which has the same dimensions as the parameter grid.
    Each value in the grid represents the learning rate with respect to
    those particular parameters.

    :param X: The examples as rows
    :type X: :class:`numpy.ndarray`

    :param y: The binary -1/+1 labels
    :type y: :class:`numpy.ndarray`

    :param foldsSet: A list of folds to try.

    :param paramDict: A dictionary indexed by the method name and with value as an array of values
    :type paramDict: :class:`dict`
    """
    try:
        from sklearn import linear_model
    except ImportError:
        raise

    gridSize = []
    gridInds = []

    for key in paramDict.keys():
        gridSize.append(paramDict[key].shape[0])
        gridInds.append(numpy.arange(paramDict[key].shape[0]))

    betaGrid = numpy.ones(tuple(gridSize))

    gridSize.insert(0, foldsSet.shape[0])
    penalties = numpy.zeros(tuple(gridSize))
    Cvs = numpy.array([1])

    for i in range(foldsSet.shape[0]):
        folds = foldsSet[i]
        logging.debug("Folds " + str(folds))

        idx = Sampling.crossValidation(folds, X.shape[0])
        resultsList = self.parallelPen(X, y, idx, paramDict, Cvs)
        bestLearner, trainErrors, currentPenalties = resultsList[0]
        penalties[i, :] = currentPenalties

    indexIter = itertools.product(*gridInds)

    for inds in indexIter:
        inds2 = [slice(0, penalties.shape[0])]
        inds2.extend(inds)
        inds2 = tuple(inds2)
        tempPenalties = penalties[inds2]

        penInds = numpy.logical_and(numpy.isfinite(tempPenalties), tempPenalties > 0)
        penInds = numpy.squeeze(penInds)
        tempPenalties = tempPenalties[penInds].flatten()
        tempfoldsSet = numpy.array(foldsSet, numpy.float)[penInds]

        if tempPenalties.shape[0] > 1:
            xp = numpy.log((tempfoldsSet - 1) / tempfoldsSet * X.shape[0])
            yp = numpy.log(tempPenalties) + numpy.log(tempfoldsSet)

            clf = linear_model.LinearRegression()
            clf.fit(numpy.array([xp]).T, yp)
            betaGrid[inds] = clf.coef_[0]

    return -betaGrid
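# A small synthetic check of the regression performed above (an illustration, not part of the
# library): if the penalty decays as penalty = A * mTrain**(-beta) / folds with
# mTrain = (folds - 1) / folds * numExamples, then regressing log(penalty) + log(folds) on
# log(mTrain) has slope -beta, so learningRate returns approximately beta.
import numpy
from sklearn import linear_model

numExamples = 1000
beta = 0.7
A = 5.0
foldsSet = numpy.array([2.0, 3.0, 5.0, 8.0, 10.0])

mTrain = (foldsSet - 1) / foldsSet * numExamples
penalties = A * mTrain**(-beta) / foldsSet

xp = numpy.log(mTrain)
yp = numpy.log(penalties) + numpy.log(foldsSet)

clf = linear_model.LinearRegression()
clf.fit(numpy.array([xp]).T, yp)
print(-clf.coef_[0])    # approximately 0.7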
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=True, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdaUsers.shape[0], self.lmbdaItems.shape[0], self.gammas.shape[0], len(trainTestXs)))

    logging.debug("Performing model selection with test leave out per row of " + str(self.validationSize))
    paramList = []

    for i, k in enumerate(self.ks):
        for j, lmbdaUser in enumerate(self.lmbdaUsers):
            for s, lmbdaItem in enumerate(self.lmbdaItems):
                for t, gamma in enumerate(self.gammas):
                    for icv, (trainX, testX) in enumerate(trainTestXs):
                        learner = self.copy()
                        learner.k = k
                        learner.lmbdaUser = lmbdaUser
                        learner.lmbdaPos = lmbdaItem
                        learner.lmbdaNeg = lmbdaItem
                        learner.gamma = gamma

                        paramList.append((trainX, testX, learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
    else:
        import itertools
        resultsIterator = itertools.imap(computeTestF1, paramList)

    for i, k in enumerate(self.ks):
        for j, lmbdaUser in enumerate(self.lmbdaUsers):
            for s, lmbdaItem in enumerate(self.lmbdaItems):
                for t, gamma in enumerate(self.gammas):
                    for icv, (trainX, testX) in enumerate(trainTestXs):
                        testMetrics[i, j, s, t, icv] = resultsIterator.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 4)
    stdTestMetrics = numpy.std(testMetrics, 4)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdaUsers=" + str(self.lmbdaUsers))
    logging.debug("lmbdaItems=" + str(self.lmbdaItems))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    indK, indLmbdaUser, indLmbdaItem, indGamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)

    self.k = self.ks[indK]
    self.lmbdaUser = self.lmbdaUsers[indLmbdaUser]
    self.lmbdaPos = self.lmbdaItems[indLmbdaItem]
    self.lmbdaNeg = self.lmbdaItems[indLmbdaItem]
    self.gamma = self.gammas[indGamma]

    logging.debug("Model parameters: " + str(self))

    return meanTestMetrics, stdTestMetrics