def testAverageRocCurve(self):
    m = 50
    n = 20
    k = 8
    u = 20.0 / m
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    matplotlib.use("GTK3Agg")
    import matplotlib.pyplot as plt
    #plt.plot(fpr, tpr)
    #plt.show()

    #Now try case where we have a training set
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]

    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
def parallelGridSearch(self, X, paramDict, evaluationMethod, testX=None, minVal=True):
    """
    Perform parallel model selection using any learner.
    """
    logging.debug("Parallel grid search with params: " + str(paramDict))
    m, n = X.shape

    if testX is None:
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize)
    else:
        trainTestXs = [[X, testX]]

    gridSize = []
    gridInds = []

    for key in paramDict.keys():
        gridSize.append(paramDict[key].shape[0])
        gridInds.append(numpy.arange(paramDict[key].shape[0]))

    meanMetrics = numpy.zeros(tuple(gridSize))
    paramList = []

    for icv, (trainX, testX) in enumerate(trainTestXs):
        indexIter = itertools.product(*gridInds)

        for inds in indexIter:
            learner = self.copy()

            for i, (key, val) in enumerate(paramDict.items()):
                setattr(learner, key, val[inds[i]])

            paramList.append((trainX, testX, learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        resultsIterator = itertools.imap(evaluationMethod, paramList)

    for icv, (trainX, testX) in enumerate(trainTestXs):
        indexIter = itertools.product(*gridInds)

        for inds in indexIter:
            metric = resultsIterator.next()
            meanMetrics[inds] += metric / float(self.folds)

    if self.numProcesses != 1:
        pool.terminate()

    resultDict, bestMetric = self.setBestLearner(meanMetrics, paramDict, minVal)

    return meanMetrics
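# Usage sketch (assumptions labelled, not a definitive interface): parallelGridSearch expects
# paramDict to map learner attribute names to numpy arrays of candidate values, and
# evaluationMethod to accept the (trainX, testX, learner) tuples built above -- computeTestF1
# elsewhere in this section has that form. The wrapper name, learner attributes and candidate
# values below are illustrative only.
def gridSearchExample(learner, X):
    paramDict = {}
    paramDict["k"] = numpy.array([8, 16, 32])            # candidate ranks (illustrative)
    paramDict["lmbdaV"] = 2.0**-numpy.arange(1, 4)       # candidate regularisation values (illustrative)
    # minVal=False since computeTestF1 is a score to maximise rather than an error to minimise
    meanMetrics = learner.parallelGridSearch(X, paramDict, computeTestF1, minVal=False)
    return meanMetrics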
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))

    if self.metric == "mrr":
        evaluationMethod = computeTestMRR
    elif self.metric == "f1":
        evaluationMethod = computeTestF1
    else:
        raise ValueError("Invalid metric: " + self.metric)

    logging.debug("Performing model selection")
    paramList = []

    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv, (trainX, testX) in enumerate(trainTestXs):
                learner = self.copy()
                learner.k = k
                learner.lmbda = lmbda

                paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
    else:
        import itertools
        resultsIterator = itertools.imap(evaluationMethod, paramList)

    for i, k in enumerate(self.ks):
        for j, lmbda in enumerate(self.lmbdas):
            for icv in range(len(trainTestXs)):
                testMetrics[i, j, icv] = resultsIterator.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 2)
    stdTestMetrics = numpy.std(testMetrics, 2)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[0]]
    self.lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))

    return meanTestMetrics, stdTestMetrics
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data"
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    X[data[:, 0]-1, data[:, 1]-1] = numpy.array(data[:, 2] > 3, numpy.int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))

    u = 0.1
    w = 1 - u
    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
def profileLearnModel2(self):
    #Profile stochastic case
    #X = DatasetUtils.flixster()
    #X = Sampling.sampleUsers(X, 1000)
    X, U, V = DatasetUtils.syntheticDataset1(u=0.001, m=10000, n=1000)

    rho = 0.00
    u = 0.2
    w = 1 - u
    eps = 10**-6
    alpha = 0.5
    k = self.k
    maxLocalAuc = MaxLocalAUC(k, w, alpha=alpha, eps=eps, stochastic=True)
    maxLocalAuc.numRowSamples = 2
    maxLocalAuc.numAucSamples = 10
    maxLocalAuc.maxIterations = 1
    maxLocalAuc.numRecordAucSamples = 100
    maxLocalAuc.recordStep = 10
    maxLocalAuc.initialAlg = "rand"
    maxLocalAuc.rate = "optimal"
    #maxLocalAuc.parallelSGD = True

    trainTestX = Sampling.shuffleSplitRows(X, maxLocalAuc.folds, 5)
    trainX, testX = trainTestX[0]

    def run():
        U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(trainX, True)
        #logging.debug("Train Precision@5=" + str(MCEvaluator.precisionAtK(trainX, U, V, 5)))
        #logging.debug("Train Precision@10=" + str(MCEvaluator.precisionAtK(trainX, U, V, 10)))
        #logging.debug("Train Precision@20=" + str(MCEvaluator.precisionAtK(trainX, U, V, 20)))
        #logging.debug("Train Precision@50=" + str(MCEvaluator.precisionAtK(trainX, U, V, 50)))
        #logging.debug("Test Precision@5=" + str(MCEvaluator.precisionAtK(testX, U, V, 5)))
        #logging.debug("Test Precision@10=" + str(MCEvaluator.precisionAtK(testX, U, V, 10)))
        #logging.debug("Test Precision@20=" + str(MCEvaluator.precisionAtK(testX, U, V, 20)))
        #logging.debug("Test Precision@50=" + str(MCEvaluator.precisionAtK(testX, U, V, 50)))

    ProfileUtils.profile('run()', globals(), locals())
def testShuffleSplitRows(self):
    m = 10
    n = 16
    k = 5
    u = 0.5
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)
    #print(X.toarray())

    k2 = 5
    testSize = 2
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=True)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]

        self.assertEquals(trainX.storagetype, "row")
        self.assertEquals(testX.storagetype, "row")

        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=False)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]

        self.assertEquals(trainX.storagetype, "col")
        self.assertEquals(testX.storagetype, "col")

        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, csarray=False)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]

        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(numpy.ravel(testX.sum(1)), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

    testSize = 0
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]

        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        self.assertEquals(testX.nnz, 0)

    #Test sampling a subset of the rows
    testSize = 2
    numRows = 5
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=numRows, rowMajor=False)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]

        nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
        self.assertEquals(numpy.nonzero(testX.sum(1))[0].shape[0], numRows)
        self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        self.assertEquals(testX.nnz, testSize*numRows)

    #Make sure column probabilities are correct
    w = 0.0
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)
    testSize = 5
    k2 = 500
    colProbs = numpy.arange(0, n, dtype=numpy.float) + 1
    colProbs /= colProbs.sum()

    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
    colProbs2 = numpy.zeros(n)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        colProbs2 += testX.sum(0)

    colProbs2 /= colProbs2.sum()
    nptst.assert_array_almost_equal(colProbs, colProbs2, 2)

    #Now test when probabilities are uniform
    colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
    colProbs = None
    trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)

    colProbs2 = numpy.zeros(n)
    colProbs3 = numpy.zeros(n)

    for i in range(k2):
        trainX = trainTestXs[i][0]
        testX = trainTestXs[i][1]
        colProbs2 += testX.sum(0)

        trainX = trainTestXs2[i][0]
        testX = trainTestXs2[i][1]
        colProbs3 += testX.sum(0)

    colProbs2 /= colProbs2.sum()
    colProbs3 /= colProbs3.sum()
    nptst.assert_array_almost_equal(colProbs2, colProbs3, 2)

    #Test when numRows=m
    numpy.random.seed(21)
    trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=m)

    numpy.random.seed(21)
    trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize)

    nptst.assert_array_equal(trainTestXs[0][0].toarray(), trainTestXs2[0][0].toarray())
    nptst.assert_array_equal(trainTestXs[0][1].toarray(), trainTestXs2[0][1].toarray())
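# The assertions above pin down the contract the rest of this section relies on. Summarised
# here as a hedged sketch rather than a definitive spec: shuffleSplitRows(X, folds, testSize, ...)
# returns a list of (trainX, testX) pairs that partition the non-zeros of X, with testSize
# held-out entries per sampled row; rowMajor/csarray control the output format and colProbs
# biases which columns are held out. The helper name below is illustrative only.
def heldOutPairs(X, folds=5, testSize=2):
    for trainX, testX in Sampling.shuffleSplitRows(X, folds, testSize):
        assert trainX.nnz + testX.nnz == X.nnz   # the split partitions the non-zeros of X
        yield trainX, testX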
def parallelLearnModel(self, X, verbose=False, U=None, V=None):
    """
    Max local AUC with Frobenius norm penalty on V. Solve with parallel
    (stochastic) gradient descent. The input is a sparse array.
    """
    #Convert to a csarray for faster access
    if scipy.sparse.issparse(X):
        logging.debug("Converting to csarray")
        X2 = sppy.csarray(X, storagetype="row")
        X = X2

    m, n = X.shape

    #We keep a validation set in order to determine when to stop
    if self.validationUsers != 0:
        numValidationUsers = int(m * self.validationUsers)
        trainX, testX, rowSamples = Sampling.shuffleSplitRows(X, 1, self.validationSize, numRows=numValidationUsers)[0]
        testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)
    else:
        trainX = X
        testX = None
        rowSamples = None
        testIndPtr, testColInds = None, None

    #Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
    indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
    allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

    if U is None or V is None:
        U, V = self.initUV(trainX)

    if self.metric == "f1":
        metricInd = 2
    elif self.metric == "mrr":
        metricInd = 3
    else:
        raise ValueError("Unknown metric: " + self.metric)

    bestMetric = 0
    bestU = 0
    bestV = 0
    trainMeasures = []
    testMeasures = []
    loopInd = 0
    lastObj = 0
    currentObj = lastObj - 2 * self.eps
    numBlocks = self.numProcesses + 1

    gi, gp, gq = self.computeGipq(X)
    normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

    #Some shared variables
    rowIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)
    colIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)

    #Create shared factors
    U2 = sharedmem.zeros((m, self.k))
    V2 = sharedmem.zeros((n, self.k))
    muU2 = sharedmem.zeros((m, self.k))
    muV2 = sharedmem.zeros((n, self.k))

    U2[:] = U[:]
    V2[:] = V[:]
    muU2[:] = U[:]
    muV2[:] = V[:]

    del U, V

    rowBlockSize = int(numpy.ceil(float(m) / numBlocks))
    colBlockSize = int(numpy.ceil(float(n) / numBlocks))

    lock = multiprocessing.Lock()
    startTime = time.time()
    loopInd = 0
    iterationsPerBlock = sharedmem.zeros((numBlocks, numBlocks))

    self.learnerCython = self.getCythonLearner()
    nextRecord = 0

    while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
        if loopInd >= nextRecord:
            if loopInd != 0:
                print("")

            printStr = self.recordResults(muU2, muV2, trainMeasures, testMeasures, loopInd, rowSamples, indPtr, colInds, testIndPtr, testColInds, allIndPtr, allColInds, gi, gp, gq, trainX, startTime)
            logging.debug(printStr)

            if testIndPtr is not None and testMeasures[-1][metricInd] >= bestMetric:
                bestMetric = testMeasures[-1][metricInd]
                bestU = muU2.copy()
                bestV = muV2.copy()
            elif testIndPtr is None:
                bestU = muU2.copy()
                bestV = muV2.copy()

            #Compute objective averaged over last 5 recorded steps
            trainMeasuresArr = numpy.array(trainMeasures)
            lastObj = currentObj
            currentObj = numpy.mean(trainMeasuresArr[-5:, 0])

            nextRecord += self.recordStep

        iterationsPerBlock = sharedmem.zeros((numBlocks, numBlocks))
        self.parallelUpdateUV(X, U2, V2, muU2, muV2, numBlocks, rowBlockSize, colBlockSize, rowIsFree, colIsFree, indPtr, colInds, lock, gi, gp, gq, normGp, normGq, iterationsPerBlock, loopInd)
        loopInd += numpy.floor(iterationsPerBlock.mean())

    #Compute quantities for last U and V
    print("")
    totalTime = time.time() - startTime
    printStr = "Finished, time=" + str('%.1f' % totalTime) + " "
    printStr += self.recordResults(muU2, muV2, trainMeasures, testMeasures, loopInd, rowSamples, indPtr, colInds, testIndPtr, testColInds, allIndPtr, allColInds, gi, gp, gq, trainX, startTime)
    printStr += " delta obj=" + "%.3e" % abs(lastObj - currentObj)
    logging.debug(printStr)

    self.U = bestU
    self.V = bestV
    self.gi = gi
    self.gp = gp
    self.gq = gq

    if verbose:
        return self.U, self.V, numpy.array(trainMeasures), numpy.array(testMeasures), loopInd, totalTime
    else:
        return self.U, self.V
def singleLearnModel(self, X, verbose=False, U=None, V=None):
    """
    Max local AUC with Frobenius norm penalty on V. Solve with (stochastic)
    gradient descent. The input is a sparse array.
    """
    #Convert to a csarray for faster access
    if scipy.sparse.issparse(X):
        logging.debug("Converting to csarray")
        X2 = sppy.csarray(X, storagetype="row")
        X = X2

    m, n = X.shape

    #We keep a validation set in order to determine when to stop
    if self.validationUsers != 0:
        numValidationUsers = int(m * self.validationUsers)
        trainX, testX, rowSamples = Sampling.shuffleSplitRows(X, 1, self.validationSize, numRows=numValidationUsers)[0]
        testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)

        logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
        logging.debug("Validation X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))
    else:
        trainX = X
        testX = None
        rowSamples = None
        testIndPtr, testColInds = None, None

    #Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
    indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
    allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

    if type(U) != numpy.ndarray and type(V) != numpy.ndarray:
        U, V = self.initUV(trainX)

    if self.metric == "f1":
        metricInd = 2
    elif self.metric == "mrr":
        metricInd = 3
    else:
        raise ValueError("Unknown metric: " + self.metric)

    muU = U.copy()
    muV = V.copy()

    bestMetric = 0
    bestU = 0
    bestV = 0
    trainMeasures = []
    testMeasures = []
    loopInd = 0
    lastObj = 0
    currentObj = lastObj - 2 * self.eps

    #Try alternative number of iterations
    #numIterations = trainX.nnz/self.numAucSamples
    numIterations = max(m, n)

    self.learnerCython = self.getCythonLearner()

    #Set up order of indices for stochastic methods
    permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

    startTime = time.time()

    gi, gp, gq = self.computeGipq(X)
    normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

    while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
        sigmaU = self.getSigma(loopInd, self.alpha, m)
        sigmaV = self.getSigma(loopInd, self.alpha, m)

        if loopInd % self.recordStep == 0:
            if loopInd != 0 and self.stochastic:
                print("")

            printStr = self.recordResults(muU, muV, trainMeasures, testMeasures, loopInd, rowSamples, indPtr, colInds, testIndPtr, testColInds, allIndPtr, allColInds, gi, gp, gq, trainX, startTime)
            logging.debug(printStr)

            if testIndPtr is not None and testMeasures[-1][metricInd] >= bestMetric:
                bestMetric = testMeasures[-1][metricInd]
                logging.debug("Current best metric=" + str(bestMetric))
                bestU = muU.copy()
                bestV = muV.copy()
            elif testIndPtr is None:
                bestU = muU.copy()
                bestV = muV.copy()

            #Compute objective averaged over last 5 recorded steps
            trainMeasuresArr = numpy.array(trainMeasures)
            lastObj = currentObj
            currentObj = numpy.mean(trainMeasuresArr[-5:, 0])

        U = numpy.ascontiguousarray(U)

        self.updateUV(indPtr, colInds, U, V, muU, muV, permutedRowInds, permutedColInds, gp, gq, normGp, normGq, loopInd, sigmaU, sigmaV, numIterations)
        loopInd += 1

    #Compute quantities for last U and V
    totalTime = time.time() - startTime
    printStr = "\nFinished, time=" + str('%.1f' % totalTime) + " "
    printStr += self.recordResults(muU, muV, trainMeasures, testMeasures, loopInd, rowSamples, indPtr, colInds, testIndPtr, testColInds, allIndPtr, allColInds, gi, gp, gq, trainX, startTime)
    printStr += " delta obj=" + "%.3e" % abs(lastObj - currentObj)
    logging.debug(printStr)

    self.U = bestU
    self.V = bestV
    self.gi = gi
    self.gp = gp
    self.gq = gq

    trainMeasures = numpy.array(trainMeasures)
    testMeasures = numpy.array(testMeasures)

    if verbose:
        return self.U, self.V, trainMeasures, testMeasures, loopInd, totalTime
    else:
        return self.U, self.V
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=True, colProbs=colProbs)
    testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdaUsers.shape[0], self.lmbdaItems.shape[0], self.gammas.shape[0], len(trainTestXs)))

    logging.debug("Performing model selection with test leave out per row of " + str(self.validationSize))
    paramList = []

    for i, k in enumerate(self.ks):
        for j, lmbdaUser in enumerate(self.lmbdaUsers):
            for s, lmbdaItem in enumerate(self.lmbdaItems):
                for t, gamma in enumerate(self.gammas):
                    for icv, (trainX, testX) in enumerate(trainTestXs):
                        learner = self.copy()
                        learner.k = k
                        learner.lmbdaUser = lmbdaUser
                        learner.lmbdaPos = lmbdaItem
                        learner.lmbdaNeg = lmbdaItem
                        learner.gamma = gamma

                        paramList.append((trainX, testX, learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
    else:
        import itertools
        resultsIterator = itertools.imap(computeTestF1, paramList)

    for i, k in enumerate(self.ks):
        for j, lmbdaUser in enumerate(self.lmbdaUsers):
            for s, lmbdaPos in enumerate(self.lmbdaItems):
                for t, gamma in enumerate(self.gammas):
                    for icv, (trainX, testX) in enumerate(trainTestXs):
                        testMetrics[i, j, s, t, icv] = resultsIterator.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 4)
    stdTestMetrics = numpy.std(testMetrics, 4)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdaUsers=" + str(self.lmbdaUsers))
    logging.debug("lmbdaItems=" + str(self.lmbdaItems))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    indK, indLmbdaUser, indLmbdaItem, indGamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
    self.k = self.ks[indK]
    self.lmbdaUser = self.lmbdaUsers[indLmbdaUser]
    self.lmbdaPos = self.lmbdaItems[indLmbdaItem]
    self.lmbdaNeg = self.lmbdaItems[indLmbdaItem]
    self.gamma = self.gammas[indGamma]

    logging.debug("Model parameters: " + str(self))

    return meanTestMetrics, stdTestMetrics
def runExperiment(self, X):
    """
    Run the selected ranking experiments and save results
    """
    logging.debug("Splitting into train and test sets")
    #Make sure different runs get the same train/test split
    numpy.random.seed(21)
    m, n = X.shape
    #colProbs = (X.sum(0)+1)/float(m+1)
    #colProbs = colProbs**-self.algoArgs.itemExp
    #colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
    trainX, testX = trainTestXs[0]
    logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
    logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

    #Have scipy versions of each array
    trainXScipy = trainX.toScipyCsc()
    testXScipy = testX.toScipyCsc()

    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")
        resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
            modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

            try:
                learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                learner.folds = self.algoArgs.folds
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                    meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runMaxLocalAuc:
        logging.debug("Running max local AUC")

        if self.algoArgs.loss != "tanh":
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
        else:
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                learner.alpha = self.algoArgs.alpha
                learner.alphas = self.algoArgs.alphas
                learner.eta = self.algoArgs.eta
                learner.folds = self.algoArgs.folds
                learner.initialAlg = self.algoArgs.initialAlg
                learner.itemExpP = self.algoArgs.itemExpP
                learner.itemExpQ = self.algoArgs.itemExpQ
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasMlauc
                learner.loss = self.algoArgs.loss
                learner.maxIterations = self.algoArgs.maxIterations
                learner.maxNorms = self.algoArgs.maxNorms
                learner.maxNormU = self.algoArgs.maxNorm
                learner.maxNormV = self.algoArgs.maxNorm
                learner.metric = self.algoArgs.metric
                learner.normalise = self.algoArgs.normalise
                learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.numRowSamples = self.algoArgs.numRowSamples
                learner.rate = self.algoArgs.rate
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.rho = self.algoArgs.rhoMlauc
                learner.rhos = self.algoArgs.rhosMlauc
                learner.startAverage = self.algoArgs.startAverage
                learner.t0 = self.algoArgs.t0
                learner.t0s = self.algoArgs.t0s
                learner.validationSize = self.algoArgs.validationSize
                learner.validationUsers = self.algoArgs.validationUsers

                modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")

                if self.algoArgs.modelSelect and not os.path.isfile(modelSelectFileName):
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    #meanMetricsLR is saved and reloaded below, so the learning rate selection must run here
                    meanMetricsLR, paramDictLR = learner.learningRateSelect(modelSelectX)
                    meanMetricsMS, paramDictMS = learner.modelSelectLmbda(modelSelectX)

                    numpy.savez(modelSelectFileName, meanMetricsLR, meanMetricsMS)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)
                elif self.algoArgs.modelSelect:
                    data = numpy.load(modelSelectFileName)
                    logging.debug("Read model selection file " + modelSelectFileName)
                    meanMetricsLR = data["arr_0"]
                    meanMetricsMS = data["arr_1"]

                    learner.learningRateSelect(meanMetrics=meanMetricsLR)
                    learner.modelSelectLmbda(meanMetrics=meanMetricsMS)

                #Turn on (optionally) parallel SGD only at the final learning stage
                learner.parallelSGD = self.algoArgs.parallelSGD
                learner.maxIterations *= 2
                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWarpMf:
        logging.debug("Running WARP loss MF")
        resultsFileName = self.resultsDir + "ResultsWarpMf.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = WarpMf(self.algoArgs.k, self.algoArgs.lmbdas[0], u=self.algoArgs.u)
                learner.ks = self.algoArgs.ks
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    logging.debug("Mean local AUCs = " + str(meanAucs))
                    logging.debug("Std local AUCs = " + str(stdAucs))

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWrMf:
        logging.debug("Running Weighted Regularized Matrix Factorization")
        resultsFileName = self.resultsDir + "ResultsWrMf.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            trainXScipy = trainXScipy.tocsr()
            testXScipy = testXScipy.tocsr()

            try:
                learner = WeightedMf(self.algoArgs.k, alpha=self.algoArgs.alphaWrMf, lmbda=self.algoArgs.lmbdasWrMf[0], maxIterations=self.algoArgs.maxIterationsWrMf)
                learner.folds = self.algoArgs.folds
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasWrMf
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runBpr:
        logging.debug("Running Bayesian Personalised Recommendation")
        resultsFileName = self.resultsDir + "ResultsBpr.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                #trainX = trainX.toScipyCsr()
                #testX = testX.toScipyCsr()

                learner = BprRecommender(self.algoArgs.k, lmbdaUser=self.algoArgs.lmbdaUserBpr, lmbdaPos=self.algoArgs.lmbdaItemBpr, lmbdaNeg=self.algoArgs.lmbdaItemBpr, gamma=self.algoArgs.gammaBpr)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasBpr
                learner.ks = self.algoArgs.ks
                learner.lmbdaItems = self.algoArgs.lmbdaItems
                learner.lmbdaUsers = self.algoArgs.lmbdaUsers
                learner.maxIterations = self.algoArgs.maxIterationsBpr
                learner.metric = self.algoArgs.metric
                #learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runKnn:
        logging.debug("Running kNN")
        resultsFileName = self.resultsDir + "ResultsKnn.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                trainX = trainX.toScipyCsr()
                testX = testX.toScipyCsr()

                learner = KNNRecommender(self.algoArgs.kns[0])
                learner.numProcesses = self.algoArgs.processes

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runCLiMF:
        #!!!! no model selection
        logging.debug("Running CLiMF")
        resultsFileName = self.resultsDir + "ResultsCLiMF.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                modelSelectX = scipy.sparse.csr_matrix(modelSelectX.toScipyCsr(), dtype=numpy.float64)

                trainX = scipy.sparse.csr_matrix(trainX.toScipyCsr(), dtype=numpy.float64)
                testX = testX.toScipyCsr()

                learner = CLiMF(self.algoArgs.k, self.algoArgs.lmbdaCLiMF, self.algoArgs.gammaCLiMF)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasCLiMF
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasCLiMF
                learner.max_iters = self.algoArgs.maxIterCLiMF
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize
                learner.verbose = self.algoArgs.verbose

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    meanObjs, stdObjs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanObjs, stdObjs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, colProbs=colProbs)
    datas = []

    for (trainX, testX) in trainTestXs:
        testOmegaList = SparseUtils.getOmegaList(testX)
        #testX = trainX+testX
        datas.append((trainX, testX, testOmegaList))

    testAucs = numpy.zeros((len(self.ks), len(self.lmbdas), len(self.gammas), len(trainTestXs)))

    logging.debug("Performing model selection")
    paramList = []

    for i, k in enumerate(self.ks):
        U, V = self.initUV(X, k)

        for lmbda in self.lmbdas:
            for gamma in self.gammas:
                for (trainX, testX, testOmegaList) in datas:
                    learner = self.copy()
                    learner.k = k
                    learner.U = U.copy()
                    learner.V = V.copy()
                    learner.lmbda = lmbda
                    learner.gamma = gamma

                    paramList.append((scipy.sparse.csr_matrix(trainX, dtype=numpy.float64), scipy.sparse.csr_matrix(testX, dtype=numpy.float64), learner))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
        resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
    else:
        resultsIterator = itertools.imap(computeTestF1, paramList)

    for i_k in range(len(self.ks)):
        for i_lmbda in range(len(self.lmbdas)):
            for i_gamma in range(len(self.gammas)):
                for i_cv in range(len(trainTestXs)):
                    testAucs[i_k, i_lmbda, i_gamma, i_cv] = resultsIterator.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanTestMetrics = numpy.mean(testAucs, 3)
    stdTestMetrics = numpy.std(testAucs, 3)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    i_k, i_lmbda, i_gamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
    self.k = self.ks[i_k]
    self.lmbda = self.lmbdas[i_lmbda]
    self.gamma = self.gammas[i_gamma]

    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda) + " gamma=" + str(self.gamma))

    return meanTestMetrics, stdTestMetrics
#guard reconstructed from context: fall back to the synthetic dataset when no CLI argument is given
if len(sys.argv) > 1:
    dataset = sys.argv[1]
else:
    dataset = "synthetic"

saveResults = True
prefix = "LossROC"
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz"
X = DatasetUtils.getDataset(dataset, nnz=20000)

m, n = X.shape
u = 0.1
w = 1-u

testSize = 5
folds = 5
trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)

numRecordAucSamples = 200

k2 = 8
u2 = 0.5
w2 = 1-u2
eps = 10**-4
lmbda = 0.0
maxLocalAuc = MaxLocalAUC(k2, w2, eps=eps, lmbdaU=lmbda, lmbdaV=lmbda, stochastic=True)
maxLocalAuc.alpha = 0.05
maxLocalAuc.alphas = 2.0**-numpy.arange(0, 5, 1)
maxLocalAuc.folds = 1
maxLocalAuc.initialAlg = "rand"
maxLocalAuc.itemExpP = 0.0
maxLocalAuc.itemExpQ = 0.0
def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
    """
    Pick a value of rho based on a single matrix X. We do cross validation within,
    and return the best value of rho (according to the mean squared error). The
    rhos must be in decreasing order and we use warm restarts. In this case we
    remove a few non zeros from each row to form the test set.
    """
    if (numpy.flipud(numpy.sort(rhos)) != rhos).all():
        raise ValueError("rhos must be in descending order")

    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
    metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

    if self.metric == "mse":
        metricFunction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFunction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    paramList = []

    for i, (trainX, testX) in enumerate(trainTestXs):
        Util.printIteration(i, 1, len(cvInds), "Fold: ")

        for m, k in enumerate(ks):
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

    if self.numProcesses != 1:
        pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
        resultsIter = pool.imap(metricFunction, paramList)
    else:
        resultsIter = itertools.imap(metricFunction, paramList)

    for i, (trainX, testX) in enumerate(trainTestXs):
        for m, k in enumerate(ks):
            metrics[:, m, i] = resultsIter.next()

    if self.numProcesses != 1:
        pool.terminate()

    meanMetrics = metrics.mean(2)
    stdMetrics = metrics.std(2)

    logging.debug("ks=" + str(ks))
    logging.debug("rhos=" + str(rhos))
    logging.debug(meanMetrics)

    #Set the parameters
    if self.metric == "mse":
        self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[0]])
        self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[1]])
    elif self.metric == "f1" or self.metric == "mrr":
        self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[0]])
        self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[1]])

    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
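# Hedged usage sketch for modelSelect2: the rhos grid must be passed in descending order
# because the solver warm-starts each rho from the previous one, and cvInds is built with
# Sampling.randCrossValidation as in runExperiment above. The learner/X names and the grid
# values below are placeholders, not taken from the source.
rhos = numpy.flipud(numpy.sort(numpy.array([0.05, 0.1, 0.2])))   # descending: 0.2, 0.1, 0.05
ks = numpy.array([8, 16])
cvInds = Sampling.randCrossValidation(learner.folds, X.nnz)
meanMetrics, stdMetrics = learner.modelSelect2(X, rhos, ks, cvInds)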