def modelSelect(self, X): """ Perform model selection on X and return the best parameters. """ m, n = X.shape cvInds = Sampling.randCrossValidation(self.folds, X.nnz) localAucs = numpy.zeros( (self.ks.shape[0], self.lmbdas.shape[0], len(cvInds))) logging.debug("Performing model selection") paramList = [] for icv, (trainInds, testInds) in enumerate(cvInds): Util.printIteration(icv, 1, self.folds, "Fold: ") trainX = SparseUtils.submatrix(X, trainInds) testX = SparseUtils.submatrix(X, testInds) testOmegaList = SparseUtils.getOmegaList(testX) for i, k in enumerate(self.ks): maxLocalAuc = self.copy() maxLocalAuc.k = k paramList.append((trainX, testX, testOmegaList, maxLocalAuc)) pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100) resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize) #import itertools #resultsIterator = itertools.imap(localAucsLmbdas, paramList) for icv, (trainInds, testInds) in enumerate(cvInds): for i, k in enumerate(self.ks): tempAucs = resultsIterator.next() localAucs[i, :, icv] = tempAucs pool.terminate() meanLocalAucs = numpy.mean(localAucs, 2) stdLocalAucs = numpy.std(localAucs, 2) logging.debug(meanLocalAucs) k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]] lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]] logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda)) self.k = k self.lmbda = lmbda return meanLocalAucs, stdLocalAucs
def modelSelect(self, X): """ Perform model selection on X and return the best parameters. """ m, n = X.shape cvInds = Sampling.randCrossValidation(self.folds, X.nnz) localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds))) logging.debug("Performing model selection") paramList = [] for icv, (trainInds, testInds) in enumerate(cvInds): Util.printIteration(icv, 1, self.folds, "Fold: ") trainX = SparseUtils.submatrix(X, trainInds) testX = SparseUtils.submatrix(X, testInds) testOmegaList = SparseUtils.getOmegaList(testX) for i, k in enumerate(self.ks): maxLocalAuc = self.copy() maxLocalAuc.k = k paramList.append((trainX, testX, testOmegaList, maxLocalAuc)) pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100) resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize) #import itertools #resultsIterator = itertools.imap(localAucsLmbdas, paramList) for icv, (trainInds, testInds) in enumerate(cvInds): for i, k in enumerate(self.ks): tempAucs = resultsIterator.next() localAucs[i, :, icv] = tempAucs pool.terminate() meanLocalAucs = numpy.mean(localAucs, 2) stdLocalAucs = numpy.std(localAucs, 2) logging.debug(meanLocalAucs) k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]] lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]] logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda)) self.k = k self.lmbda = lmbda return meanLocalAucs, stdLocalAucs
def modelSelect(self, X): """ Perform model selection on X and return the best parameters. """ m, n = X.shape cvInds = Sampling.randCrossValidation(self.folds, X.nnz) precisions = numpy.zeros((self.ks.shape[0], len(cvInds))) logging.debug("Performing model selection") paramList = [] for icv, (trainInds, testInds) in enumerate(cvInds): Util.printIteration(icv, 1, self.folds, "Fold: ") trainX = SparseUtils.submatrix(X, trainInds) testX = SparseUtils.submatrix(X, testInds) testOmegaList = SparseUtils.getOmegaList(testX) for i, k in enumerate(self.ks): learner = self.copy() learner.k = k paramList.append((trainX, testX, testOmegaList, learner)) #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100) #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize) import itertools resultsIterator = itertools.imap(computePrecision, paramList) for icv, (trainInds, testInds) in enumerate(cvInds): for i, k in enumerate(self.ks): tempPrecision = resultsIterator.next() precisions[i, icv] = tempPrecision #pool.terminate() meanPrecisions = numpy.mean(precisions, 1) stdPrecisions = numpy.std(precisions, 1) logging.debug(meanPrecisions) k = self.ks[numpy.argmax(meanPrecisions)] logging.debug("Model parameters: k=" + str(k)) self.k = k return meanPrecisions, stdPrecisions
def testSubmatrix(self):
    import sppy
    numRuns = 100

    for i in range(numRuns):
        m = numpy.random.randint(5, 50)
        n = numpy.random.randint(5, 50)
        X = scipy.sparse.rand(m, n, 0.5)
        X = X.tocsc()

        inds1 = numpy.arange(0, X.nnz/2)
        inds2 = numpy.arange(X.nnz/2, X.nnz)

        X1 = SparseUtils.submatrix(X, inds1)
        X2 = SparseUtils.submatrix(X, inds2)

        nptst.assert_array_almost_equal((X1 + X2).todense(), X.todense())

        inds = X.nnz
        X1 = SparseUtils.submatrix(X, inds)
        nptst.assert_array_almost_equal(X1.todense(), X.todense())

        inds = 2
        X1 = SparseUtils.submatrix(X, inds)
        # assertTrue(X1.nnz, 2) silently passed for any nonzero count; the
        # intent is an equality check on the number of sampled entries
        self.assertEqual(X1.nnz, 2)

    #Test with sppy
    for i in range(numRuns):
        m = numpy.random.randint(5, 50)
        n = numpy.random.randint(5, 50)
        X = scipy.sparse.rand(m, n, 0.5)
        X = X.tocsc()
        X = sppy.csarray(X)

        inds1 = numpy.arange(0, X.nnz/2)
        inds2 = numpy.arange(X.nnz/2, X.nnz)

        X1 = SparseUtils.submatrix(X, inds1)
        X2 = SparseUtils.submatrix(X, inds2)

        nptst.assert_array_almost_equal((X1 + X2).toarray(), X.toarray())
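# SparseUtils.submatrix itself is not shown in this file. The test above
# implies its contract: given an index array it keeps the nonzero entries of
# X at those positions (shape unchanged), and given an integer it keeps that
# many randomly chosen nonzeros. A rough scipy-only sketch of that assumed
# behaviour, for illustration only, not the library's actual implementation:
def submatrixSketch(X, inds):
    import numpy
    import scipy.sparse
    X = X.tocoo()
    if numpy.isscalar(inds):
        # integer argument: sample that many nonzeros at random
        inds = numpy.random.permutation(X.nnz)[:inds]
    inds = numpy.asarray(inds, dtype=int)
    Y = scipy.sparse.coo_matrix((X.data[inds], (X.row[inds], X.col[inds])), shape=X.shape)
    return Y.tocsc()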
def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
    """
    Choose parameters based on a single matrix X. We do cross validation
    within, and set parameters according to the mean squared error.
    Return nothing.
    """
    logging.debug("Performing model selection")

    # convert to COO and free memory before the grid search
    X = X.tocoo()
    gc.collect()
    nK = len(ks)
    nLmbda = len(lmbdas)
    nGamma = len(gammas)
    nLG = nLmbda * nGamma
    errors = scipy.zeros((nK, nLmbda, nGamma, nFolds))

    # generate cross validation sets
    cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

    # compute error for each fold / setting
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, nFolds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        nptst.assert_array_almost_equal((testX + trainX).data, X.data)

        paramList = []
        for ik, k in enumerate(ks):
            for ilmbda, lmbda in enumerate(lmbdas):
                for igamma, gamma in enumerate(gammas):
                    paramList.append((trainX, testX, k, lmbda, gamma, maxNTry))

        # ! Remark !
        # the run over parameters can easily be parallelized; parallelizing
        # over cv-folds is not done as it is much more memory-consuming

        # parallel version (copied from IterativeSoftImpute, but not tested)
        #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
        #results = pool.imap(self.learnPredict, paramList)
        #pool.terminate()

        # non-parallel version
        results = scipy.array(list(itertools.starmap(self.learnPredict, paramList)))

        errors[:, :, :, icv] = scipy.array(results).reshape((nK, nLmbda, nGamma))

    # compute cross validation error for each setting
    errors[errors == float("inf")] = errors[errors != float("inf")].max()
    errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
    meanErrors = errors.mean(3)
    stdErrors = errors.std(3)

    logging.debug("Mean errors given (k, lambda, gamma):")
    logging.debug(meanErrors)
    logging.debug("... with standard deviation:")
    logging.debug(stdErrors)

    # keep the best
    iMin = meanErrors.argmin()
    kMin = ks[int(scipy.floor(iMin/nLG))]
    lmbdaMin = lmbdas[int(scipy.floor((iMin % nLG)/nGamma))]
    gammaMin = gammas[int(scipy.floor(iMin % nGamma))]
    logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", "
                  + str(lmbdaMin) + ", " + str(gammaMin) + ")")
    logging.debug("min = " + str(meanErrors[int(scipy.floor(iMin/nLG)),
                                            int(scipy.floor((iMin % nLG)/nGamma)),
                                            int(scipy.floor(iMin % nGamma))]))

    self.baseLearner.k = kMin
    self.baseLearner.lmbda = lmbdaMin
    self.baseLearner.gamma = gammaMin
    return
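# The floor/modulo arithmetic above decodes the flat argmin index into
# (ik, ilmbda, igamma) grid coordinates; it is equivalent to
# numpy.unravel_index on the (nK, nLmbda, nGamma) shape. A quick
# self-contained check over every flat index:
def demoFlatIndexDecode():
    import numpy
    nK, nLmbda, nGamma = 3, 4, 5
    nLG = nLmbda * nGamma
    for iMin in range(nK * nLG):
        ik = iMin // nLG
        ilmbda = (iMin % nLG) // nGamma
        igamma = iMin % nGamma
        assert (ik, ilmbda, igamma) == numpy.unravel_index(iMin, (nK, nLmbda, nGamma))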
def modelSelect(self, X, rhos, ks, cvInds):
    """
    Pick values of rho and k based on a single matrix X. We do cross
    validation within, and return the best parameters according to the
    chosen metric. The rhos must be in decreasing order and we use warm
    restarts.
    """
    # raise if rhos is not sorted in descending order (.any(), not .all():
    # .all() only fires when every element is out of place)
    if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
        raise ValueError("rhos must be in descending order")

    errors = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

    if self.metric == "mse":
        metricFunction = learnPredictMSE
    elif self.metric == "f1" or self.metric == "mrr":
        metricFunction = learnPredictRanking
    else:
        raise ValueError("Unknown metric: " + self.metric)

    for i, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(i, 1, len(cvInds), "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        #nptst.assert_array_almost_equal((testX+trainX).data, X.data)

        paramList = []

        for m, k in enumerate(ks):
            learner = self.copy()
            learner.updateAlg = "initial"
            learner.setK(k)
            paramList.append((learner, trainX, testX, rhos))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
            results = pool.imap(metricFunction, paramList)
        else:
            results = itertools.imap(metricFunction, paramList)

        for m, rhoErrors in enumerate(results):
            errors[:, m, i] = rhoErrors

        if self.numProcesses != 1:
            pool.terminate()

    meanMetrics = errors.mean(2)
    stdMetrics = errors.std(2)

    logging.debug(meanMetrics)

    #Set the parameters
    if self.metric == "mse":
        self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[0]])
        self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[1]])
    elif self.metric == "f1" or self.metric == "mrr":
        self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[0]])
        self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[1]])

    logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

    return meanMetrics, stdMetrics
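# Why the descending-order guard above uses .any() rather than .all(): with
# .all(), an ascending sequence like [1, 2, 3] slips through because its
# middle element coincides with the descending sort. A small check:
def demoDescendingCheck():
    import numpy
    rhos = numpy.array([1.0, 2.0, 3.0])  # ascending: should be rejected
    mismatch = numpy.flipud(numpy.sort(rhos)) != rhos  # [True, False, True]
    assert mismatch.any() and not mismatch.all()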
def runExperiment(self):
    """
    Run the selected clustering experiments and save results
    """
    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")

        for svdAlg in self.algoArgs.svdAlgs:
            if svdAlg == "rsvd" or svdAlg == "rsvdUpdate" or svdAlg == "rsvdUpdate2":
                resultsFileName = (self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg
                    + "_p=" + str(self.algoArgs.p) + "_q=" + str(self.algoArgs.q)
                    + "_updateAlg=" + self.algoArgs.updateAlg + ".npz")
            else:
                resultsFileName = (self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg
                    + "_updateAlg=" + self.algoArgs.updateAlg + ".npz")

            fileLock = FileLock(resultsFileName)

            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                try:
                    learner = IterativeSoftImpute(svdAlg=svdAlg, logStep=self.logStep,
                        kmax=self.algoArgs.kmax, postProcess=self.algoArgs.postProcess,
                        weighted=self.algoArgs.weighted, p=self.algoArgs.p,
                        q=self.algoArgs.q, verbose=self.algoArgs.verbose,
                        updateAlg=self.algoArgs.updateAlg)

                    if self.algoArgs.modelSelect:
                        trainIterator = self.getTrainIterator()
                        #Let's find the optimal lambda using the first matrix
                        X = trainIterator.next()

                        logging.debug("Performing model selection, taking subsample of entries of size " + str(self.sampleSize))
                        X = SparseUtils.submatrix(X, self.sampleSize)

                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, X.nnz)
                        meanErrors, stdErrors = learner.modelSelect(X, self.algoArgs.rhos, self.algoArgs.ks, cvInds)

                        logging.debug("Mean errors = " + str(meanErrors))
                        logging.debug("Std errors = " + str(stdErrors))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                        rho = self.algoArgs.rhos[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[0]]
                        k = self.algoArgs.ks[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[1]]
                    else:
                        rho = self.algoArgs.rhos[0]
                        k = self.algoArgs.ks[0]

                    learner.setK(k)
                    learner.setRho(rho)
                    logging.debug(learner)

                    trainIterator = self.getTrainIterator()
                    ZIter = learner.learnModel(trainIterator)

                    self.recordResults(ZIter, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runSgdMf:
        logging.debug("Running SGD MF")

        resultsFileName = self.resultsDir + "ResultsSgdMf.npz"
        fileLock = FileLock(resultsFileName)

        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            try:
                learner = IterativeSGDNorm2Reg(k=self.algoArgs.ks[0],
                    lmbda=self.algoArgs.lmbdas[0], gamma=self.algoArgs.gammas[0],
                    eps=self.algoArgs.eps)

                if self.algoArgs.modelSelect:
                    # Let's find optimal parameters using the first matrix
                    learner.modelSelect(self.getTrainIterator().next(), self.algoArgs.ks,
                        self.algoArgs.lmbdas, self.algoArgs.gammas, self.algoArgs.folds)

                trainIterator = self.getTrainIterator()
                ZIter = learner.learnModel(trainIterator)

                self.recordResults(ZIter, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")