def testPostProcess(self):
    """
    Exercise IterativeSoftImpute with postProcess=True on self.matrixList.

    Three passes are made over the matrices:
      1. lmbda = 0 with default post-processing: the reconstruction
         (U*s).dot(V.T) must match the dense input.
      2. postProcessSamples set to half the non-zeros of the first matrix:
         the reconstruction must match the input to 2 decimal places.
      3. rho = 0.2: only checks that learning and reconstruction run.
    """
    lmbda = 0.0
    eps = 0.1
    k = 20

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="rsvd", postProcess=True)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, numpy.array(self.matrixList[i].todense()))

    #Try case with iterativeSoftImpute.postProcessSamples < X.nnz
    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute.postProcessSamples = int(self.matrixList[0].nnz/2)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, numpy.array(self.matrixList[i].todense()), 2)

    #Try for larger lambda
    iterativeSoftImpute.setRho(0.2)
    # The previous pass consumed matrixIterator; without a fresh iterator the
    # learner receives no matrices and the loop below never executes.
    matrixIterator = iter(self.matrixList)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        # Smoke test only: no expected value is asserted for rho = 0.2, we
        # just make sure the factors can be multiplied back together.
        Xhat = (U*s).dot(V.T)
def testModelSelect2(self):
    """
    Compare the error grid from modelSelect against errors computed by hand.

    Every fold is given the same pair of index arrays (all non-zero entries
    for both elements of the pair), so the test asserts that the standard
    deviation across folds is zero to 3 decimal places. The mean errors must
    then agree, to 2 decimal places, with the RMSE obtained by training on X
    and predicting the non-zero entries of X for each (rho, k) combination.
    """
    rho = 0.1
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)
    X = X.tocsc()

    iterativeSoftImpute = IterativeSoftImpute(rho, k=None, svdAlg="propack", updateAlg="initial")

    rhos = numpy.linspace(0.5, 0.001, 5)
    # The builtin int replaces the numpy.int alias, which newer NumPy
    # releases no longer provide.
    ks = numpy.array([5, 10, 15], int)
    folds = 3
    cvInds = [(numpy.arange(X.nnz), numpy.arange(X.nnz)) for _ in range(folds)]

    meanTestErrors, stdTestErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)

    self.assertAlmostEqual(numpy.linalg.norm(stdTestErrors), 0, 3)

    meanTestErrors2 = numpy.zeros((rhos.shape[0], ks.shape[0]))

    #Now compute errors manually
    for j, kVal in enumerate(ks):
        iterativeSoftImpute.setK(kVal)

        for i, rhoVal in enumerate(rhos):
            iterativeSoftImpute.setRho(rhoVal)

            ZIter = iterativeSoftImpute.learnModel(iter([X]))
            indList = [X.nonzero()]
            outIterator = iterativeSoftImpute.predict(ZIter, indList)
            # next() works on both Python 2 and Python 3 iterators.
            Xhat = next(outIterator)

            meanTestErrors2[i, j] = MCEvaluator.rootMeanSqError(X, Xhat)

    nptst.assert_array_almost_equal(meanTestErrors, meanTestErrors2, 2)
def runExperiment(self):
    """
    Run the learners enabled in self.algoArgs and record their results.

    Soft impute is run once per entry of self.algoArgs.svdAlgs when
    self.algoArgs.runSoftImpute is set, and SGD MF is run once when
    self.algoArgs.runSgdMf is set. Each run has its own results file name
    and is skipped when FileLock reports that file as locked or existing.
    The lock is always released, even if the run raises.
    """
    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")

        for svdAlg in self.algoArgs.svdAlgs:
            # The randomised SVD variants also encode p and q in the file name.
            if svdAlg in ("rsvd", "rsvdUpdate", "rsvdUpdate2"):
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_p=" + str(self.algoArgs.p)+ "_q=" + str(self.algoArgs.q) + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"

            fileLock = FileLock(resultsFileName)

            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                try:
                    learner = IterativeSoftImpute(svdAlg=svdAlg, logStep=self.logStep, kmax=self.algoArgs.kmax, postProcess=self.algoArgs.postProcess, weighted=self.algoArgs.weighted, p=self.algoArgs.p, q=self.algoArgs.q, verbose=self.algoArgs.verbose, updateAlg=self.algoArgs.updateAlg)

                    if self.algoArgs.modelSelect:
                        trainIterator = self.getTrainIterator()
                        #Let's find the optimal lambda using the first matrix
                        X = next(trainIterator)

                        logging.debug("Performing model selection, taking subsample of entries of size " + str(self.sampleSize))
                        X = SparseUtils.submatrix(X, self.sampleSize)

                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, X.nnz)
                        meanErrors, stdErrors = learner.modelSelect(X, self.algoArgs.rhos, self.algoArgs.ks, cvInds)

                        logging.debug("Mean errors = " + str(meanErrors))
                        logging.debug("Std errors = " + str(stdErrors))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                        # Locate the smallest mean error once; axis 0 of the
                        # grid indexes rhos and axis 1 indexes ks.
                        bestInds = numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)
                        rho = self.algoArgs.rhos[bestInds[0]]
                        k = self.algoArgs.ks[bestInds[1]]
                    else:
                        rho = self.algoArgs.rhos[0]
                        k = self.algoArgs.ks[0]

                    learner.setK(k)
                    learner.setRho(rho)
                    logging.debug(learner)

                    # Start from a fresh iterator so training sees every matrix,
                    # including the one consumed by model selection.
                    trainIterator = self.getTrainIterator()
                    ZIter = learner.learnModel(trainIterator)

                    self.recordResults(ZIter, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runSgdMf:
        logging.debug("Running SGD MF")

        resultsFileName = self.resultsDir + "ResultsSgdMf.npz"
        fileLock = FileLock(resultsFileName)

        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            try:
                learner = IterativeSGDNorm2Reg(k=self.algoArgs.ks[0], lmbda=self.algoArgs.lmbdas[0], gamma=self.algoArgs.gammas[0], eps=self.algoArgs.eps)

                if self.algoArgs.modelSelect:
                    # Let's find optimal parameters using the first matrix
                    # NOTE(review): the return value is discarded; presumably
                    # modelSelect updates the learner in place - confirm.
                    learner.modelSelect(next(self.getTrainIterator()), self.algoArgs.ks, self.algoArgs.lmbdas, self.algoArgs.gammas, self.algoArgs.folds)

                trainIterator = self.getTrainIterator()
                ZIter = learner.learnModel(trainIterator)

                self.recordResults(ZIter, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")