def testPredict(self):
    """Check predict() recovers observed entries: exactly when lmbda=0, and
    consistently with the learned low-rank factors for moderate lmbda."""
    # Create a set of indices
    lmbda = 0.0
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=10)
    matrixIterator = iter(self.matrixList)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)
    XhatList = iterativeSoftImpute.predict(ZList, self.indsList)

    # Check we get the exact matrices returned
    for i, Xhat in enumerate(XhatList):
        nptst.assert_array_almost_equal(numpy.array(Xhat.todense()), self.matrixList[i].todense())
        # assertEquals/assertAlmostEquals are deprecated aliases (removed in
        # Python 3.12) — use the canonical names.
        self.assertEqual(Xhat.nnz, self.indsList[i].shape[0])
        self.assertAlmostEqual(MCEvaluator.meanSqError(Xhat, self.matrixList[i]), 0)
        self.assertAlmostEqual(MCEvaluator.rootMeanSqError(Xhat, self.matrixList[i]), 0)

    # Try moderate lambda
    lmbda = 0.1
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=10)
    matrixIterator = iter(self.matrixList)
    ZList = list(iterativeSoftImpute.learnModel(matrixIterator))
    XhatList = iterativeSoftImpute.predict(iter(ZList), self.indsList)

    for i, Xhat in enumerate(XhatList):
        # Hoisted out of the inner loop: the factors and the dense
        # reconstruction do not depend on ind.
        U, s, V = ZList[i]
        Z = (U*s).dot(V.T)

        for ind in self.indsList[i]:
            # Each predicted entry must match the reconstruction from the factors.
            self.assertEqual(Xhat[numpy.unravel_index(ind, Xhat.shape)], Z[numpy.unravel_index(ind, Xhat.shape)])

        self.assertEqual(Xhat.nnz, self.indsList[i].shape[0])
def testPostProcess(self):
    """Check that post-processing the SVD still reconstructs the training
    matrices, including when only a subsample of entries is used."""
    lmbda = 0.0
    eps = 0.1
    k = 20

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="rsvd", postProcess=True)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, numpy.array(self.matrixList[i].todense()))

    # Try case with iterativeSoftImpute.postProcessSamples < X.nnz
    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute.postProcessSamples = int(self.matrixList[0].nnz/2)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, self.matrixList[i].todense(), 2)

    # Try for larger lambda
    # BUG FIX: the previous matrixIterator is already exhausted, so without
    # re-creating it learnModel saw an empty sequence and the loop below
    # never executed — this section silently tested nothing.
    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute.setRho(0.2)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        # NOTE(review): no assertion here in the original — this only checks
        # that learning and reconstruction run without error for rho=0.2.
        Xhat = (U*s).dot(V.T)
def initUV(self, X):
    """Compute initial factor matrices U (m x k) and V (n x k) for X using
    the strategy named by self.initialAlg.

    :param X: the (sparse) data matrix; toScipyCsc/toScipyCsr are used for
        the softimpute/wrmf initialisations.
    :return: contiguous arrays U, V.
    :raise ValueError: if self.initialAlg is not a known strategy.
    """
    m = X.shape[0]
    n = X.shape[1]

    if self.initialAlg == "rand":
        # Small random entries so the initial reconstruction is near zero.
        U = numpy.random.randn(m, self.k) * 0.1
        V = numpy.random.randn(n, self.k) * 0.1
    elif self.initialAlg == "svd":
        logging.debug("Initialising with Randomised SVD")
        U, s, V = RandomisedSVD.svd(X, self.k, self.p, self.q)
        # Fold the singular values into U so U.dot(V.T) approximates X.
        U = U * s
    elif self.initialAlg == "softimpute":
        logging.debug("Initialising with softimpute")
        trainIterator = iter([X.toScipyCsc()])
        rho = 0.01
        learner = IterativeSoftImpute(rho, k=self.k, svdAlg="propack", postProcess=True)
        ZList = learner.learnModel(trainIterator)
        # Py3 fix: iterator.next() was removed — use the builtin next().
        U, s, V = next(ZList)
        U = U * s
    elif self.initialAlg == "wrmf":
        logging.debug("Initialising with wrmf")
        learner = WeightedMf(self.k, w=self.w)
        U, V = learner.learnModel(X.toScipyCsr())
    else:
        raise ValueError("Unknown initialisation: " + str(self.initialAlg))

    # Contiguous layout is required by downstream (likely Cython/BLAS) code.
    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    return U, V
def initUV(self, X):
    """Compute initial factor matrices U (m x k) and V (n x k) for X using
    the strategy named by self.initialAlg.

    :param X: the (sparse) data matrix; toScipyCsc/toScipyCsr are used for
        the softimpute/wrmf initialisations.
    :return: contiguous arrays U, V.
    :raise ValueError: if self.initialAlg is not a known strategy.
    """
    m = X.shape[0]
    n = X.shape[1]

    if self.initialAlg == "rand":
        # Small random entries so the initial reconstruction is near zero.
        U = numpy.random.randn(m, self.k) * 0.1
        V = numpy.random.randn(n, self.k) * 0.1
    elif self.initialAlg == "svd":
        logging.debug("Initialising with Randomised SVD")
        U, s, V = RandomisedSVD.svd(X, self.k, self.p, self.q)
        # Fold the singular values into U so U.dot(V.T) approximates X.
        U = U * s
    elif self.initialAlg == "softimpute":
        logging.debug("Initialising with softimpute")
        trainIterator = iter([X.toScipyCsc()])
        rho = 0.01
        learner = IterativeSoftImpute(rho, k=self.k, svdAlg="propack", postProcess=True)
        ZList = learner.learnModel(trainIterator)
        # Py3 fix: iterator.next() was removed — use the builtin next().
        U, s, V = next(ZList)
        U = U * s
    elif self.initialAlg == "wrmf":
        logging.debug("Initialising with wrmf")
        learner = WeightedMf(self.k, w=self.w)
        U, V = learner.learnModel(X.toScipyCsr())
    else:
        raise ValueError("Unknown initialisation: " + str(self.initialAlg))

    # Contiguous layout is required by downstream (likely Cython/BLAS) code.
    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    return U, V
def testWeightedLearning(self):
    """See if the weighted learning has any effect: for rho=0 weighted and
    unweighted solutions must agree; then compare errors on a non-uniformly
    sampled matrix."""
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)

    rho = 0.0
    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=True)
    iterX = iter([X])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    # Py3 fix: iterator.next() was removed — use the builtin next().
    Z = next(resultIter)

    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=False)
    iterX = iter([X])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z2 = next(resultIter)

    # Check results when rho=0
    nptst.assert_array_almost_equal((Z[0]*Z[1]).dot(Z[2].T), (Z2[0]*Z2[1]).dot(Z2[2].T))
    nptst.assert_array_almost_equal(Z[1], Z2[1])

    # Then check non-uniform matrix - entries clustered around middle indices
    shape = (20, 15)
    numInds = 200
    maxInd = (shape[0]*shape[1]-1)
    # numpy.int was removed in NumPy 1.24; the builtin int gives the same dtype.
    nzInds = numpy.array(numpy.random.randn(numInds)*maxInd/4 + maxInd/2, int)
    trainInds = nzInds[0:int(nzInds.shape[0]/2)]
    testInds = nzInds[int(nzInds.shape[0]/2):]

    trainInds = numpy.unique(numpy.clip(trainInds, 0, maxInd))
    testInds = numpy.unique(numpy.clip(testInds, 0, maxInd))

    trainX = ExpSU.SparseUtils.generateSparseLowRank(shape, r, trainInds, noise)
    testX = ExpSU.SparseUtils.generateSparseLowRank(shape, r, testInds, noise)

    # Error using weighted soft impute (unused iterTestX locals removed)
    rho = 0.5
    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=True)
    iterX = iter([trainX])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z = next(resultIter)
    predX = iterativeSoftImpute.predictOne(Z, testX.nonzero())
    error = MCEvaluator.rootMeanSqError(testX, predX)

    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=False)
    iterX = iter([trainX])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z = next(resultIter)
    predX = iterativeSoftImpute.predictOne(Z, testX.nonzero())
    # NOTE(review): the original computes the errors but asserts nothing about
    # them — this test only exercises the weighted/unweighted code paths.
    error = MCEvaluator.rootMeanSqError(testX, predX)
def testLearnModel2(self):
    """Test the SVD updating solution (rsvd) in the case where we get an
    exact solution, then compare against the batch SoftImpute class and the
    fixed-point equation Z = S_lambda(X + Z^\\bot_\\omega)."""
    lmbda = 0.0
    eps = 0.1
    k = 20

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="rsvd")
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    # Check that ZList is the same as XList
    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, self.matrixList[i].todense())

    # Compare solution with that of SoftImpute class
    rhoList = [0.1, 0.2, 0.5, 1.0]

    for rho in rhoList:
        iterativeSoftImpute = IterativeSoftImpute(rho, k=k, eps=eps, svdAlg="rsvd", updateAlg="zero")
        matrixIterator = iter(self.matrixList)
        ZList = iterativeSoftImpute.learnModel(matrixIterator)

        rhos = numpy.array([rho])
        softImpute = SoftImpute(rhos, k=k, eps=eps)
        Z1 = softImpute.learnModel(self.matrixList[0])
        Z2 = softImpute.learnModel(self.matrixList[1])
        Z3 = softImpute.learnModel(self.matrixList[2])
        ZList2 = [Z1, Z2, Z3]

        for j, Zhat in enumerate(ZList):
            U, s, V = Zhat
            Z = (U*s).dot(V.T)
            nptst.assert_array_almost_equal(Z, ZList2[j].todense())

            # Also test with true solution Z = S_lambda(X + Z^\bot_\omega)
            Zomega = numpy.zeros(self.matrixList[j].shape)
            rowInds, colInds = self.matrixList[j].nonzero()
            # Vectorised copy of the observed entries; the original looped in
            # Python and re-called nonzero() on every iteration.
            Zomega[rowInds, colInds] = Z[rowInds, colInds]

            U, s, V = ExpSU.SparseUtils.svdArpack(self.matrixList[j], 1, kmax=20)
            lmbda = rho*numpy.max(s)

            U, s, V = ExpSU.SparseUtils.svdSoft(numpy.array(self.matrixList[j]-Zomega+Z), lmbda)
            tol = 0.1
            self.assertTrue(numpy.linalg.norm(Z -(U*s).dot(V.T))**2 < tol)
def testLearnModel(self):
    """Check the propack solver recovers the training matrices exactly when
    lmbda=0, agrees with the batch SoftImpute class for several lmbdas, and
    satisfies the fixed-point equation Z = S_lambda(X + Z^\\bot_\\omega)."""
    lmbda = 0.0
    eps = 0.1
    k = 10

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="propack")
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    # Check that ZList is the same as XList
    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, numpy.array(self.matrixList[i].todense()))

    # Compare solution with that of SoftImpute class
    lmbdaList = [0.1, 0.2, 0.5, 1.0]

    for lmbda in lmbdaList:
        iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="propack", updateAlg="zero")
        matrixIterator = iter(self.matrixList)
        ZList = iterativeSoftImpute.learnModel(matrixIterator)

        lmbdas = numpy.array([lmbda])
        softImpute = SoftImpute(lmbdas, k=k, eps=eps)
        Z1 = softImpute.learnModel(self.matrixList[0])
        Z2 = softImpute.learnModel(self.matrixList[1])
        Z3 = softImpute.learnModel(self.matrixList[2])
        ZList2 = [Z1, Z2, Z3]

        for j, Zhat in enumerate(ZList):
            U, s, V = Zhat
            Z = (U*s).dot(V.T)
            nptst.assert_array_almost_equal(Z, ZList2[j].todense())

            # Also test with true solution Z = S_lambda(X + Z^\bot_\omega)
            Zomega = numpy.zeros(self.matrixList[j].shape)
            rowInds, colInds = self.matrixList[j].nonzero()
            # Vectorised copy of the observed entries; the original looped in
            # Python and re-called nonzero() on every iteration.
            Zomega[rowInds, colInds] = Z[rowInds, colInds]

            U, s, V = ExpSU.SparseUtils.svdSoft(numpy.array(self.matrixList[j]-Zomega+Z), lmbda)
            tol = 0.1
            self.assertTrue(numpy.linalg.norm(Z -(U*s).dot(V.T))**2 < tol)
def testModelSelect2(self):
    """Check modelSelect() over a (rhos, ks) grid matches errors computed
    manually by refitting at each grid point, with train == test folds so
    the fold standard deviation is zero."""
    rho = 0.1
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)
    X = X.tocsc()
    # Removed: an unused (and expensive) dense SVD of X and a dead `k = 15`.

    iterativeSoftImpute = IterativeSoftImpute(rho, k=None, svdAlg="propack", updateAlg="initial")
    rhos = numpy.linspace(0.5, 0.001, 5)
    # numpy.int was removed in NumPy 1.24; the builtin int gives the same dtype.
    ks = numpy.array([5, 10, 15], int)
    folds = 3
    cvInds = []
    for i in range(folds):
        # Identical train/test indices in every fold => zero std deviation.
        cvInds.append((numpy.arange(X.nnz), numpy.arange(X.nnz)))

    meanTestErrors, stdTestErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)
    # Deprecated assertAlmostEquals alias replaced with assertAlmostEqual.
    self.assertAlmostEqual(numpy.linalg.norm(stdTestErrors), 0, 3)

    meanTestErrors2 = numpy.zeros((rhos.shape[0], ks.shape[0]))

    # Now compute errors manually
    for j, k in enumerate(ks):
        iterativeSoftImpute.setK(k)
        for i, rho in enumerate(rhos):
            iterativeSoftImpute.setRho(rho)
            ZIter = iterativeSoftImpute.learnModel(iter([X]))
            indList = [X.nonzero()]
            outIterator = iterativeSoftImpute.predict(ZIter, indList)
            # Py3 fix: iterator.next() was removed — use the builtin next().
            Xhat = next(outIterator)
            meanTestErrors2[i, j] = MCEvaluator.rootMeanSqError(X, Xhat)

    nptst.assert_array_almost_equal(meanTestErrors, meanTestErrors2, 2)
def runExperiment(self):
    """
    Run the selected clustering experiments and save results
    """
    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")

        for svdAlg in self.algoArgs.svdAlgs:
            # The randomised algorithms embed their p/q parameters in the
            # results file name; the others do not take them.
            if svdAlg == "rsvd" or svdAlg == "rsvdUpdate" or svdAlg == "rsvdUpdate2":
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_p=" + str(self.algoArgs.p)+ "_q=" + str(self.algoArgs.q) + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"

            fileLock = FileLock(resultsFileName)

            # Skip runs already computed or in progress on another process.
            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                try:
                    learner = IterativeSoftImpute(svdAlg=svdAlg, logStep=self.logStep, kmax=self.algoArgs.kmax, postProcess=self.algoArgs.postProcess, weighted=self.algoArgs.weighted, p=self.algoArgs.p, q=self.algoArgs.q, verbose=self.algoArgs.verbose, updateAlg=self.algoArgs.updateAlg)

                    if self.algoArgs.modelSelect:
                        trainIterator = self.getTrainIterator()
                        # Let's find the optimal lambda using the first matrix
                        # Py3 fix: iterator.next() was removed — use next().
                        X = next(trainIterator)

                        logging.debug("Performing model selection, taking subsample of entries of size " + str(self.sampleSize))
                        X = SparseUtils.submatrix(X, self.sampleSize)

                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, X.nnz)
                        meanErrors, stdErrors = learner.modelSelect(X, self.algoArgs.rhos, self.algoArgs.ks, cvInds)

                        logging.debug("Mean errors = " + str(meanErrors))
                        logging.debug("Std errors = " + str(stdErrors))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                        # Pick the (rho, k) pair with the minimum mean error.
                        rho = self.algoArgs.rhos[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[0]]
                        k = self.algoArgs.ks[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[1]]
                    else:
                        rho = self.algoArgs.rhos[0]
                        k = self.algoArgs.ks[0]

                    learner.setK(k)
                    learner.setRho(rho)
                    logging.debug(learner)

                    trainIterator = self.getTrainIterator()
                    ZIter = learner.learnModel(trainIterator)

                    self.recordResults(ZIter, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runSgdMf:
        logging.debug("Running SGD MF")

        resultsFileName = self.resultsDir + "ResultsSgdMf.npz"
        fileLock = FileLock(resultsFileName)

        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            try:
                learner = IterativeSGDNorm2Reg(k=self.algoArgs.ks[0], lmbda=self.algoArgs.lmbdas[0], gamma=self.algoArgs.gammas[0], eps=self.algoArgs.eps)

                if self.algoArgs.modelSelect:
                    # Let's find optimal parameters using the first matrix
                    # Py3 fix: .next() -> next().
                    learner.modelSelect(next(self.getTrainIterator()), self.algoArgs.ks, self.algoArgs.lmbdas, self.algoArgs.gammas, self.algoArgs.folds)

                # Removed a duplicated getTrainIterator() assignment that
                # created and discarded an extra iterator.
                trainIterator = self.getTrainIterator()
                ZIter = learner.learnModel(trainIterator)

                self.recordResults(ZIter, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")