def testCentreRows(self):
    shape = (50, 10)
    r = 5
    k = 100

    X, U, s, V = SparseUtils.generateSparseLowRank(shape, r, k, verbose=True)

    rowInds, colInds = X.nonzero()
    for i in range(rowInds.shape[0]):
        self.assertEquals(X[rowInds[i], colInds[i]], numpy.array(X[X.nonzero()]).ravel()[i])

    mu2 = numpy.array(X.sum(1)).ravel()
    numNnz = numpy.zeros(X.shape[0])

    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if X[i, j] != 0:
                numNnz[i] += 1

    mu2 /= numNnz
    mu2[numNnz == 0] = 0

    X, mu = SparseUtils.centerRows(X)
    nptst.assert_array_almost_equal(numpy.array(X.mean(1)).ravel(), numpy.zeros(X.shape[0]))
    nptst.assert_array_almost_equal(mu, mu2)
def testSvdSoft(self):
    A = scipy.sparse.rand(10, 10, 0.2)
    A = A.tocsc()

    lmbda = 0.2
    U, s, V = SparseUtils.svdSoft(A, lmbda)
    ATilde = U.dot(numpy.diag(s)).dot(V.T)

    #Now compute the same matrix using numpy
    A = A.todense()

    U2, s2, V2 = numpy.linalg.svd(A)
    inds = numpy.flipud(numpy.argsort(s2))
    inds = inds[s2[inds] > lmbda]
    U2, s2, V2 = Util.indSvd(U2, s2, V2, inds)

    s2 = s2 - lmbda
    s2 = numpy.clip(s2, 0, numpy.max(s2))

    ATilde2 = U2.dot(numpy.diag(s2)).dot(V2.T)

    nptst.assert_array_almost_equal(s, s2)
    nptst.assert_array_almost_equal(ATilde, ATilde2)

    #Now run svdSoft with a numpy array
    U3, s3, V3 = SparseUtils.svdSoft(A, lmbda)
    ATilde3 = U3.dot(numpy.diag(s3)).dot(V3.T)

    nptst.assert_array_almost_equal(s, s3)
    nptst.assert_array_almost_equal(ATilde3, ATilde2)
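# Illustrative sketch, not part of the library: the reference computation the test
# above performs by hand. softThresholdedSvd is a hypothetical helper computing a
# soft-thresholded SVD of a dense array A, keeping only singular values that stay
# positive after being shrunk by lmbda.
import numpy

def softThresholdedSvd(A, lmbda):
    U, s, VT = numpy.linalg.svd(A, full_matrices=False)
    s = numpy.maximum(s - lmbda, 0)
    inds = s > 0
    #Return V with singular vectors as columns so A is approximated by U.dot(diag(s)).dot(V.T)
    return U[:, inds], s[inds], VT[inds, :].T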
def testMatrixApprox(self):
    tol = 10**-6

    A = numpy.random.rand(10, 10)
    A = A.dot(A.T)

    n = 5
    inds = numpy.sort(numpy.random.permutation(A.shape[0])[0:n])
    AHat = Nystrom.matrixApprox(A, inds)

    n = 10
    AHat2 = Nystrom.matrixApprox(A, n)

    self.assertTrue(numpy.linalg.norm(A - AHat2) < numpy.linalg.norm(A - AHat))
    self.assertTrue(numpy.linalg.norm(A - AHat2) < tol)

    #Test on a sparse matrix
    As = scipy.sparse.csr_matrix(A)
    n = 5
    inds = numpy.sort(numpy.random.permutation(A.shape[0])[0:n])
    AHat = Nystrom.matrixApprox(As, inds)

    n = 10
    AHat2 = Nystrom.matrixApprox(As, n)

    self.assertTrue(SparseUtils.norm(As - AHat2) < SparseUtils.norm(As - AHat))
    self.assertTrue(SparseUtils.norm(As - AHat2) < tol)

    #Compare dense and sparse solutions
    for n in range(1, 9):
        inds = numpy.sort(numpy.random.permutation(A.shape[0])[0:n])
        AHats = Nystrom.matrixApprox(As, inds)
        AHat = Nystrom.matrixApprox(A, inds)

        self.assertTrue(numpy.linalg.norm(AHat - numpy.array(AHats.todense())) < tol)
def testSampleUsers(self):
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1-u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    k = 50
    X2, userInds = Sampling.sampleUsers(X, k)

    nptst.assert_array_equal(X.toarray(), X2.toarray())

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = numpy.random.randint(10, 100)
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

        X2, userInds = Sampling.sampleUsers(X, k)

        self.assertEquals(X2.shape[0], min(k, m))
        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m)).all()))
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
def testLocalAucApprox(self):
    m = 100
    n = 200
    k = 2
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

    w = 1.0
    localAuc = MCEvaluator.localAUC(X, U, V, w)

    samples = numpy.arange(150, 200, 10)

    for i, sampleSize in enumerate(samples):
        numAucSamples = sampleSize
        localAuc2 = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, numAucSamples)
        self.assertAlmostEqual(localAuc2, localAuc, 1)

    #Try smaller w
    w = 0.5
    localAuc = MCEvaluator.localAUC(X, U, V, w)

    samples = numpy.arange(50, 200, 10)

    for i, sampleSize in enumerate(samples):
        numAucSamples = sampleSize
        localAuc2 = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, numAucSamples)
        self.assertAlmostEqual(localAuc2, localAuc, 1)
def testSplitNnz(self):
    numRuns = 100
    import sppy

    for i in range(numRuns):
        m = numpy.random.randint(5, 50)
        n = numpy.random.randint(5, 50)
        X = scipy.sparse.rand(m, n, 0.5)
        X = X.tocsc()

        split = numpy.random.rand()
        X1, X2 = SparseUtils.splitNnz(X, split)

        nptst.assert_array_almost_equal((X1+X2).todense(), X.todense())

    for i in range(numRuns):
        m = numpy.random.randint(5, 50)
        n = numpy.random.randint(5, 50)
        X = scipy.sparse.rand(m, n, 0.5)
        X = X.tocsc()
        X = sppy.csarray(X)

        split = numpy.random.rand()
        X1, X2 = SparseUtils.splitNnz(X, split)

        nptst.assert_array_almost_equal((X1+X2).toarray(), X.toarray())
def testSparseMatrix(self):
    m = 10
    n = 15
    A = numpy.random.rand(m, n)

    rowInds, colInds = A.nonzero()
    vals = A[rowInds, colInds]

    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "scipy", storagetype="col")
    self.assertTrue(X.dtype == A.dtype)
    self.assertTrue(X.shape == A.shape)
    self.assertTrue(type(X) == scipy.sparse.csc_matrix)
    nptst.assert_array_equal(X.toarray(), A)

    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "scipy", storagetype="row")
    self.assertTrue(X.dtype == A.dtype)
    self.assertTrue(X.shape == A.shape)
    self.assertTrue(type(X) == scipy.sparse.csr_matrix)
    nptst.assert_array_equal(X.toarray(), A)

    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "csarray", storagetype="col")
    self.assertTrue(X.dtype == A.dtype)
    self.assertTrue(X.shape == A.shape)
    self.assertTrue(type(X) == sppy.csarray)
    self.assertTrue(X.storagetype == "col")
    nptst.assert_array_equal(X.toarray(), A)

    X = SparseUtils.sparseMatrix(vals, rowInds, colInds, A.shape, "csarray", storagetype="row")
    self.assertTrue(X.dtype == A.dtype)
    self.assertTrue(X.shape == A.shape)
    self.assertTrue(type(X) == sppy.csarray)
    self.assertTrue(X.storagetype == "row")
    nptst.assert_array_equal(X.toarray(), A)
def testGetOmegaListPtr(self):
    import sppy
    m = 10
    n = 5
    X = scipy.sparse.rand(m, n, 0.1)
    X = X.tocsr()

    indPtr, colInds = SparseUtils.getOmegaListPtr(X)

    for i in range(m):
        omegai = colInds[indPtr[i]:indPtr[i+1]]
        nptst.assert_array_almost_equal(omegai, X.toarray()[i, :].nonzero()[0])

    Xsppy = sppy.csarray(X)
    indPtr, colInds = SparseUtils.getOmegaListPtr(Xsppy)

    for i in range(m):
        omegai = colInds[indPtr[i]:indPtr[i+1]]
        nptst.assert_array_almost_equal(omegai, X.toarray()[i, :].nonzero()[0])

    #Test a zero array (scipy doesn't work in this case)
    X = sppy.csarray((m, n))
    indPtr, colInds = SparseUtils.getOmegaListPtr(X)

    for i in range(m):
        omegai = colInds[indPtr[i]:indPtr[i+1]]
def testCentreCols(self):
    shape = (50, 10)
    r = 5
    k = 100

    X, U, s, V = SparseUtils.generateSparseLowRank(shape, r, k, verbose=True)

    rowInds, colInds = X.nonzero()

    mu2 = numpy.array(X.sum(0)).ravel()
    numNnz = numpy.zeros(X.shape[1])

    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if X[i, j] != 0:
                numNnz[j] += 1

    mu2 /= numNnz
    mu2[numNnz == 0] = 0

    X, mu = SparseUtils.centerCols(X)
    nptst.assert_array_almost_equal(numpy.array(X.mean(0)).ravel(), numpy.zeros(X.shape[1]))
    nptst.assert_array_almost_equal(mu, mu2)
def testScale(self):
    """
    Look at the scales of the unnormalised gradients.
    """
    m = 100
    n = 400
    k = 3
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    w = 0.1
    eps = 0.001
    learner = MaxAUCTanh(k, w)
    learner.normalise = False
    learner.lmbdaU = 1.0
    learner.lmbdaV = 1.0
    learner.rho = 1.0
    learner.numAucSamples = 100

    indPtr, colInds = SparseUtils.getOmegaListPtr(X)
    r = numpy.random.rand(m)

    U = numpy.random.rand(X.shape[0], k)
    V = numpy.random.rand(X.shape[1], k)

    gi = numpy.random.rand(m)
    gi /= gi.sum()
    gp = numpy.random.rand(n)
    gp /= gp.sum()
    gq = numpy.random.rand(n)
    gq /= gq.sum()

    permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

    maxLocalAuc = MaxLocalAUC(k, w)
    normGp, normGq = maxLocalAuc.computeNormGpq(indPtr, colInds, gp, gq, m)

    normDui = 0
    for i in range(m):
        du = learner.derivativeUi(indPtr, colInds, U, V, r, gi, gp, gq, i)
        normDui += numpy.linalg.norm(du)

    normDui /= float(m)
    print(normDui)

    normDvi = 0
    for i in range(n):
        dv = learner.derivativeVi(indPtr, colInds, U, V, r, gi, gp, gq, i)
        normDvi += numpy.linalg.norm(dv)

    normDvi /= float(n)
    print(normDvi)
def testReconstructLowRank(self):
    shape = (5000, 1000)
    r = 5
    U, s, V = SparseUtils.generateLowRank(shape, r)

    inds = numpy.array([0])
    X = SparseUtils.reconstructLowRank(U, s, V, inds)

    self.assertAlmostEquals(X[0, 0], (U[0, :]*s).dot(V[0, :]))
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
    #import itertools
    #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempAucs = resultsIterator.next()
            localAucs[i, :, icv] = tempAucs

    pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]]
    lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def learnModel2(self, X):
    """
    Learn the matrix completion using a sparse matrix X. This is the simple
    version of the soft impute algorithm in which we store the entire
    matrices, newZ and oldZ.
    """
    #if not scipy.sparse.isspmatrix_lil(X):
    #    raise ValueError("Input matrix must be lil_matrix")

    oldZ = scipy.sparse.lil_matrix(X.shape)
    omega = X.nonzero()
    tol = 10**-6

    ZList = []

    for rho in self.rhos:
        gamma = self.eps + 1
        i = 0

        while gamma > self.eps:
            Y = oldZ.copy()
            Y[omega] = 0
            Y = X + Y
            Y = Y.tocsc()

            U, s, V = ExpSU.SparseUtils.svdSoft(Y, rho)
            #Get an "invalid value encountered in sqrt" warning sometimes
            newZ = scipy.sparse.lil_matrix((U*s).dot(V.T))

            oldZ = oldZ.tocsr()
            normOldZ = SparseUtils.norm(oldZ)**2
            normNewZmOldZ = SparseUtils.norm(newZ - oldZ)**2

            #We can get newZ == oldZ in which case we break
            if normNewZmOldZ < tol:
                gamma = 0
            elif abs(normOldZ) < tol:
                gamma = self.eps + 1
            else:
                gamma = normNewZmOldZ/normOldZ

            oldZ = newZ.copy()

            logging.debug("Iteration " + str(i) + " gamma=" + str(gamma))
            i += 1

        logging.debug("Number of iterations for lambda=" + str(rho) + ": " + str(i))
        ZList.append(newZ)

    if self.rhos.shape[0] != 1:
        return ZList
    else:
        return ZList[0]
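# Illustrative sketch, not part of the class above: one soft impute update written
# with dense numpy arrays so the step is explicit. The observed entries of X are
# kept, the unobserved ones are filled in from the previous estimate Zold, and the
# result is passed through a soft-thresholded SVD with threshold rho. The helper
# name softImputeStep and the dense boolean mask are hypothetical; the method above
# works on sparse matrices and uses ExpSU.SparseUtils.svdSoft for the thresholding.
import numpy

def softImputeStep(X, mask, Zold, rho):
    #Observed entries come from X, unobserved ones from the previous estimate
    Y = numpy.where(mask, X, Zold)
    #Soft-thresholded SVD: shrink singular values by rho and drop those at zero
    U, s, VT = numpy.linalg.svd(Y, full_matrices=False)
    s = numpy.maximum(s - rho, 0)
    return (U*s).dot(VT)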
def syntheticDataset1(m=500, n=200, k=8, u=0.1, sd=0, noise=5):
    """
    Create a simple synthetic dataset
    """
    w = 1-u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, sd=sd, csarray=True, verbose=True, indsPerRow=200)
    X = X + sppy.rand((m, n), noise/float(n), storagetype="row")
    X[X.nonzero()] = 1
    X.prune()
    X = SparseUtils.pruneMatrixRows(X, minNnzRows=10)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
    U = U*s

    return X, U, V
def learnPredictRanking(args):
    """
    A function to train on a training set and test on a test set, for a number
    of values of rho.
    """
    learner, trainX, testX, rhos = args
    logging.debug("k=" + str(learner.getK()))
    logging.debug(learner)

    testInds = testX.nonzero()

    trainXIter = []
    testIndList = []

    for rho in rhos:
        trainXIter.append(trainX)
        testIndList.append(testInds)

    trainXIter = iter(trainXIter)

    ZIter = learner.learnModel(trainXIter, iter(rhos))

    metrics = numpy.zeros(rhos.shape[0])

    for j, Z in enumerate(ZIter):
        U, s, V = Z
        U = U*s

        U = numpy.ascontiguousarray(U)
        V = numpy.ascontiguousarray(V)
        testOrderedItems = MCEvaluatorCython.recommendAtk(U, V, learner.recommendSize, trainX)

        if learner.metric == "mrr":
            metrics[j] = MCEvaluator.mrrAtK(SparseUtils.getOmegaListPtr(testX), testOrderedItems, learner.recommendSize)
            logging.debug("MRR@" + str(learner.recommendSize) + ": " + str('%.4f' % metrics[j]) + " " + str(learner))
        elif learner.metric == "f1":
            metrics[j] = MCEvaluator.f1AtK(SparseUtils.getOmegaListPtr(testX), testOrderedItems, learner.recommendSize)
            logging.debug("F1@" + str(learner.recommendSize) + ": " + str('%.4f' % metrics[j]) + " " + str(learner))
        else:
            raise ValueError("Unknown metric " + learner.metric)

        gc.collect()

    return metrics
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            learner = self.copy()
            learner.k = k
            paramList.append((trainX, testX, testOmegaList, learner))

    #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
    import itertools
    resultsIterator = itertools.imap(computePrecision, paramList)

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempPrecision = resultsIterator.next()
            precisions[i, icv] = tempPrecision

    #pool.terminate()

    meanPrecisions = numpy.mean(precisions, 1)
    stdPrecisions = numpy.std(precisions, 1)

    logging.debug(meanPrecisions)

    k = self.ks[numpy.argmax(meanPrecisions)]

    logging.debug("Model parameters: k=" + str(k))

    self.k = k

    return meanPrecisions, stdPrecisions
def _addSparseRSVD(U, s, V, X, k=10, kX=None, kRand=None, q=None):
    """
    Perform a randomised SVD of the matrix X + U diag(s) V.T using a randomised
    range finder with q power iterations.
    """
    if kX == None:
        kX = k
    if kRand == None:
        kRand = k
    if q == None:
        q = 1

    m, n = X.shape
    Us = U*s

    kX = numpy.min([m, n, kX])
    UX, sX, VX = SparseUtils.svdPropack(X, kX)

    omega = numpy.c_[V, VX, numpy.random.randn(n, kRand)]

    def rMultA(x):
        return Us.dot(V.T.dot(x)) + X.dot(x)

    def rMultAT(x):
        return V.dot(Us.T.dot(x)) + X.T.dot(x)

    Y = rMultA(omega)
    for i in range(q):
        Y = rMultAT(Y)
        Y = rMultA(Y)

    Q, R = numpy.linalg.qr(Y)

    B = rMultAT(Q).T
    U, s, VT = numpy.linalg.svd(B, full_matrices=False)
    U, s, V = Util.indSvd(U, s, VT, numpy.flipud(numpy.argsort(s))[:k])
    U = Q.dot(U)

    return U, s, V
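# Usage sketch with made-up sizes: approximate the SVD of X + U diag(s) V.T without
# forming the dense sum, then compare against a direct dense computation on a small
# problem. Assumes _addSparseRSVD is callable as defined above and that
# SparseUtils.svdPropack is available in the environment.
import numpy
import scipy.sparse

m, n, r, k = 50, 40, 3, 10
X = scipy.sparse.rand(m, n, 0.2, format="csc")
U, s, V = SparseUtils.generateLowRank((m, n), r)

U2, s2, V2 = _addSparseRSVD(U, s, V, X, k=k, q=2)

#Relative error against the dense sum; should be small when k covers the spectrum
B = numpy.array(X.todense()) + (U*s).dot(V.T)
print(numpy.linalg.norm(B - (U2*s2).dot(V2.T))/numpy.linalg.norm(B))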
def testOverfit(self):
    """
    See if we can get a zero objective on the hinge loss
    """
    m = 10
    n = 20
    k = 5
    u = 0.5
    w = 1-u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True)

    eps = 0.001
    k = 10
    maxLocalAuc = MaxLocalAUC(k, u, eps=eps, stochastic=True)
    maxLocalAuc.rate = "constant"
    maxLocalAuc.maxIterations = 500
    maxLocalAuc.numProcesses = 1
    maxLocalAuc.loss = "hinge"
    maxLocalAuc.validationUsers = 0
    maxLocalAuc.lmbda = 0

    print("Overfit example")
    U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(X, verbose=True)

    self.assertAlmostEquals(trainMeasures[-1, 0], 0, 3)
def profileDerivativeUiApprox(self):
    k = 10
    U = numpy.random.rand(self.m, k)
    V = numpy.random.rand(self.n, k)

    indPtr, colInds = SparseUtils.getOmegaListPtr(self.X)

    gp = numpy.random.rand(self.n)
    gp /= gp.sum()
    gq = numpy.random.rand(self.n)
    gq /= gq.sum()

    j = 3
    numRowSamples = 100
    numAucSamples = 10

    permutedRowInds = numpy.array(numpy.random.permutation(self.m), numpy.uint32)
    permutedColInds = numpy.array(numpy.random.permutation(self.n), numpy.uint32)

    maxLocalAuc = MaxLocalAUC(k, w=0.9)
    normGp, normGq = maxLocalAuc.computeNormGpq(indPtr, colInds, gp, gq, self.m)

    lmbda = 0.001
    normalise = True

    learner = MaxLocalAUCCython()

    def run():
        numRuns = 10
        for j in range(numRuns):
            for i in range(self.m):
                learner.derivativeUiApprox(indPtr, colInds, U, V, gp, gq, permutedColInds, i)

    ProfileUtils.profile("run()", globals(), locals())
def profileObjective(self):
    k = 10
    U = numpy.random.rand(self.m, k)
    V = numpy.random.rand(self.n, k)

    indPtr, colInds = SparseUtils.getOmegaListPtr(self.X)
    colIndsProbabilities = numpy.ones(colInds.shape[0])

    for i in range(self.m):
        colIndsProbabilities[indPtr[i]:indPtr[i+1]] /= colIndsProbabilities[indPtr[i]:indPtr[i+1]].sum()
        colIndsProbabilities[indPtr[i]:indPtr[i+1]] = numpy.cumsum(colIndsProbabilities[indPtr[i]:indPtr[i+1]])

    r = numpy.zeros(self.m)
    lmbda = 0.001
    rho = 1.0
    numAucSamples = 100

    def run():
        numRuns = 10
        for i in range(numRuns):
            objectiveApprox(indPtr, colInds, indPtr, colInds, U, V, r, numAucSamples, lmbda, rho, False)

    ProfileUtils.profile("run()", globals(), locals())
def testParallelLearnModel(self):
    numpy.random.seed(21)
    m = 500
    n = 200
    k = 5
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True)

    from wallhack.rankingexp.DatasetUtils import DatasetUtils
    X, U, V = DatasetUtils.syntheticDataset1()

    u = 0.1
    w = 1-u
    eps = 0.05
    maxLocalAuc = MaxLocalAUC(k, w, alpha=1.0, eps=eps, stochastic=True)
    maxLocalAuc.maxIterations = 3
    maxLocalAuc.recordStep = 1
    maxLocalAuc.rate = "optimal"
    maxLocalAuc.t0 = 2.0
    maxLocalAuc.validationUsers = 0.0
    maxLocalAuc.numProcesses = 4

    os.system('taskset -p 0xffffffff %d' % os.getpid())

    print(X.nnz/maxLocalAuc.numAucSamples)
    U, V = maxLocalAuc.parallelLearnModel(X)
def testLocalAUC(self):
    m = 10
    n = 20
    k = 2
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, 0.5, verbose=True, csarray=True)

    Z = U.dot(V.T)

    localAuc = numpy.zeros(m)
    for i in range(m):
        localAuc[i] = sklearn.metrics.roc_auc_score(numpy.ravel(X[i, :].toarray()), Z[i, :])

    localAuc = localAuc.mean()

    u = 0.0
    localAuc2 = MCEvaluator.localAUC(X, U, V, u)

    self.assertEquals(localAuc, localAuc2)

    #Now try a large r
    w = 1.0
    localAuc2 = MCEvaluator.localAUC(X, U, V, w)
    self.assertEquals(localAuc2, 0)
def f1AtK(positiveArray, orderedItems, k, verbose=False):
    """
    Return the F1@k measure for each row of the predicted matrix UV.T using real
    values in positiveArray. positiveArray is a tuple (indPtr, colInds)

    :param orderedItems: The ordered items for each user (users are rows, items are cols)

    :param verbose: If true return the F1 and first k recommendations for each row, otherwise just the mean F1
    """
    if type(positiveArray) != tuple:
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    orderedItems = orderedItems[:, 0:k]
    indPtr, colInds = positiveArray

    precisions = MCEvaluatorCython.precisionAtk(indPtr, colInds, orderedItems)
    recalls = MCEvaluatorCython.recallAtk(indPtr, colInds, orderedItems)

    denominator = precisions + recalls
    denominator += denominator == 0
    f1s = 2*precisions*recalls/denominator

    if verbose:
        return f1s, orderedItems
    else:
        return f1s.mean()
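# Small worked check of the formula in f1AtK above: the harmonic mean of precision
# and recall, with the denominator bumped to 1 where both are zero so those rows
# give an F1 of 0 instead of a division error. The values are made up.
import numpy

precisions = numpy.array([0.5, 0.0, 1.0])
recalls = numpy.array([0.25, 0.0, 0.5])
denominator = precisions + recalls
denominator += denominator == 0
print(2*precisions*recalls/denominator)   #[0.33333333 0.         0.66666667]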
def testAverageRocCurve(self):
    m = 50
    n = 20
    k = 8
    u = 20.0/m
    w = 1-u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, csarray=True, verbose=True, indsPerRow=200)

    fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

    import matplotlib
    matplotlib.use("GTK3Agg")
    import matplotlib.pyplot as plt
    #plt.plot(fpr, tpr)
    #plt.show()

    #Now try case where we have a training set
    folds = 1
    testSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
    trainX, testX = trainTestXs[0]

    fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
def localAUCApprox(positiveArray, U, V, w, numAucSamples=50, r=None, allArray=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to X
    with quantile w. The AUC is computed using positiveArray which is a tuple
    (indPtr, colInds) assuming allArray is None. If allArray is not None then
    positive items are chosen from positiveArray and negative ones are chosen
    to complement allArray.
    """
    if type(positiveArray) != tuple:
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    indPtr, colInds = positiveArray

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    if r is None:
        r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    if allArray is None:
        return MCEvaluatorCython.localAUCApprox(indPtr, colInds, indPtr, colInds, U, V, numAucSamples, r)
    else:
        allIndPtr, allColInd = allArray
        return MCEvaluatorCython.localAUCApprox(indPtr, colInds, allIndPtr, allColInd, U, V, numAucSamples, r)
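# Usage sketch with arbitrary sizes, mirroring testLocalAucApprox: compare the exact
# local AUC with its sampled approximation. Assumes the module-level imports used
# throughout (numpy, SparseUtils, MCEvaluator).
m, n, k = 100, 200, 2
X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

w = 0.5
exactAuc = MCEvaluator.localAUC(X, U, V, w)
approxAuc = MCEvaluator.localAUCApprox(SparseUtils.getOmegaListPtr(X), U, V, w, numAucSamples=100)
print(exactAuc, approxAuc)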
def testParallelSparseLowRankOp(self):
    numRuns = 10

    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        density = numpy.random.rand()
        A = scipy.sparse.rand(m, n, density)
        A = A.tocsc()

        r = numpy.random.randint(10, 100)
        U, s, V = SparseUtils.generateLowRank((m, n), r)

        L = LinOperatorUtils.parallelSparseLowRankOp(A, U, s, V)

        u = numpy.random.rand(m)
        v = numpy.random.rand(n)

        r = 10
        W = numpy.random.rand(m, r)
        X = numpy.random.rand(n, r)

        B = numpy.array(A + (U*s).dot(V.T))

        nptst.assert_array_almost_equal(L.matvec(v), B.dot(v))
        nptst.assert_array_almost_equal(L.rmatvec(u), B.T.dot(u))
        nptst.assert_array_almost_equal(L.matmat(X), B.dot(X))
        nptst.assert_array_almost_equal(L.rmatmat(W), B.T.dot(W))
def stratifiedRecallAtK(positiveArray, orderedItems, k, itemCounts, beta=0.5, verbose=False):
    """
    Compute the stratified recall@k score for each row of the predicted matrix UV.T
    using real values in positiveArray. positiveArray is a tuple (indPtr, colInds)

    :param orderedItems: The ordered items for each user (users are rows, items are cols)

    :param verbose: If true return the recalls and first k recommendations for each row, otherwise the weighted average recall
    """
    if type(positiveArray) != tuple:
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    orderedItems = orderedItems[:, 0:k]
    indPtr, colInds = positiveArray

    recalls, denominators = MCEvaluatorCython.stratifiedRecallAtk(indPtr, colInds, orderedItems, itemCounts, beta)

    if verbose:
        return recalls, orderedItems
    else:
        return numpy.average(recalls, weights=denominators)
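# Small worked check of the aggregation in stratifiedRecallAtK above: per-user
# recalls are combined with numpy.average, weighted by the per-user denominators
# returned by the Cython routine. The values are made up.
import numpy

recalls = numpy.array([1.0, 0.5, 0.0])
denominators = numpy.array([2.0, 4.0, 2.0])
print(numpy.average(recalls, weights=denominators))   #(1*2 + 0.5*4 + 0*2)/8 = 0.5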
def testModelSelect(self):
    m = 50
    n = 50
    k = 5
    u = 0.5
    w = 1-u
    X = SparseUtils.generateSparseBinaryMatrix((m, n), k, w)

    os.system('taskset -p 0xffffffff %d' % os.getpid())

    u = 0.2
    lmbda = 0.1
    gamma = 0.01
    learner = BprRecommender(k, lmbda, gamma)
    learner.maxIterations = 2
    learner.ks = 2**numpy.arange(3, 5)
    learner.lmbdaUsers = 2.0**-numpy.arange(1, 3)
    learner.lmbdaPoses = 2.0**-numpy.arange(1, 3)
    learner.lmbdaNegs = 2.0**-numpy.arange(1, 3)
    learner.gammas = 2.0**-numpy.arange(1, 3)
    learner.folds = 2
    learner.numProcesses = 1

    colProbs = numpy.array(X.sum(1)).ravel()
    colProbs /= colProbs.sum()
    print(colProbs, colProbs.shape)

    learner.modelSelect(X, colProbs=colProbs)
def uncenter(self, X):
    """
    Uncenter a training or test matrix.
    """
    #logging.debug("Uncentering matrix of size: " + str(X.shape))
    return SparseUtils.uncenterRows(X, self.muRows)
def testSvdArpack(self):
    shape = (500, 100)
    r = 5
    k = 1000

    X, U, s, V = SparseUtils.generateSparseLowRank(shape, r, k, verbose=True)

    k2 = 10
    U, s, V = SparseUtils.svdArpack(X, k2)

    U2, s2, V2 = numpy.linalg.svd(X.todense())
    V2 = V2.T

    nptst.assert_array_almost_equal(s, s2[0:k2])
    nptst.assert_array_almost_equal(numpy.abs(U), numpy.abs(U2[:, 0:k2]), 3)
    nptst.assert_array_almost_equal(numpy.abs(V), numpy.abs(V2[:, 0:k2]), 3)
def flixster(minNnzRows=10, minNnzCols=2, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "flixster/Ratings.timed.txt"
    matrixFile = open(matrixFileName)
    matrixFile.readline()

    userIndexer = IdIndexer("i")
    movieIndexer = IdIndexer("i")
    ratings = array.array("f")
    logging.debug("Loading ratings from " + matrixFileName)

    for i, line in enumerate(matrixFile):
        if i % 1000000 == 0:
            logging.debug("Iteration: " + str(i))

        vals = line.split()

        userIndexer.append(vals[0])
        movieIndexer.append(vals[1])
        ratings.append(float(vals[2]))

    rowInds = userIndexer.getArray()
    colInds = movieIndexer.getArray()
    ratings = numpy.array(ratings)

    X = sppy.csarray((len(userIndexer.getIdDict()), len(movieIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(ratings > 3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    #X = Sampling.sampleUsers(X, 1000)

    return X
def epinions(minNnzRows=10, minNnzCols=3, quantile=90):
    matrixFileName = PathDefaults.getDataDir() + "epinions/rating.mat"
    A = scipy.io.loadmat(matrixFileName)["rating"]

    userIndexer = IdIndexer("i")
    itemIndexer = IdIndexer("i")

    for i in range(A.shape[0]):
        userIndexer.append(A[i, 0])
        itemIndexer.append(A[i, 1])

    rowInds = userIndexer.getArray()
    colInds = itemIndexer.getArray()
    ratings = A[:, 3]

    X = sppy.csarray((len(userIndexer.getIdDict()), len(itemIndexer.getIdDict())), storagetype="row", dtype=numpy.int)
    X.put(numpy.array(ratings > 3, numpy.int), numpy.array(rowInds, numpy.int32), numpy.array(colInds, numpy.int32), init=True)
    X.prune()

    X = SparseUtils.pruneMatrixRowAndCols(X, minNnzRows, minNnzCols)

    logging.debug("Read file: " + matrixFileName)
    logging.debug("Non zero elements: " + str(X.nnz) + " shape: " + str(X.shape))

    return X
def testF1Atk(self):
    m = 10
    n = 5
    r = 3

    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    import sppy
    X = sppy.csarray(X)

    orderedItems = MCEvaluator.recommendAtk(U*s, V, n)
    self.assertAlmostEquals(MCEvaluator.f1AtK(X, orderedItems, n, verbose=False), 2*r/float(n)/(1 + r/float(n)))

    m = 20
    n = 50
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, 0.5, verbose=True)

    k = 5
    orderedItems = MCEvaluator.recommendAtk(U*s, V, k)
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    f1s = numpy.zeros(m)
    for i in range(m):
        f1s[i] = 2*precision[i]*recall[i]/(precision[i] + recall[i])

    orderedItems = MCEvaluator.recommendAtk(U*s, V, n)
    f1s2, scoreInds = MCEvaluator.f1AtK(X, orderedItems, k, verbose=True)

    nptst.assert_array_equal(f1s, f1s2)

    #Test case where we get a zero precision or recall
    orderedItems[5, :] = -1
    precision, scoreInds = MCEvaluator.precisionAtK(X, orderedItems, k, verbose=True)
    recall, scoreInds = MCEvaluator.recallAtK(X, orderedItems, k, verbose=True)

    f1s = numpy.zeros(m)
    for i in range(m):
        if precision[i] + recall[i] != 0:
            f1s[i] = 2*precision[i]*recall[i]/(precision[i] + recall[i])

    f1s2, scoreInds = MCEvaluator.f1AtK(X, orderedItems, k, verbose=True)

    nptst.assert_array_equal(f1s, f1s2)
def testGetOmegaList(self):
    import sppy
    m = 10
    n = 5
    X = scipy.sparse.rand(m, n, 0.1)
    X = X.tocsr()

    omegaList = SparseUtils.getOmegaList(X)

    for i in range(m):
        nptst.assert_array_almost_equal(omegaList[i], X.toarray()[i, :].nonzero()[0])

    Xsppy = sppy.csarray(X)
    omegaList = SparseUtils.getOmegaList(Xsppy)

    for i in range(m):
        nptst.assert_array_almost_equal(omegaList[i], X.toarray()[i, :].nonzero()[0])
def testDiag(self):
    numRows = 10
    numCols = 10
    A = scipy.sparse.rand(numRows, numCols, 0.5, "csr")

    d = SparseUtils.diag(A)

    for i in range(numRows):
        self.assertEquals(d[i], A[i, i])
def testPruneMatrixCols(self):
    m = 30
    n = 20
    density = 0.5
    X = sppy.rand((m, n), density)
    X[X.nonzero()] = 1

    newX, rowInds = SparseUtils.pruneMatrixCols(X, maxNnz=10, verbose=True)

    nnzCols = numpy.zeros(n)
    for i in range(n):
        nnzCols[i] = X.toarray()[:, i].nonzero()[0].shape[0]

        if nnzCols[i] <= 10:
            self.assertTrue(i in rowInds)

    self.assertTrue((newX.sum(0) <= 10).all())

    newX, rowInds = SparseUtils.pruneMatrixCols(X, minNnz=10, verbose=True)

    nnzCols = numpy.zeros(n)
    for i in range(n):
        nnzCols[i] = X.toarray()[:, i].nonzero()[0].shape[0]

        if nnzCols[i] >= 10:
            self.assertTrue(i in rowInds)

    self.assertTrue((newX.sum(0) >= 10).all())

    newX, rowInds = SparseUtils.pruneMatrixCols(X, minNnz=10, maxNnz=15, verbose=True)

    nnzCols = numpy.zeros(n)
    for i in range(n):
        nnzCols[i] = X.toarray()[:, i].nonzero()[0].shape[0]

        if nnzCols[i] >= 10 and nnzCols[i] <= 15:
            self.assertTrue(i in rowInds)

    self.assertTrue(numpy.logical_and(newX.sum(0) >= 10, newX.sum(0) <= 15).all())
def testSampleUsers2(self):
    m = 10
    n = 15
    r = 5
    u = 0.3
    w = 1-u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    k = X.nnz+100
    X2, userInds = Sampling.sampleUsers2(X, k)

    nptst.assert_array_equal(X.toarray(), X2.toarray())

    #Test pruning of cols
    k = 500
    m = 100
    n = 500
    u = 0.1
    w = 1 - u
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=True)
    nnz1 = X2.nnz
    self.assertTrue((X2.sum(0) != 0).all())

    numpy.random.seed(21)
    X2, userInds = Sampling.sampleUsers2(X, k, prune=False)
    nnz2 = X2.nnz

    self.assertEquals(nnz1, nnz2)

    numRuns = 50
    for i in range(numRuns):
        m = numpy.random.randint(10, 100)
        n = numpy.random.randint(10, 100)
        k = 500
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), r, w, csarray=True, verbose=True, indsPerRow=200)

        X2, userInds = Sampling.sampleUsers2(X, k)

        self.assertTrue((X.dot(X.T) != numpy.zeros((m, m)).all()))
        self.assertTrue((X2.toarray() == X.toarray()[userInds, :]).all())
        self.assertEquals(X.toarray()[userInds, :].nonzero()[0].shape[0], X2.nnz)
def __init__(self):
    numpy.random.seed(21)

    #Create a low rank matrix
    self.m = 1000
    self.n = 5000
    self.k = 10
    self.X = SparseUtils.generateSparseBinaryMatrix((self.m, self.n), self.k, csarray=True)
def testResize(self):
    numRows = 10
    numCols = 10

    A = scipy.sparse.rand(numRows, numCols, 0.1, "csr")

    B = SparseUtils.resize(A, (5, 5))

    self.assertEquals(B.shape, (5, 5))

    for i in range(5):
        for j in range(5):
            self.assertEquals(B[i, j], A[i, j])

    B = SparseUtils.resize(A, (15, 15))

    self.assertEquals(B.shape, (15, 15))
    self.assertEquals(B.nnz, A.nnz)

    for i in range(10):
        for j in range(10):
            self.assertEquals(B[i, j], A[i, j])
def profileGetOmegaList(self):
    shape = (20000, 15000)
    r = 50
    k = 1000000
    X = SparseUtils.generateSparseLowRank(shape, r, k)

    import sppy
    X = sppy.csarray(X)

    ProfileUtils.profile('SparseUtils.getOmegaList(X)', globals(), locals())