def __init__(self):
    """Build a random sparse low-rank matrix for the benchmark.

    Sets ``self.r`` (target rank) and ``self.X`` (the generated sparse
    matrix with ``nKnown`` observed entries).
    """
    numpy.random.seed(21)
    # Create a low rank matrix.  Earlier experiment configurations, kept
    # for reference:
    #   n, m, self.r, nKnown = 100000, 100000, 50, 10**4
    # netflix-like:
    #   n, m, self.r, nKnown = 480000, 18000, 200, 10**8
    # close from netflix
    n, m, self.r, nKnown = 480000, 18000, 200, 10**6
    # focusing on scalar-product (active configuration — overrides the above)
    n, m, self.r, nKnown = 480000, 18000, 50, 10**5
    self.X = SparseUtils.generateSparseLowRank((n, m), self.r, nKnown)
    print(self.X.nnz)
def benchmark(self):
    """Time ARPACK-based SVD updates against the PROPACK low-rank SVD.

    Generates a list of random sparse matrices plus random low-rank
    factors, then runs both solvers over the whole list, recording the
    cumulative elapsed time after each matrix.
    """
    numMatrices = 20
    print("Generating matrices")
    matrixList = []
    for i in range(numMatrices):
        print("Iteration: " + str(i))
        numRows = numpy.random.randint(5000, 20000)
        numCols = numpy.random.randint(5000, 20000)
        density = numpy.random.rand() * 0.1
        X = scipy.sparse.rand(numRows, numCols, density)
        rank = numpy.random.randint(10, 50)
        U, s, V = SparseUtils.generateLowRank((numRows, numCols), rank)
        print(numRows, numCols, density, rank)
        matrixList.append((X, U, s, V))

    k = 10
    times = []

    print("Starting timings for ARPACK")
    start = time.time()
    for i, (X, U, s, V) in enumerate(matrixList):
        print("Iteration: " + str(i))
        SVDUpdate.addSparseArpack(U, s, V, X, k)
        # Cumulative elapsed time since the start of this solver's run.
        times.append(time.time() - start)

    # Compare versus PROPACK
    print("Starting timings for PROPACK")
    start = time.time()
    for i, (X, U, s, V) in enumerate(matrixList):
        print("Iteration: " + str(i))
        SparseUtils.svdSparseLowRank(X, U, s, V, k)
        times.append(time.time() - start)

    print(times)
def __init__(self):
    """Generate a 100000 x 100000 rank-200 sparse matrix with 10**6 entries."""
    numpy.random.seed(21)
    # Create a low rank matrix
    n, m = 100000, 100000
    self.r = 200
    k = 10**6
    self.X = SparseUtils.generateSparseLowRank((n, m), self.r, k)
def __init__(self):
    """Generate a large sparse low-rank matrix and convert it to a csarray.

    ``self.X`` ends up as a row-major csarray built from the scipy sparse
    matrix returned by SparseUtils.
    """
    numpy.random.seed(21)
    # Create a low rank matrix
    n, m = 500000, 500000
    self.r = 200
    k = 10**6
    print("Generating low rank")
    self.X = SparseUtils.generateSparseLowRank((n, m), self.r, k)
    print("Generating csarray")
    self.X = csarray.fromScipySparse(self.X, storageType="rowMajor")
    print("Done")
# Test if we can easily get the SVD of a set of matrices with low rank but under
# a fixed structure
import numpy
import scipy.sparse
from exp.util.SparseUtils import SparseUtils

numpy.set_printoptions(suppress=True, precision=3, linewidth=150)

shape = (15, 20)
r = 10
k = 50

# Sparse low-rank matrix plus its generating factors U, s, V.
X, U, s, V = SparseUtils.generateSparseLowRank(shape, r, k, verbose=True)
X = numpy.array(X.todense())

# Y is the 0/1 observation mask of X.
Y = numpy.zeros(X.shape)
Y[X.nonzero()] = 1
print(Y)

U2, s2, V2 = numpy.linalg.svd(Y)
print(s2)

# Rebuild X by masking each rank-one term of the factorisation:
# X2 = sum_i s_i * diag(U[:, i]) Y diag(V[:, i])
X2 = numpy.zeros(X.shape)
for i in range(r):
    X2 += s[i] * numpy.diag(U[:, i]).dot(Y).dot(numpy.diag(V[:, i]))
# NOTE(review): collapsed snippet opening with an unmatched triple quote — the
# code below appears to be a string-disabled (commented-out) experiment whose
# closing quote lies outside this view; kept byte-identical. Confirm the
# enclosing string boundaries before re-enabling or reformatting.
""" import sys import logging import scipy.sparse import numpy from sparsesvd import sparsesvd from exp.util.SparseUtils import SparseUtils numpy.random.seed(21) logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) numpy.set_printoptions(precision=3, suppress=True, linewidth=100) m = 10 n = 10 r = 1 U0, s0, V0 = SparseUtils.generateLowRank((m, n), r) numInds = 10 inds = numpy.unique(numpy.random.randint(0, m * n, numInds)) A = SparseUtils.reconstructLowRank(U0, s0, V0, inds) #print(A.todense()) t0 = s0 + numpy.random.rand(s0.shape[0]) * 0.1 B = SparseUtils.reconstructLowRank(U0, t0, V0, inds) #print(B.todense()) k = 9 U, s, V = sparsesvd(A, k) U2, s2, V2 = sparsesvd(B, k) print(A.todense())
# Choice of dataset generator; alternatives kept for reference:
# generator = SyntheticDataset1(startM=5000, endM=10000, startN=1000, endN=1500, pnz=0.10, noise=0.01)
# generator = FlixsterDataset()
generator = MovieLensDataset()
iterator = CenterMatrixIterator(generator.getTrainIteratorFunc())
k = 50

for i in range(1):
    X = iterator.next()

    # First iteration: seed lastX with an empty matrix of the same shape.
    if i == 0:
        lastX = scipy.sparse.csc_matrix(X.shape)

    print("About to compute SVD")
    U, s, V = SparseUtils.svdPropack(X, k)
    print("Computed SVD")

    plt.figure(0)
    plt.plot(numpy.arange(s.shape[0]), s)

    # Disabled delta-SVD experiment, kept as a string for reference.
    """
    deltaX = X - lastX
    deltaX.eliminate_zeros()
    deltaX.prune()
    print(X.nnz-lastX.nnz)

    U, s, V = SparseUtils.svdPropack(deltaX, k)

    plt.figure(1)
    plt.plot(numpy.arange(s.shape[0]), s)

    lastX = X
    """
# NOTE(review): collapsed snippet opening with an unmatched triple quote — the
# code below appears to be a string-disabled (commented-out) experiment whose
# closing quote lies outside this view; kept byte-identical. Confirm the
# enclosing string boundaries before re-enabling or reformatting.
""" import sys import logging import scipy.sparse import numpy from sparsesvd import sparsesvd from exp.util.SparseUtils import SparseUtils numpy.random.seed(21) logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) numpy.set_printoptions(precision=3, suppress=True, linewidth=100) m = 10 n = 10 r = 1 U0, s0, V0 = SparseUtils.generateLowRank((m, n), r) numInds = 10 inds = numpy.unique(numpy.random.randint(0, m*n, numInds)) A = SparseUtils.reconstructLowRank(U0, s0, V0, inds) #print(A.todense()) t0 = s0 + numpy.random.rand(s0.shape[0])*0.1 B = SparseUtils.reconstructLowRank(U0, t0, V0, inds) #print(B.todense()) k = 9 U, s, V = sparsesvd(A, k) U2, s2, V2 = sparsesvd(B, k) print(A.todense())
# NOTE(review): fragment starting mid-statement ("lastX = X else:") — the
# opening of the enclosing loop/if (and the definitions of U3/s3/V3, times,
# errors, k, p) lies outside this view; kept byte-identical. It appears to
# time RandomisedSVD.updateSvd vs SparseUtils.svdPropack and record
# reconstruction errors — confirm against the full file before editing.
lastX = X else: E = X - lastX E.eliminate_zeros() print(X.nnz, E.nnz) startTime = time.time() U3, s3, V3 = RandomisedSVD.updateSvd(X, U3, s3, V3, E, k, p) times[i, 1] = time.time() - startTime lastX = X errors[i, 1] = numpy.linalg.norm(X - (U3*s3).dot(V3.T)) #Accurate method startTime = time.time() U4, s4, V4 = SparseUtils.svdPropack(X, k) times[i, 2] = time.time() - startTime errors[i, 2] = numpy.linalg.norm(X - (U4*s4).dot(V4.T)) #Final method - just use the same SVD if i == 0: startTime = time.time() U5, s5, V5 = SparseUtils.svdPropack(X, k) times[i, 3] = time.time() - startTime errors[i, 3] = numpy.linalg.norm(X - (U5*s5).dot(V5.T)) cumtimes = numpy.cumsum(times, 0) print(cumtimes)
# Fit the vectoriser and train an LDA topic model over the corpus, then
# score a one-word query ("graph") against the indexed documents using
# three similarity measures: LDA index similarity, Hellinger distance,
# and a cosine metric.  (vectoriser/documentList are defined earlier.)
X = vectoriser.fit_transform(documentList)
print(vectoriser.get_feature_names())

corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)
id2WordDict = {i: word for i, word in enumerate(vectoriser.get_feature_names())}

k = 10
logging.getLogger('gensim').setLevel(logging.INFO)
lda = LdaModel(corpus, num_topics=k, id2word=id2WordDict, chunksize=1000, distributed=False)
index = gensim.similarities.docsim.SparseMatrixSimilarity(lda[corpus], num_features=k)

newX = vectoriser.transform(["graph"])
newX = [(j, newX[0, j]) for j in newX.nonzero()[1]]
result = lda[newX]

similarities = index[result]
similarities = sorted(enumerate(similarities), key=lambda item: item[1], reverse=True)
print(similarities)

# Compute Hellinger distance
result = [pair[1] for pair in result]
newX = scipy.sparse.csc_matrix(result)
distances = SparseUtils.hellingerDistances(index.index, newX)
print(1 - distances)

# Try cosine metric
X = Standardiser().normaliseArray(numpy.array(index.index.todense()).T).T
newX = numpy.array(newX.todense())
similarities = X.dot(newX.T).flatten()
print(similarities)