def mapfunc_sampling(it, rank):
    """Map step: project a sparse CSC chunk onto `rank` random gaussian directions."""
    import_tools()
    mtx = it.next()  # Python 2 iterator protocol; use next(it) on Python 3
    m, n = mtx.shape
    y = numpy.zeros(dtype=mtx.dtype, shape=(m, rank))
    o = numpy.random.normal(0.0, 1.0, (n, rank)).astype(y.dtype)  # random gaussian matrix
    sparsetools.csc_matvecs(m, n, rank, mtx.indptr, mtx.indices, mtx.data,
                            o.ravel(), y.ravel())  # y = mtx * o
    del o
    return enumerate(y.T)  # emit (sample index, projected row) pairs
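# A minimal sanity check (my addition, not part of the original source) of what the map
# step above computes: `csc_matvecs` accumulates y += mtx * o, so with y starting at zero
# the result must match scipy's own sparse-dense product. This assumes a scipy build that
# still exposes the (private) sparsetools module; newer releases hide it as _sparsetools.
import numpy
import scipy.sparse
try:
    from scipy.sparse import sparsetools                    # older scipy
except ImportError:
    from scipy.sparse import _sparsetools as sparsetools    # newer scipy (private module)

def _sampling_reference(mtx, rank):
    """Single-process stand-in for one call of mapfunc_sampling (hypothetical helper)."""
    m, n = mtx.shape
    y = numpy.zeros((m, rank), dtype=mtx.dtype)
    o = numpy.random.normal(0.0, 1.0, (n, rank)).astype(y.dtype)
    sparsetools.csc_matvecs(m, n, rank, mtx.indptr, mtx.indices, mtx.data,
                            o.ravel(), y.ravel())           # y += mtx * o
    assert numpy.allclose(y, mtx.dot(o))                    # same product via scipy matmul
    return enumerate(y.T)                                   # rows keyed by sample index

list(_sampling_reference(scipy.sparse.random(100, 50, density=0.1, format='csc'), rank=5))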
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=np.float64, eps=1e-6):
    """Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse}
        Input corpus as a stream (does not have to fit in RAM)
        or a sparse matrix of shape (`num_terms`, num_documents).
    rank : int
        Desired number of factors to be retained after decomposition.
    num_terms : int
        The number of features (terms) in `corpus`.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    extra_dims : int, optional
        Extra samples to be used besides the rank `k`. Can improve accuracy.
    power_iters : int, optional
        Number of power iteration steps to be used.
        Increasing the number of power iterations improves accuracy, but lowers performance.
    dtype : numpy.dtype, optional
        Enforces a type for elements of the decomposed matrix.
    eps : float, optional
        Percentage of the spectrum's energy to be discarded.

    Notes
    -----
    The corpus may be larger than RAM (iterator of vectors); if `corpus` is a `scipy.sparse.csc` instead,
    it is assumed the whole corpus fits into core memory and a different (more efficient) code path is chosen.

    This may return less than the requested number of top `rank` factors, in case the input itself is of lower rank.

    The `extra_dims` (oversampling) and especially `power_iters` (power iterations) parameters affect
    accuracy of the decomposition.

    This algorithm uses `2 + power_iters` passes over the input data. In case you can only afford a single pass,
    set `onepass=True` in :class:`~gensim.models.lsimodel.LsiModel` and avoid using this function directly.

    The decomposition algorithm is based on
    `"Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix
    decompositions" <https://arxiv.org/abs/0909.4061>`_.

    Returns
    -------
    (np.ndarray 2D, np.ndarray 1D)
        The left singular vectors and the singular values of the `corpus`.

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations", samples - rank, power_iters)

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = np.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix", str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = np.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o

        # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix", str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations", power_iters)
        for _ in range(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i', (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(
                m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
                chunk.data, o.ravel(), y.ravel()
            )
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in range(power_iters):
            logger.info("running power iteration #%i", power_iter + 1)
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs)
                # documents = columns of sparse CSC
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix", str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = np.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix", str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += np.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix", str(x.shape))
        # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        u, s, vt = scipy.linalg.svd(x)
        # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
        s = np.sqrt(s)
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s ** 2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = np.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
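# A short usage sketch (my addition): run stochastic_svd on a random sparse term-document
# matrix and compare the recovered singular values against a dense SVD. This assumes the
# gensim module-level names used above (logger, matutils, utils, sparsetools, clip_spectrum)
# are in scope -- equivalently, `from gensim.models.lsimodel import stochastic_svd`. With a
# couple of power iterations the leading singular values should agree closely; the exact
# error depends on the spectrum, so no hard tolerance is asserted here.
import numpy as np
import scipy.linalg
import scipy.sparse

corpus = scipy.sparse.random(500, 2000, density=0.01, format='csc', dtype=np.float64)
u, s = stochastic_svd(corpus, rank=10, num_terms=500, power_iters=2)
s_exact = scipy.linalg.svd(corpus.toarray(), compute_uv=False)[:len(s)]  # may keep < rank factors
print(np.max(np.abs(s - s_exact)))                                       # small for the leading values
print(np.allclose(u.T.dot(u), np.eye(u.shape[1]), atol=1e-8))            # U has orthonormal columns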
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Return (U, S): the left singular vectors and the singular values of the input
    data stream `corpus` [4]_. The corpus may be larger than RAM (iterator of vectors).

    This may return less than the requested number of top `rank` factors, in case
    the input itself is of lower rank. The `extra_dims` (oversampling) and
    especially `power_iters` (power iterations) parameters affect accuracy of the
    decomposition.

    This algorithm uses `2+power_iters` passes over the input data. In case you can
    only afford a single pass, set `onepass=True` in :class:`LsiModel` and avoid
    using this function directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [4] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
                                    chunk.data, o.ravel(), y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = scipy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s)  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
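# The `y = [y]; q, _ = matutils.qr_destroy(y)` idiom above is a memory trick: the array is
# handed over inside a one-element list so the callee can pop it and drop the caller's last
# reference, letting the QR reuse or free that buffer instead of keeping two
# (num_terms x samples) arrays alive at once. A plain-numpy stand-in of the assumed
# interface (gensim's real version, I believe, decomposes in place via LAPACK routines):
import numpy

def qr_destroy_sketch(la):
    a = la[0]
    del la[0]                    # drop the caller's reference; `a` is now the only one
    q, r = numpy.linalg.qr(a)    # the real implementation overwrites `a` during the QR
    return q, r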
def stochasticSvd(corpus, rank, num_terms, chunks=20000, extra_dims=None,
                  power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed
    input corpus `corpus` [3]_.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. The `extra_dims` (oversampling) and
    especially `power_iters` (power iterations) parameters affect accuracy of the
    decomposition.

    This algorithm uses `2+power_iters` passes over the data. In case you can only
    afford a single pass over the input corpus, set `onepass=True` in
    :class:`LsiModel` and avoid using this algorithm directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # must ignore near-zero eigenvalues (probably numerical error); the associated
    # eigenvectors are typically unstable/garbage
    eps = max(float(eps), 1e-9)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o
        y = y.astype(dtype)  # TODO unlike numpy, scipy actually makes a copy even when dtype=y.dtype...marginally inefficient
        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            y = corpus.T * y
            y = corpus * y
    else:
        # note: Python 2 only -- tuple-parameter lambdas were removed in Python 3 (PEP 3113)
        chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
        num_docs = 0
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
            chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunks  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr,  # y = y + chunk * o
                                    chunk.indices, chunk.data, o.ravel(), y.ravel())
            del chunk, o

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = y.copy()
            y[:] = 0.0
            chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
            for chunk_no, (key, group) in enumerate(chunker):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunks, num_docs))
                chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                y += tmp
            del yold

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    y = [y]
    q, r = matutils.qr_destroy(y)  # orthonormalize the range
    del y
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = numpy.asfortranarray(q[:, :samples].T)  # discard bogus columns, in case Y was rank-deficient
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunks` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(samples, samples), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunks, num_docs))
            chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del chunk, b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = numpy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s)  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus

    logger.info("computing the final decomposition")
    keep = clipSpectrum(s**2, rank, discard=eps)
    u = numpy.asfortranarray(u[:, :keep])
    s = s[:keep]
    gemm = matutils.blas('gemm', u)
    u = gemm(1.0, qt, u, trans_a=True)
    return u, s
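# `clipSpectrum` is called above but not defined in this excerpt. A sketch of the assumed
# behaviour, modeled on gensim's clip_spectrum: given a spectrum (here eigenvalues, or the
# diagonal of R from the QR), return how many leading factors to keep so that at most a
# `discard` fraction of the total energy is thrown away, capped at `k`. This protects the
# final factors from tiny, numerically unstable eigenvalues.
import numpy

def clip_spectrum_sketch(s, k, discard=1e-6):
    s = numpy.sort(numpy.abs(s))[::-1]                    # largest magnitude first
    rel_residual = 1.0 - numpy.cumsum(s) / numpy.sum(s)   # energy left after each factor
    small = 1 + int(numpy.sum(rel_residual > discard))    # factors before residual < discard
    return min(k, small)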
def stochasticSvd(corpus, rank, num_terms=None, chunks=20000, extra_dims=None,
                  dtype=numpy.float64, eps=1e-6):
    """
    Return U, S -- the left singular vectors and the singular values of the streamed
    input corpus.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. Also note that the decomposition is unique
    up to the sign of the left singular vectors (columns of U).

    This is a streamed, two-pass algorithm, without power-iterations. In case you can
    only afford a single pass over the input corpus, set `onepass=True` in LsiModel
    and avoid using this algorithm.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)

    if num_terms is None:
        logger.warning("number of terms not provided; will scan the corpus (ONE EXTRA PASS, MAY BE SLOW) to determine it")
        num_terms = len(utils.dictFromCorpus(corpus))
        logger.info("found %i terms" % num_terms)
    else:
        num_terms = int(num_terms)

    # must ignore near-zero eigenvalues (probably numerical error); the associated
    # eigenvectors are typically unstable/garbage
    eps = max(float(eps), 1e-9)

    # first pass: construct the orthonormal action matrix Q = orth(Y) = orth(A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st pass: constructing %s action matrix" % str(y.shape))

    # note: Python 2 only -- tuple-parameter lambdas were removed in Python 3 (PEP 3113)
    chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        # construct the chunk as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
        m, n = chunk.shape
        assert m == num_terms
        assert n <= chunks  # the very last chunk of A may be smaller
        logger.debug("multiplying chunk * gauss")
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr,  # y = y + chunk * o
                                chunk.indices, chunk.data, o.ravel(), y.ravel())
        del chunk, o

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    q, r = numpy.linalg.qr(y)  # orthonormalize the range
    del y  # Y not needed anymore, free up mem
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = q[:, :samples].T.copy()  # discard bogus columns, in case Y was rank-deficient
    del q

    # second pass: construct the covariance matrix X = B * B.T, where B = Q.T * A
    # again, construct X incrementally, in chunks of `chunks` documents from the streaming
    # input corpus A, to avoid using O(number of documents) memory
    x = numpy.zeros(shape=(samples, samples), dtype=dtype)
    logger.info("2nd pass: constructing %s covariance matrix" % str(x.shape))
    chunker = itertools.groupby(enumerate(corpus), key=lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
        b = qt * chunk  # dense * sparse matrix multiply
        x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
        del chunk, b

    # now we're ready to compute decomposition of the small matrix X
    logger.info("computing decomposition of the %s covariance matrix" % str(x.shape))
    u, s, vt = numpy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
    keep = clipSpectrum(s, rank, discard=eps)

    logger.info("computing the final decomposition")
    s = numpy.sqrt(s[:keep])  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    u = numpy.dot(qt.T, u[:, :keep])  # go back from left singular vectors of B to left singular vectors of the corpus
    return u.astype(dtype), s.astype(dtype)
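# The streamed two-pass scheme above is easier to follow in its dense, in-memory form.
# A minimal sketch (my addition) of the same Halko-Martinsson-Tropp recipe: sample the
# range with Y = A*O, orthonormalize into Q, build the small covariance X = B*B.T with
# B = Q.T*A, decompose X, and map back -- singular values of A are the square roots of
# the eigenvalues of X.
import numpy

def dense_two_pass_svd(a, rank, oversample=10):
    m, n = a.shape
    o = numpy.random.normal(0.0, 1.0, (n, rank + oversample))
    q, _ = numpy.linalg.qr(a.dot(o))    # pass 1: Q = orth(A * O)
    b = q.T.dot(a)                      # pass 2: B = Q.T * A
    x = b.dot(b.T)                      # small (samples x samples) covariance matrix
    u, s2, _ = numpy.linalg.svd(x)      # spectrum of X, already sorted
    s = numpy.sqrt(s2[:rank])           # back to singular values of A
    u = q.dot(u[:, :rank])              # back to left singular vectors of A
    return u, s

a = numpy.random.rand(200, 5).dot(numpy.random.rand(5, 300))         # exactly rank-5 input
u, s = dense_two_pass_svd(a, rank=5)
print(numpy.allclose(s, numpy.linalg.svd(a, compute_uv=False)[:5]))  # exact up to fp error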
def __mul__(self, other):
    """
    Matrix multiplication
    :param other: CscMat instance
    :return: CscMat instance
    """
    if isinstance(other, CscMat):  # mat-mat multiplication
        # 2-pass matrix multiplication
        Cp = np.empty(self.n + 1, dtype=np.int32)
        sptools.csc_matmat_pass1(self.n, other.m,
                                 self.indptr, self.indices,
                                 other.indptr, other.indices, Cp)
        nnz = Cp[-1]
        Ci = np.empty(nnz, dtype=np.int32)
        Cx = np.empty(nnz, dtype=np.float64)
        sptools.csc_matmat_pass2(self.n, other.m,
                                 self.indptr, self.indices, self.data,
                                 other.indptr, other.indices, other.data,
                                 Cp, Ci, Cx)
        return CscMat(n=self.m, m=other.m, indptr=Cp, indices=Ci, data=Cx)

    elif isinstance(other, np.ndarray):  # multiply by a vector or array of vectors
        if len(other.shape) == 1:
            y = np.zeros(self.m, dtype=np.float64)
            sptools.csc_matvec(self.m, self.n, self.indptr, self.indices,
                               self.data, other, y)
            return y
        elif len(other.shape) == 2:
            '''
            * Input Arguments:
            *   I  n_row             - number of rows in A
            *   I  n_col             - number of columns in A
            *   I  n_vecs            - number of column vectors in X and Y
            *   I  Ap[n_row+1]       - row pointer
            *   I  Aj[nnz(A)]        - column indices
            *   T  Ax[nnz(A)]        - nonzeros
            *   T  Xx[n_col,n_vecs]  - input vector
            *
            * Output Arguments:
            *   T  Yx[n_row,n_vecs]  - output vector
            *
            * Note:
            *   Output array Yx must be preallocated
            *
            * void csc_matvecs(const I n_row, const I n_col, const I n_vecs,
            *                  const I Ap[], const I Ai[], const T Ax[],
            *                  const T Xx[], T Yx[])
            '''
            n_col, n_vecs = other.shape
            y = np.zeros((self.m, n_vecs), dtype=np.float64)
            sptools.csc_matvecs(self.m, self.n, n_vecs,
                                self.indptr, self.indices, self.data,
                                other, y)
            return y

    elif isinstance(other, (float, int)):  # multiply by a scalar value
        C = self.copy()
        C.data *= other
        return C

    else:
        raise Exception('Type not supported')
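# A standalone check (my addition) of the two sparsetools kernels this __mul__ dispatches
# to, using a plain scipy CSC matrix rather than the CscMat wrapper (whose constructor is
# not shown here). csc_matvec computes y += A*x and csc_matvecs computes Y += A*X with X
# laid out as (n_col, n_vecs) in row-major order; both write into preallocated,
# zero-initialized outputs, so they must agree with A.dot(...).
import numpy as np
import scipy.sparse
try:
    from scipy.sparse import sparsetools as sptools
except ImportError:
    from scipy.sparse import _sparsetools as sptools   # private module in newer scipy

a = scipy.sparse.random(6, 4, density=0.5, format='csc')
x = np.random.rand(4)
y = np.zeros(6)
sptools.csc_matvec(6, 4, a.indptr, a.indices, a.data, x, y)
assert np.allclose(y, a.dot(x))

xs = np.random.rand(4, 3)
ys = np.zeros((6, 3))
sptools.csc_matvecs(6, 4, 3, a.indptr, a.indices, a.data, xs.ravel(), ys.ravel())
assert np.allclose(ys, a.dot(xs))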
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed
    input corpus `corpus` [3]_.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. The `extra_dims` (oversampling) and
    especially `power_iters` (power iterations) parameters affect accuracy of the
    decomposition.

    This algorithm uses `2+power_iters` passes over the data. In case you can only
    afford a single pass over the input corpus, set `onepass=True` in
    :class:`LsiModel` and avoid using this algorithm directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
                                    chunk.data, o.ravel(), y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = numpy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s)  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
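# Power iterations in one line of math: the sampling step Y = A*O above becomes
# Y = (A*A.T)^q * A*O, which raises each singular value to the (2q+1)-th power and makes
# the leading subspace dominate the random samples much faster. A tiny self-contained demo
# of the effect (my addition, plain numpy, nothing from gensim):
import numpy

def randomized_range(a, samples, power_iters):
    y = a.dot(numpy.random.normal(0.0, 1.0, (a.shape[1], samples)))
    q, _ = numpy.linalg.qr(y)
    for _ in range(power_iters):
        q, _ = numpy.linalg.qr(a.dot(a.T.dot(q)))   # re-orthonormalize each step for stability
    return q

a = numpy.random.rand(300, 400)
s_exact = numpy.linalg.svd(a, compute_uv=False)[:10]
for n_iters in (0, 2):
    q = randomized_range(a, samples=20, power_iters=n_iters)
    s_approx = numpy.linalg.svd(q.T.dot(a), compute_uv=False)[:10]
    # the approximation error typically shrinks sharply as power iterations are added
    print(n_iters, numpy.max(numpy.abs(s_approx - s_exact)))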