Example #1
def mapfunc_sampling(it, rank):
    import_tools()  # presumably pulls numpy and sparsetools into the worker's namespace
    mtx = next(it)  # take the next sparse CSC chunk of the corpus from the input iterator
    m, n = mtx.shape
    y = numpy.zeros(dtype=mtx.dtype, shape=(m, rank))
    o = numpy.random.normal(0.0, 1.0, (n, rank)).astype(y.dtype)  # random Gaussian test matrix
    sparsetools.csc_matvecs(m, n, rank, mtx.indptr, mtx.indices,
                            mtx.data, o.ravel(), y.ravel())  # y += mtx * o
    del o
    return enumerate(y.T)  # yield (sample_index, projected column) pairs
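
The `csc_matvecs` call accumulates the product of a CSC matrix with a block of dense column vectors into a preallocated output. A minimal sketch of the same computation with the public SciPy API (the matrix and sizes here are made up for illustration):

import numpy as np
import scipy.sparse as sp

rank = 10
mtx = sp.random(1000, 200, density=0.01, format="csc", dtype=np.float64, random_state=0)

o = np.random.default_rng(0).normal(0.0, 1.0, (mtx.shape[1], rank))  # random Gaussian test matrix
y = np.zeros((mtx.shape[0], rank))

# sparsetools.csc_matvecs(m, n, rank, indptr, indices, data, o.ravel(), y.ravel())
# adds mtx @ o into the preallocated y; with plain SciPy the same result is simply:
y += mtx @ o

# enumerate(y.T) then yields one projected column (one sample) per output record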
Example #3
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=np.float64, eps=1e-6):
    """Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse}
        Input corpus as a stream (does not have to fit in RAM)
        or a sparse matrix of shape (`num_terms`, num_documents).
    rank : int
        Desired number of factors to be retained after decomposition.
    num_terms : int
        The number of features (terms) in `corpus`.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    extra_dims : int, optional
        Extra samples to be used besides the requested `rank`. Can improve accuracy.
    power_iters : int, optional
        Number of power iteration steps to be used. Increasing the number of power iterations improves accuracy,
        but lowers performance.
    dtype : numpy.dtype, optional
        Enforces a type for elements of the decomposed matrix.
    eps : float, optional
        Percentage of the spectrum's energy to be discarded.

    Notes
    -----
    The corpus may be larger than RAM (iterator of vectors). If `corpus` is a `scipy.sparse.csc` matrix instead,
    it is assumed the whole corpus fits into core memory and a different (more efficient) code path is chosen.
    This may return fewer than the requested number of top `rank` factors, in case the input itself is of lower rank.
    The `extra_dims` (oversampling) and especially `power_iters` (power iterations) parameters affect accuracy of the
    decomposition.

    This algorithm uses `2 + power_iters` passes over the input data. In case you can only afford a single pass,
    set `onepass=True` in :class:`~gensim.models.lsimodel.LsiModel` and avoid using this function directly.

    The decomposition algorithm is based on `"Finding structure with randomness:
    Probabilistic algorithms for constructing approximate matrix decompositions" <https://arxiv.org/abs/0909.4061>`_.


    Returns
    -------
    (np.ndarray 2D, np.ndarray 1D)
        The left singular vectors and the singular values of the `corpus`.

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations", samples - rank, power_iters)

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = np.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix", str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = np.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o

        # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix", str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations", power_iters)
        for _ in range(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i', (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(
                m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
                chunk.data, o.ravel(), y.ravel()
            )
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in range(power_iters):
            logger.info("running power iteration #%i", power_iter + 1)
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs)
                # documents = columns of sparse CSC
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix", str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = np.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix", str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += np.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix", str(x.shape))
        # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        u, s, vt = scipy.linalg.svd(x)
        # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
        s = np.sqrt(s)
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s ** 2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = np.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
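
A quick usage sketch, assuming the function above is the one exported as `gensim.models.lsimodel.stochastic_svd`; the toy corpus and its dimensions are made up for illustration:

import numpy as np
import scipy.sparse
from gensim.models.lsimodel import stochastic_svd  # assumed import path for the function above

# a toy streamed corpus: each document is a list of (term_id, weight) pairs
corpus = [
    [(0, 1.0), (2, 2.0)],
    [(1, 1.0), (2, 1.0), (4, 0.5)],
    [(0, 0.5), (3, 1.0)],
    [(1, 2.0), (4, 1.0)],
]
num_terms = 5

# streamed code path: the corpus is consumed chunk by chunk
u, s = stochastic_svd(corpus, rank=2, num_terms=num_terms, chunksize=2, power_iters=2)
print(u.shape, s.shape)  # at most (5, 2) and (2,); fewer factors if the corpus has lower rank

# in-memory code path: pass the same data as a sparse term-document matrix instead
dense = np.zeros((num_terms, len(corpus)))
for docno, doc in enumerate(corpus):
    for term_id, weight in doc:
        dense[term_id, docno] = weight
u2, s2 = stochastic_svd(scipy.sparse.csc_matrix(dense), rank=2, num_terms=num_terms, power_iters=2)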
Example #4
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                  power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Return (U, S): the left singular vectors and the singular values of the input
    data stream `corpus` [4]_. The corpus may be larger than RAM (iterator of vectors).

    This may return less than the requested number of top `rank` factors, in case
    the input itself is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.

    This algorithm uses `2+power_iters` passes over the input data. In case you can only
    afford a single pass, set `onepass=True` in :class:`LsiModel` and avoid using
    this function directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [4] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel()) # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y) # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q) # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
            sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o
                                    chunk.data, o.ravel(), y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y) # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q) # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk # dense * sparse matrix multiply
            del chunk
            x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = scipy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
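
The two phases above follow the randomized SVD recipe of Halko, Martinsson and Tropp: find an orthonormal basis of the range, optionally sharpen it with power iterations, then decompose the small projected matrix. A compact dense NumPy sketch of that recipe, handy for sanity-checking the streamed versions on small data:

import numpy as np

def randomized_svd_dense(a, rank, oversample=10, power_iters=0, seed=0):
    """Dense, in-memory version of the algorithm above: Q = orth((A A^T)^q A O), then SVD of Q^T A."""
    rng = np.random.default_rng(seed)
    o = rng.normal(size=(a.shape[1], rank + oversample))  # random Gaussian test matrix O
    q, _ = np.linalg.qr(a @ o)                            # orthonormal basis of the range of A O
    for _ in range(power_iters):                          # power iterations improve accuracy
        q, _ = np.linalg.qr(a @ (a.T @ q))
    b = q.T @ a                                           # small projected matrix B = Q^T A
    u_b, s, _ = np.linalg.svd(b, full_matrices=False)
    return q @ u_b[:, :rank], s[:rank]                    # lift left vectors back to term space

rng = np.random.default_rng(1)
a = rng.normal(size=(300, 10)) @ rng.normal(size=(10, 120))    # a rank-10 toy "corpus"
u, s = randomized_svd_dense(a, rank=5, power_iters=2)
print(np.allclose(s, np.linalg.svd(a, compute_uv=False)[:5]))  # exact here, since samples >= true rank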
Example #5
def stochasticSvd(corpus,
                  rank,
                  num_terms,
                  chunks=20000,
                  extra_dims=None,
                  power_iters=0,
                  dtype=numpy.float64,
                  eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed 
    input corpus `corpus` [3]_.
    
    This may actually return less than the requested number of top `rank` factors, 
    in case the input is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.
    
    This algorithm uses `2+power_iters` passes over the data. In case you can only 
    afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel` 
    and avoid using this algorithm directly.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    
    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole 
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(
            10, 2 * rank
        )  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" %
                (samples - rank, power_iters))

    num_terms = int(num_terms)

    eps = max(
        float(eps), 1e-9
    )  # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (
            m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
            y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr,
                                corpus.indices, corpus.data, o.ravel(),
                                y.ravel())  # y = corpus * o
        del o
        y = y.astype(
            dtype
        )  # TODO unlike numpy, scipy actually makes a copy even when dtype=y.dtype...marginally inefficient
        logger.debug("running %i power iterations" % power_iters)
        for power_iter in range(power_iters):
            y = corpus.T * y
            y = corpus * y
    else:
        chunker = itertools.groupby(enumerate(corpus),
                                    key=lambda item: item[0] // chunks)
        num_docs = 0
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
            chunk = matutils.corpus2csc(
                (doc for _, doc in group), num_terms=num_terms,
                dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunks  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
                dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(
                num_terms,
                n,
                samples,
                chunk.indptr,  # y = y + chunk * o
                chunk.indices,
                chunk.data,
                o.ravel(),
                y.ravel())
            del chunk, o

        for power_iter in range(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = y.copy()
            y[:] = 0.0
            chunker = itertools.groupby(enumerate(corpus),
                                        key=lambda item: item[0] // chunks)
            for chunk_no, (key, group) in enumerate(chunker):
                logger.info('PROGRESS: at document #%i/%i' %
                            (chunk_no * chunks, num_docs))
                chunk = matutils.corpus2csc(
                    (doc for _, doc in group),
                    num_terms=num_terms,
                    dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                y += tmp
            del yold

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    y = [y]
    q, r = matutils.qr_destroy(y)  # orthonormalize the range
    del y
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = numpy.asfortranarray(
        q[:, :samples].T
    )  # discard bogus columns, in case Y was rank-deficient
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunks` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(samples, samples), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix" %
                    str(x.shape))
        chunker = itertools.groupby(enumerate(corpus),
                                    key=lambda item: item[0] // chunks)
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i/%i' %
                        (chunk_no * chunks, num_docs))
            chunk = matutils.corpus2csc((doc for _, doc in group),
                                        num_terms=num_terms,
                                        dtype=dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            x += numpy.dot(
                b, b.T
            )  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del chunk, b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" %
                    str(x.shape))
        u, s, vt = numpy.linalg.svd(
            x
        )  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(
            s
        )  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus

    logger.info("computing the final decomposition")
    keep = clipSpectrum(s**2, rank, discard=eps)
    u = numpy.asfortranarray(u[:, :keep])
    s = s[:keep]
    gemm = matutils.blas('gemm', u)
    u = gemm(1.0, qt, u, trans_a=True)
    return u, s
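
The `clipSpectrum` helper used above (to drop bogus columns of a rank-deficient Y, and later to truncate the final spectrum) is not shown on this page. A hypothetical stand-in illustrating the idea: keep at most `k` values and discard the tail whose relative contribution to the total energy is negligible:

import numpy as np

def clip_spectrum_sketch(s, k, discard=0.001):
    # `s` is assumed sorted in decreasing order and non-negative (singular values or their squares)
    s = np.asarray(s, dtype=float)
    rel = s / s.sum()                                  # relative contribution of each value
    keep = int(np.sum(rel > min(discard, 1.0 / k)))    # drop values that contribute almost nothing
    return max(1, min(k, keep))

print(clip_spectrum_sketch([5.0, 3.0, 1.0, 1e-12], k=4))  # -> 3: the near-zero tail value is dropped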
Example #6
def stochasticSvd(corpus,
                  rank,
                  num_terms=None,
                  chunks=20000,
                  extra_dims=None,
                  dtype=numpy.float64,
                  eps=1e-6):
    """
    Return U, S -- the left singular vectors and the singular values of the streamed 
    input corpus.
    
    This may actually return less than the requested number of top `rank` factors, 
    in case the input is of lower rank. Also note that the decomposition is unique
    up to the sign of the left singular vectors (columns of U).
    
    This is a streamed, two-pass algorithm, without power-iterations. In case you can 
    only afford a single pass over the input corpus, set `onepass=True` in LsiModel and 
    avoid using this algorithm.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(
            10, 2 * rank
        )  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)

    if num_terms is None:
        logger.warning(
            "number of terms not provided; will scan the corpus (ONE EXTRA PASS, MAY BE SLOW) to determine it"
        )
        num_terms = len(utils.dictFromCorpus(corpus))
        logger.info("found %i terms" % num_terms)
    else:
        num_terms = int(num_terms)

    eps = max(
        float(eps), 1e-9
    )  # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage

    # first pass: construct the orthonormal action matrix Q = orth(Y) = orth(A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st pass: constructing %s action matrix" % str(y.shape))

    chunker = itertools.groupby(enumerate(corpus),
                                key=lambda item: item[0] // chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        # construct the chunk as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
        chunk = matutils.corpus2csc(
            (doc for _, doc in group), num_terms=num_terms,
            dtype=dtype)  # documents = columns of sparse CSC
        m, n = chunk.shape
        assert m == num_terms
        assert n <= chunks  # the very last chunk of A may be smaller
        logger.debug("multiplying chunk * gauss")
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
            dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(
            num_terms,
            n,
            samples,
            chunk.indptr,  # y = y + chunk * o
            chunk.indices,
            chunk.data,
            o.ravel(),
            y.ravel())
        del chunk, o

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    q, r = numpy.linalg.qr(y)  # orthonormalize the range
    del y  # Y not needed anymore, free up mem
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = q[:, :samples].T.copy(
    )  # discard bogus columns, in case Y was rank-deficient
    del q

    # second pass: construct the covariance matrix X = B * B.T, where B = Q.T * A
    # again, construct X incrementally, in chunks of `chunks` documents from the streaming
    # input corpus A, to avoid using O(number of documents) memory
    x = numpy.zeros(shape=(samples, samples), dtype=dtype)
    logger.info("2nd pass: constructing %s covariance matrix" % str(x.shape))
    chunker = itertools.groupby(enumerate(corpus),
                                key=lambda item: item[0] // chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        chunk = matutils.corpus2csc((doc for _, doc in group),
                                    num_terms=num_terms,
                                    dtype=dtype)
        b = qt * chunk  # dense * sparse matrix multiply
        x += numpy.dot(
            b, b.T
        )  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
        del chunk, b

    # now we're ready to compute decomposition of the small matrix X
    logger.info("computing decomposition of the %s covariance matrix" %
                str(x.shape))
    u, s, vt = numpy.linalg.svd(
        x
    )  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
    keep = clipSpectrum(s, rank, discard=eps)

    logger.info("computing the final decomposition")
    s = numpy.sqrt(
        s[:keep]
    )  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    u = numpy.dot(
        qt.T, u[:, :keep]
    )  # go back from left singular vectors of B to left singular vectors of the corpus
    return u.astype(dtype), s.astype(dtype)
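
The second pass above never materializes B = Q.T * A; it only accumulates the small covariance X = B * B.T chunk by chunk and recovers B's singular values as the square roots of X's. A tiny self-contained check of that identity:

import numpy as np

b = np.random.default_rng(0).normal(size=(8, 50))  # stands in for B = Q.T * A (samples x num_docs)

x = b @ b.T                                        # the (samples x samples) covariance built above
u_x, s_x, _ = np.linalg.svd(x)                     # decomposition of the tiny matrix X
s_b = np.linalg.svd(b, compute_uv=False)

# singular values of B are the square roots of those of X = B B^T, and the left singular
# vectors of X coincide (up to sign) with the left singular vectors of B
print(np.allclose(np.sqrt(s_x), s_b))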
Example #7
    def __mul__(self, other):
        """
        Matrix multiplication
        :param other: CscMat instance
        :return: CscMat instance
        """
        if isinstance(other, CscMat):  # mat-mat multiplication
            # 2-pass matrix multiplication
            Cp = np.empty(self.n + 1, dtype=np.int32)

            sptools.csc_matmat_pass1(self.n, other.m, self.indptr,
                                     self.indices, other.indptr, other.indices,
                                     Cp)
            nnz = Cp[-1]
            Ci = np.empty(nnz, dtype=np.int32)
            Cx = np.empty(nnz, dtype=np.float64)

            sptools.csc_matmat_pass2(self.n, other.m, self.indptr,
                                     self.indices, self.data, other.indptr,
                                     other.indices, other.data, Cp, Ci, Cx)

            return CscMat(n=self.m, m=other.m, indptr=Cp, indices=Ci, data=Cx)

        elif isinstance(
                other, np.ndarray):  # multiply by a vector or array of vectors

            if len(other.shape) == 1:
                y = np.zeros(self.m, dtype=np.float64)
                sptools.csc_matvec(self.m, self.n, self.indptr, self.indices,
                                   self.data, other, y)
                return y
            elif len(other.shape) == 2:
                '''
                
                 * Input Arguments:
                 *   I  n_row            - number of rows in A
                 *   I  n_col            - number of columns in A
                 *   I  n_vecs           - number of column vectors in X and Y
                 *   I  Ap[n_row+1]      - row pointer
                 *   I  Aj[nnz(A)]       - column indices
                 *   T  Ax[nnz(A)]       - nonzeros
                 *   T  Xx[n_col,n_vecs] - input vector
                 *
                 * Output Arguments:
                 *   T  Yx[n_row,n_vecs] - output vector
                 *
                 * Note:
                 *   Output array Yx must be preallocated
                 *
                
                void csc_matvecs(const I n_row,
                                 const I n_col,
                                 const I n_vecs,
                                 const I Ap[],
                                 const I Ai[],
                                 const T Ax[],
                                 const T Xx[],
                                       T Yx[])
                '''
                n_col, n_vecs = other.shape

                y = np.zeros((self.m, n_vecs), dtype=np.float64)
                sptools.csc_matvecs(self.m, self.n, n_vecs, self.indptr,
                                    self.indices, self.data, other, y)
                return y

        elif isinstance(other, float) or isinstance(
                other, int):  # multiply by a scalar value
            C = self.copy()
            C.data *= other
            return C

        else:
            raise TypeError('Type not supported')
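
The matrix-matrix branch relies on SciPy's internal two-pass kernels: `csc_matmat_pass1` only counts the nonzeros of each output column so the index and data arrays can be allocated exactly, and `csc_matmat_pass2` fills them in. A slow pure-Python sketch of that two-pass pattern (for illustration only; recent SciPy hides these kernels and `A @ B` does the same job):

import numpy as np
import scipy.sparse as sp

def csc_matmat_twopass(A, B):
    """Two-pass CSC @ CSC multiply, mirroring the pass1/pass2 split used above."""
    m, k = A.shape
    assert k == B.shape[0]
    p = B.shape[1]
    # pass 1: count structural nonzeros of every output column -> column pointer Cp
    Cp = np.zeros(p + 1, dtype=np.int64)
    for j in range(p):
        rows = set()
        for bi in range(B.indptr[j], B.indptr[j + 1]):
            a_col = B.indices[bi]                       # row of B == column of A to accumulate
            rows.update(A.indices[A.indptr[a_col]:A.indptr[a_col + 1]])
        Cp[j + 1] = Cp[j] + len(rows)
    # pass 2: arrays can now be allocated exactly; fill row indices and values
    Ci = np.empty(Cp[-1], dtype=np.int64)
    Cx = np.empty(Cp[-1], dtype=np.float64)
    for j in range(p):
        acc = {}
        for bi in range(B.indptr[j], B.indptr[j + 1]):
            a_col, b_val = B.indices[bi], B.data[bi]
            for ai in range(A.indptr[a_col], A.indptr[a_col + 1]):
                acc[A.indices[ai]] = acc.get(A.indices[ai], 0.0) + A.data[ai] * b_val
        rows = sorted(acc)
        Ci[Cp[j]:Cp[j + 1]] = rows
        Cx[Cp[j]:Cp[j + 1]] = [acc[r] for r in rows]
    return sp.csc_matrix((Cx, Ci, Cp), shape=(m, p))

A = sp.random(6, 5, density=0.4, format="csc", random_state=0)
B = sp.random(5, 4, density=0.4, format="csc", random_state=1)
assert np.allclose(csc_matmat_twopass(A, B).toarray(), (A @ B).toarray())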
Example #8
def stochasticSvd(corpus, rank, num_terms=None, chunks=20000, extra_dims=None, dtype=numpy.float64, eps=1e-6):
    """
    Return U, S -- the left singular vectors and the singular values of the streamed 
    input corpus.
    
    This may actually return less than the requested number of top `rank` factors, 
    in case the input is of lower rank. Also note that the decomposition is unique
    up to the sign of the left singular vectors (columns of U).
    
    This is a streamed, two-pass algorithm, without power-iterations. In case you can 
    only afford a single pass over the input corpus, set `onepass=True` in LsiModel and 
    avoid using this algorithm.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    
    if num_terms is None:
        logger.warning("number of terms not provided; will scan the corpus (ONE EXTRA PASS, MAY BE SLOW) to determine it")
        num_terms = len(utils.dictFromCorpus(corpus))
        logger.info("found %i terms" % num_terms)
    else:
        num_terms = int(num_terms)
    
    eps = max(float(eps), 1e-9) # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage
    
    # first pass: construct the orthonormal action matrix Q = orth(Y) = orth(A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one 
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype = dtype, shape = (num_terms, samples))
    logger.info("1st pass: constructing %s action matrix" % str(y.shape))
    
    chunker = itertools.groupby(enumerate(corpus), key = lambda item: item[0] // chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        # construct the chunk as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
        m, n = chunk.shape
        assert m == num_terms
        assert n <= chunks # the very last chunk of A may be smaller
        logger.debug("multiplying chunk * gauss")
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
        sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr, # y = y + chunk * o
                                chunk.indices, chunk.data, o.ravel(), y.ravel())
        del chunk, o
    
    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    q, r = numpy.linalg.qr(y) # orthonormalize the range
    del y # Y not needed anymore, free up mem
    samples = clipSpectrum(numpy.diag(r), samples, discard = eps)
    qt = q[:, :samples].T.copy() # discard bogus columns, in case Y was rank-deficient
    del q
    
    # second pass: construct the covariance matrix X = B * B.T, where B = Q.T * A
    # again, construct X incrementally, in chunks of `chunks` documents from the streaming 
    # input corpus A, to avoid using O(number of documents) memory
    x = numpy.zeros(shape = (samples, samples), dtype = dtype)
    logger.info("2nd pass: constructing %s covariance matrix" % str(x.shape))
    chunker = itertools.groupby(enumerate(corpus), key = lambda item: item[0] // chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
        b = qt * chunk # dense * sparse matrix multiply
        x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
        del chunk, b
    
    # now we're ready to compute decomposition of the small matrix X
    logger.info("computing decomposition of the %s covariance matrix" % str(x.shape))
    u, s, vt = numpy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
    keep = clipSpectrum(s, rank, discard = eps)
    
    logger.info("computing the final decomposition")
    s = numpy.sqrt(s[:keep]) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    u = numpy.dot(qt.T, u[:, :keep]) # go back from left singular vectors of B to left singular vectors of the corpus
    return u.astype(dtype), s.astype(dtype)
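
This is the two-pass variant without power iterations. When the spectrum of the input decays slowly, the plain range finder loses accuracy, which is exactly what the `power_iters` parameter of the other variants addresses. A small synthetic demonstration of the effect (made-up matrix, not gensim code):

import numpy as np

rng = np.random.default_rng(0)
# build a matrix with a slowly decaying spectrum (singular values ~ 1/sqrt(i)): the hard case
u, _ = np.linalg.qr(rng.normal(size=(400, 100)))
v, _ = np.linalg.qr(rng.normal(size=(150, 100)))
a = (u / np.sqrt(np.arange(1, 101))) @ v.T

def range_error(a, samples, power_iters):
    """Spectral norm of A - Q Q^T A for the randomized range finder used by these functions."""
    q, _ = np.linalg.qr(a @ rng.normal(size=(a.shape[1], samples)))
    for _ in range(power_iters):
        q, _ = np.linalg.qr(a @ (a.T @ q))
    return np.linalg.norm(a - q @ (q.T @ a), 2)

for iters in (0, 1, 2):
    print(iters, range_error(a, samples=20, power_iters=iters))  # the error shrinks with more iterations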
Example #9
def stochasticSvd(corpus, rank, num_terms, chunks=20000, extra_dims=None, 
                  power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed 
    input corpus `corpus` [3]_.
    
    This may actually return less than the requested number of top `rank` factors, 
    in case the input is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.
    
    This algorithm uses `2+power_iters` passes over the data. In case you can only 
    afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel` 
    and avoid using this algorithm directly.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    
    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole 
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))
    
    num_terms = int(num_terms)
    
    eps = max(float(eps), 1e-9) # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage
    
    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one 
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype = dtype, shape = (num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))
    
    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, 
                                corpus.data, o.ravel(), y.ravel()) # y = corpus * o
        del o
        y = y.astype(dtype) # TODO unlike numpy, scipy actually makes a copy even when dtype=y.dtype...marginally inefficient
        logger.debug("running %i power iterations" % power_iters)
        for power_iter in range(power_iters):
            y = corpus.T * y
            y = corpus * y
    else:
        chunker = itertools.groupby(enumerate(corpus), key = lambda item: item[0] // chunks)
        num_docs = 0
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
            chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunks # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
            sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr, # y = y + chunk * o
                                    chunk.indices, chunk.data, o.ravel(), y.ravel())
            del chunk, o
        
        for power_iter in range(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = y.copy()
            y[:] = 0.0
            chunker = itertools.groupby(enumerate(corpus), key = lambda item: item[0] // chunks)
            for chunk_no, (key, group) in enumerate(chunker):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunks, num_docs))
                chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                y += tmp
            del yold
    
    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    y = [y]
    q, r = matutils.qr_destroy(y) # orthonormalize the range
    del y
    samples = clipSpectrum(numpy.diag(r), samples, discard = eps)
    qt = numpy.asfortranarray(q[:, :samples].T) # discard bogus columns, in case Y was rank-deficient
    del q
    
    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunks` documents from the streaming 
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape = (samples, samples), dtype = dtype)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        chunker = itertools.groupby(enumerate(corpus), key = lambda item: item[0] // chunks)
        for chunk_no, (key, group) in enumerate(chunker):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunks, num_docs))
            chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
            b = qt * chunk # dense * sparse matrix multiply
            x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del chunk, b
    
        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = numpy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
        
    logger.info("computing the final decomposition")
    keep = clipSpectrum(s**2, rank, discard=eps)
    u = numpy.asfortranarray(u[:, :keep])
    s = s[:keep]
    gemm = matutils.blas('gemm', u)
    u = gemm(1.0, qt, u, trans_a=True)
    return u, s
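
`matutils.qr_destroy([y])`, used throughout these examples, exists so that Y can be orthonormalized without keeping a second full-size (num_terms x samples) copy alive. A rough equivalent with plain SciPy (a sketch, not gensim's actual helper) is an economy QR that is allowed to work in the input's own buffer:

import numpy as np
import scipy.linalg

y = np.asfortranarray(np.random.default_rng(0).normal(size=(100000, 40)))

# mode='economic' keeps only the thin factors (q is (100000, 40), r is (40, 40));
# overwrite_a=True allows LAPACK to reuse y's buffer instead of allocating another copy
q, r = scipy.linalg.qr(y, mode="economic", overwrite_a=True)
del y  # y's contents may have been destroyed by the in-place factorization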
Example #10
def stochastic_svd(corpus,
                   rank,
                   num_terms,
                   chunksize=20000,
                   extra_dims=None,
                   power_iters=0,
                   dtype=numpy.float64,
                   eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed
    input corpus `corpus` [3]_.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.

    This algorithm uses `2+power_iters` passes over the data. In case you can only
    afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel`
    and avoid using this algorithm directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(
            10, 2 * rank
        )  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" %
                (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (
            m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
            y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr,
                                corpus.indices, corpus.data, o.ravel(),
                                y.ravel())  # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(
                q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(
                chunk, num_terms=num_terms,
                dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
                dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(
                m,
                n,
                samples,
                chunk.indptr,
                chunk.indices,  # y = y + chunk * o
                chunk.data,
                o.ravel(),
                y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' %
                            (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(
                    chunk, num_terms=num_terms,
                    dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" %
                    str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' %
                        (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk,
                                        num_terms=num_terms,
                                        dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += numpy.dot(
                b, b.T
            )  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" %
                    str(x.shape))
        u, s, vt = numpy.linalg.svd(
            x
        )  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(
            s
        )  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)