def svdAddCols(self, docs, decay=1.0, reorth=False):
        """
        Append the documents `docs` as new columns of the decomposed matrix.

        If the factors currently stored on `self` satisfy `u * s * v^T ~= X`,
        then after this call they satisfy `u' * s' * v'^T ~= [X docs^T]`.
        This is the general update of Brand (section 2), specialized for
        `A = docs.T` and a trivial `B` (columns are added, rows never change).

        `docs` is a list of full (dense) document vectors, not sparse 2-tuples;
        every vector must have as many entries as `self.u` has rows.

        `decay` scales the current singular values (in place) before the
        update, so values below 1.0 put less emphasis on older documents.

        `self.v` may be `None`, in which case the right singular vectors are
        ignored completely. That saves a bit of time and a lot of memory,
        because `v` gains one row per added document.

        With `reorth` set, `u` and `v` are explicitly re-orthogonalized after
        the update. This requires `v`; a `TypeError` is raised when `reorth` is
        requested while `self.v` is `None`.
        """
        logging.debug("updating SVD with %i new documents" % len(docs))
        has_v = self.v is not None
        if not has_v and reorth:
            raise TypeError("cannot reorthogonalize without the right singular vectors (v must not be None)")

        new_cols = numpy.matrix(numpy.asarray(docs)).T  # one column per new document
        num_terms, rank = self.u.shape
        if has_v:
            _, rank_v = self.v.shape
            assert rank == rank_v, "left/right singular vectors shape mismatch!"
        new_terms, num_new = new_cols.shape
        assert num_terms == new_terms, (
            "new documents must be in the same term-space as the original documents (old %s, new %s)"
            % (self.u.shape, new_cols.shape)
        )

        # split the new columns into their coordinates inside span(U) and an
        # orthonormal basis for what is left over, (I - U * U^T) * A
        logging.debug("constructing orthogonal component")
        coords = self.u.T * new_cols  # (k, m) * (m, c) = (k, c)
        logging.debug("computing orthogonal basis")
        basis, resid_r = numpy.linalg.qr(new_cols - self.u * coords)  # equation (2)

        # down-weight what has been seen so far, so the decomposition can
        # re-orient itself towards newer trends in the document stream
        self.s *= decay

        # K is small, only (k + c, k + c), and mostly sparse/diagonal, so a
        # direct dense SVD is acceptable as long as documents arrive in modest
        # batches (tens or hundreds at a time)
        zero_block = matutils.pad(numpy.matrix([]).reshape(0, 0), num_new, rank)
        core = numpy.bmat([[self.s, coords], [zero_block, resid_r]])  # equation (4)
        logging.debug("computing %s SVD" % str(core.shape))
        # no partial SVD wrapper is available here, so all k + c factors are computed
        rot_u, spectrum, rot_vt = numpy.linalg.svd(core, full_matrices=False)
        lost = 1.0 - numpy.sum(spectrum[:rank]) / numpy.sum(spectrum)
        logging.debug("discarding %.1f%% of data variation" % (100 * lost))

        # truncate the full decomposition of K back to the requested rank
        rot_u = numpy.matrix(rot_u[:, :rank])
        new_s = numpy.matrix(numpy.diag(spectrum[:rank]))
        # numpy returns V already transposed, hence the extra .T to get V back
        rot_v = numpy.matrix(rot_vt.T[:, :rank])

        logging.debug("rotating subspaces")
        self.s = new_s

        # U is rotated in two separate products so that no (m, k + c)
        # temporary has to be materialized
        basis = basis * rot_u[rank:]
        self.u = self.u * rot_u[:rank]
        self.u += basis  # (m, k) * (k, k) + (m, c) * (c, k) = (m, k), equation (5)
        del basis  # free up memory

        if has_v:
            self.v = self.v * rot_v[:rank, :]  # (n, k) * (k, k) = (n, k)
            self.v = numpy.bmat([[self.v], [rot_v[rank:, :]]])  # (n + c, k)

        # reorth can only be set together with v, this was enforced on entry
        if has_v and reorth:
            # Section 4.2 of the article (keeping rotations separate from the
            # subspaces) is not implemented; the expensive explicit
            # re-orthogonalization below is done on request instead.
            logging.debug("re-orthogonalizing singular vectors")
            q_u, r_u = numpy.linalg.qr(self.u)
            q_v, r_v = numpy.linalg.qr(self.v)
            small_u, small_s, small_vt = numpy.linalg.svd(r_u * self.s * r_v.T, full_matrices=False)
            small_u = numpy.matrix(small_u[:, :rank])
            fixed_s = numpy.matrix(numpy.diag(small_s[:rank]))
            small_v = numpy.matrix(small_vt.T[:, :rank])

            logging.debug(
                "adjusting singular values by %f%%"
                % (100.0 * numpy.sum(numpy.abs(self.s - fixed_s)) / numpy.sum(numpy.abs(self.s)))
            )
            self.u = q_u * small_u
            self.s = fixed_s
            self.v = q_v * small_v
        logging.debug("added %i documents" % len(docs))
    def svdAddCols(self, docs, decay=1.0, reorth=False):
        """
        Extend the stored decomposition by new columns.

        With `X = self.u * self.s * self.v^T` being the current decomposition,
        the factors are updated so that `self.u * self.s * self.v^T = [X docs.T]`.

        `docs` is a **dense** matrix holding the new observations as rows.
        `decay` scales the current singular values (in place) before the update.
        `self.v` may be `None`, in which case the right singular vectors are not
        updated. `reorth` requests an explicit re-orthogonalization of the
        factors afterwards; it needs `self.v` and raises `TypeError` without it.
        """
        track_v = self.v is not None
        if not track_v and reorth:
            raise TypeError("cannot reorthogonalize without the right singular vectors (v must not be None)")

        fresh = numpy.asmatrix(numpy.asarray(docs)).T  # observations as columns
        n_terms, n_factors = self.u.shape
        if track_v:
            _, n_factors_v = self.v.shape
            assert n_factors == n_factors_v, "left/right singular vectors shape mismatch!"
        fresh_terms, n_fresh = fresh.shape
        assert n_terms == fresh_terms, (
            "new documents must be in the same term-space as the original documents (old %s, new %s)"
            % (self.u.shape, fresh.shape)
        )

        # orthogonal basis for (I - U * U^T) * A
        logging.debug("constructing orthogonal component")
        # coordinates of the new documents in the current eigenspace
        inside = self.u.T * fresh  # (k, m) * (m, c) = (k, c)
        logging.debug("computing orthogonal basis")
        outside_q, outside_r = numpy.linalg.qr(fresh - self.u * inside)  # equation (2)

        # give less emphasis to old values, to follow new trends in the stream
        self.s *= decay

        # K has shape (k + c, k + c) only and is mostly diagonal, so its dense
        # SVD should be fast for small batches of new documents
        padding = matutils.pad(numpy.matrix([]).reshape(0, 0), n_fresh, n_factors)
        small = numpy.bmat([[self.s, inside], [padding, outside_r]])  # equation (4)
        logging.debug("computing %s SVD" % str(small.shape))
        # all k + c factors are requested; no partial SVD wrapper is used here
        left, sigma, right_t = numpy.linalg.svd(small, full_matrices=False)
        lost = 1.0 - numpy.sum(sigma[:n_factors]) / numpy.sum(sigma)
        logging.debug("discarding %.1f%% of data variation" % (100 * lost))

        # keep only the leading n_factors factors
        left = numpy.matrix(left[:, :n_factors])
        sigma_kept = numpy.matrix(numpy.diag(sigma[:n_factors]))
        # numpy hands back V^T, so transpose once more to obtain V
        right = numpy.matrix(right_t.T[:, :n_factors])

        logging.debug('rotating subspaces')
        self.s = sigma_kept

        # rotate U in two steps instead of one big expression, to avoid a huge
        # temporary array and running out of memory
        outside_q = outside_q * left[n_factors:]
        self.u = self.u * left[:n_factors]
        self.u += outside_q  # (m, k) * (k, k) + (m, c) * (c, k) = (m, k), equation (5)
        del outside_q  # free up memory

        if track_v:
            self.v = self.v * right[:n_factors, :]  # (n, k) * (k, k) = (n, k)
            self.v = numpy.bmat([[self.v], [right[n_factors:, :]]])  # (n + c, k)

            if reorth:
                logging.debug("re-orthogonalizing the decomposition")
                u_q, u_r = numpy.linalg.qr(self.u)
                v_q, v_r = numpy.linalg.qr(self.v)
                inner_u, inner_s, inner_vt = numpy.linalg.svd(u_r * self.s * v_r.T, full_matrices=False)
                inner_u = numpy.matrix(inner_u[:, :n_factors])
                sigma_fixed = numpy.matrix(numpy.diag(inner_s[:n_factors]))
                inner_v = numpy.matrix(inner_vt.T[:, :n_factors])

                change = 100.0 * numpy.sum(numpy.abs(self.s - sigma_fixed)) / numpy.sum(numpy.abs(self.s))
                logging.debug("adjusting singular values by %f%%" % change)
                self.u = u_q * inner_u
                self.s = sigma_fixed
                self.v = v_q * inner_v
        logging.debug("added %i documents" % len(docs))
예제 #3
0
    def merge(self, other, decay=1.0):
        """Fold another :class:`~gensim.models.lsimodel.Projection` into the current one.

        Warnings
        --------
        `other` is consumed by the merge: its `u` matrix is overwritten in place and then handed
        over to `matutils.qr_destroy`. Pass a copy of `other` if it is still needed afterwards.
        Both projections are expected to have the same number of features.

        Parameters
        ----------
        other : :class:`~gensim.models.lsimodel.Projection`
            The projection to merge into this one. If `other.u` is None the call is a no-op.
        decay : float, optional
            Multiplier applied to the singular values of `self` before merging. A `decay` < 1.0 gives
            less emphasis to old observations, so LSA gradually "forgets" old documents in favour of
            new ones. Not used when `self` is still empty (`self.u` is None); in that case copies of
            `other.u` and `other.s` are simply adopted.

        Raises
        ------
        ValueError
            If `self.m` differs from `other.m`.

        """
        if other.u is None:
            # nothing to merge in
            return
        if self.u is None:
            # nothing of our own yet: take over copies of whatever the other projection holds
            self.u = other.u.copy()
            self.s = other.s.copy()
            return
        if self.m != other.m:
            message = "vector space mismatch: update is using %s features, expected %s" % (other.m, self.m)
            raise ValueError(message)
        logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
        num_features = self.u.shape[0]
        own_rank = self.u.shape[1]
        other_rank = other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operations ever needed are basis^T*basis and basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # remove from other.u everything that already lies in the span of self.u
        logger.debug("constructing orthogonal component")
        # NOTE(review): asfarray/ascarray presumably switch the memory layout
        # (Fortran/C order) to suit the following product -- confirm in their definitions
        self.u = asfarray(self.u, 'self.u')
        overlap = np.dot(self.u.T, other.u)
        self.u = ascarray(self.u, 'self.u')
        other.u -= np.dot(self.u, overlap)

        # wrap the array in a list so that qr_destroy can take over the only reference (saves RAM)
        other.u = [other.u]
        q_basis, r_factor = matutils.qr_destroy(other.u)  # q, r = QR(component)
        assert not other.u

        # small dense matrix whose SVD gives the rotation that diagonalizes r
        stacked = np.bmat([
            [np.diag(decay * self.s), np.multiply(overlap, other.s)],
            [
                matutils.pad(np.array([]).reshape(0, 0), min(num_features, other_rank), own_rank),
                np.multiply(r_factor, other.s),
            ],
        ])
        logger.debug("computing SVD of %s dense matrix", stacked.shape)
        try:
            # In numpy < 1.1.0, SVD sometimes fails with "LinAlgError: SVD did not converge".
            # For these early versions, catch the error and retry on the product with the transpose.
            # See the numpy-discussion mailing list (msg07224 on www.mail-archive.com) and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            # sdoering: replaced numpy's linalg.svd with scipy's linalg.svd

            # TODO *ugly overkill*!! only the first self.k SVD factors are needed... but there is no
            # LAPACK wrapper for partial svd/eigendecomp in numpy :( //sdoering: maybe there is one in scipy?
            rotation, sing_vals, _ = scipy.linalg.svd(stacked, full_matrices=False)
        except scipy.linalg.LinAlgError:
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            # a second failure is not caught and propagates to the caller
            rotation, sing_vals, _ = scipy.linalg.svd(np.dot(stacked, stacked.T), full_matrices=False)
            sing_vals = np.sqrt(sing_vals)  # back from eigenvalues to singular values

        # clip_spectrum's result is used as the number of leading factors to keep
        keep = clip_spectrum(sing_vals ** 2, self.k)
        rot_own = np.array(rotation[:own_rank, :keep])
        rot_other = np.array(rotation[own_rank:, :keep])
        sing_vals = sing_vals[:keep]

        # update & rotate current basis U = [U, U']*[U1_k, U2_k]
        logger.debug("updating orthonormal basis U")
        self.s = sing_vals
        self.u = ascarray(self.u, 'self.u')
        self.u = np.dot(self.u, rot_own)

        q_basis = ascarray(q_basis, 'q')
        q_basis = np.dot(q_basis, rot_other)
        self.u += q_basis

        # canonical form: flip every column of U whose first entry is negative
        if self.u.shape[0] > 0:
            for col in range(self.u.shape[1]):
                if self.u[0, col] < 0.0:
                    self.u[:, col] *= -1.0
예제 #4
0
파일: lsimodel.py 프로젝트: AmitShah/gensim
    def merge(self, other, decay=1.0):
        """
        Merge this Projection with another.

        The content of `other` is destroyed in the process (its `u` matrix is
        overwritten in place and then handed to `matutils.qr_destroy`), so pass
        this function a copy of `other` if you need it further.

        If `other.u` is None the call does nothing. If `self.u` is None, copies
        of `other.u` and `other.s` are adopted as they are and `decay` is not
        used. Otherwise the singular values of `self` are multiplied by `decay`
        before the two projections are combined.

        Raises `ValueError` if `self.m` and `other.m` differ.
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            self.u = other.u.copy()
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError("vector space mismatch: update is using %s features, expected %s" %
                             (other.m, self.m))
        logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operations ever needed are basis^T*basis and basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # find component of u2 orthogonal to u1
        logger.debug("constructing orthogonal component")
        # NOTE(review): asfarray/ascarray presumably switch the memory layout
        # (Fortran/C order) to suit the following product -- confirm in their definitions
        self.u = asfarray(self.u, 'self.u')
        c = numpy.dot(self.u.T, other.u)
        self.u = ascarray(self.u, 'self.u')
        other.u -= numpy.dot(self.u, c)

        # wrap the array in a list so qr_destroy can take over the only reference, to save RAM
        other.u = [other.u]
        q, r = matutils.qr_destroy(other.u)  # q, r = QR(component)
        assert not other.u

        # find the rotation that diagonalizes r
        stacked = numpy.bmat([
            [numpy.diag(decay * self.s), numpy.multiply(c, other.s)],
            [matutils.pad(numpy.array([]).reshape(0, 0), min(m, n2), n1), numpy.multiply(r, other.s)],
        ])
        logger.debug("computing SVD of %s dense matrix", str(stacked.shape))
        try:
            # In numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge".
            # For these early versions of numpy, catch the error and try to compute
            # SVD again, but over k*k^T.
            # See the numpy-discussion mailing list (msg07224 on www.mail-archive.com) and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            # sdoering: replaced numpy's linalg.svd with scipy's linalg.svd

            # TODO *ugly overkill*!! only the first self.k SVD factors are needed... but there is no
            # LAPACK wrapper for partial svd/eigendecomp in numpy :( //sdoering: maybe there is one in scipy?
            u_k, s_k, _ = scipy.linalg.svd(stacked, full_matrices=False)
        except scipy.linalg.LinAlgError:
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            # if this fails too, give up with an exception
            u_k, s_k, _ = scipy.linalg.svd(numpy.dot(stacked, stacked.T), full_matrices=False)
            s_k = numpy.sqrt(s_k)  # go back from eigenvalues to singular values

        # clip_spectrum's result is used as the number of leading factors to keep
        keep = clip_spectrum(s_k ** 2, self.k)
        u1_k, u2_k, s_k = numpy.array(u_k[:n1, :keep]), numpy.array(u_k[n1:, :keep]), s_k[:keep]

        # update & rotate current basis U = [U, U']*[U1_k, U2_k]
        logger.debug("updating orthonormal basis U")
        self.s = s_k
        self.u = ascarray(self.u, 'self.u')
        self.u = numpy.dot(self.u, u1_k)

        q = ascarray(q, 'q')
        q = numpy.dot(q, u2_k)
        self.u += q

        # make each column of U start with a non-negative number (to force canonical decomposition)
        if self.u.shape[0] > 0:
            # `range` instead of `xrange`: the latter does not exist on Python 3
            for i in range(self.u.shape[1]):
                if self.u[0, i] < 0.0:
                    self.u[:, i] *= -1.0
예제 #5
0
    def svdAddCols(self, docs, decay=1.0, reorth=False):
        """
        Update singular value decomposition factors to take into account new
        documents `docs`.

        This function corresponds to the general update of Brand (section 2),
        specialized for A = docs.T and B trivial (no update to matrix rows).

        The documents are assumed to be a list of full vectors (ie. not sparse 2-tuples).

        Compute new decomposition u', s', v' so that if the current matrix X decomposes to
        u * s * v^T ~= X, then
        u' * s' * v'^T ~= [X docs^T]

        u, s, v and their new values u', s', v' are stored within self (ie. as
        self.u, self.v etc.).

        `decay` multiplies the current singular values (in place) before the update;
        values below 1.0 give less emphasis to old documents.

        self.v can be set to None, in which case it is completely ignored. This saves a
        bit of speed and a lot of memory, especially for huge corpora (size of v is
        linear in the number of added documents).

        With `reorth` set, u and v are explicitly re-orthogonalized after the update.
        This needs v: a TypeError is raised if `reorth` is requested while self.v is None.
        An AssertionError is raised if the new documents do not have as many features
        as self.u has rows, or if self.u and self.v disagree on the number of factors.
        """
        logging.debug("updating SVD with %i new documents", len(docs))
        keepV = self.v is not None
        if not keepV and reorth:
            raise TypeError("cannot reorthogonalize without the right singular vectors (v must not be None)")
        a = numpy.matrix(numpy.asarray(docs)).T
        m, k = self.u.shape
        if keepV:
            n, k2 = self.v.shape
            assert k == k2, "left/right singular vectors shape mismatch!"
        m2, c = a.shape
        # the message must use self.u: a bare `u` is undefined here and used to turn
        # this check into a NameError
        assert m == m2, "new documents must be in the same term-space as the original documents (old %s, new %s)" % (
            self.u.shape, a.shape)

        # construct orthogonal basis for (I - U * U^T) * A
        logging.debug("constructing orthogonal component")
        proj = self.u.T * a  # (k, m) * (m, c) = (k, c)
        logging.debug("computing orthogonal basis")
        P, Ra = numpy.linalg.qr(a - self.u * proj)  # equation (2)
        self.u = numpy.bmat([self.u, P])  # (m, k + c)
        del P  # free up mem

        # allow re-orientation towards new data trends in the document stream, by giving less emphasis on old values
        self.s *= decay

        # now we're ready to construct K; K will be mostly diagonal and sparse, with
        # lots of structure, and of shape only (k + c, k + c), so its direct SVD
        # ought to be fast for reasonably small additions of new documents (ie. tens
        # or hundreds of new documents at a time).
        empty = matutils.pad(numpy.matrix([]).reshape(0, 0), c, k)
        K = numpy.bmat([[self.s, proj], [empty, Ra]])  # (k + c, k + c), equation (4)
        logging.debug("computing %s SVD", str(K.shape))
        # there is no python wrapper for partial svd => request all k + c factors :(
        uK, sK, vK = numpy.linalg.svd(K, full_matrices=False)
        lost = 1.0 - numpy.sum(sK[:k]) / numpy.sum(sK)
        logging.debug("discarding %.1f%% of data variation", 100 * lost)

        # clip full decomposition to the requested rank
        uK = numpy.matrix(uK[:, :k])
        sK = numpy.matrix(numpy.diag(sK[:k]))
        # .T because numpy transposes the right vectors V, so we need to transpose it back: V.T.T = V
        vK = numpy.matrix(vK.T[:, :k])

        # and finally update the left/right singular vectors
        logging.debug('rotating subspaces')
        self.s = sK
        self.u = self.u * uK  # (m, k + c) * (k + c, k) = (m, k), equation (5)
        if keepV:
            self.v = self.v * vK[:k, :]  # (n, k) * (k, k) = (n, k)
            rot = vK[k:, :]
            self.v = numpy.bmat([[self.v], [rot]])  # (n + c, k)

            if reorth:
                # The original article contains section 4.2 on keeping the rotations separate
                # from the subspaces (decomposing V into Vsubspace * Vrotate), which further reduces
                # complexity and improves numerical properties for rank-1 updates.
                #
                # That step is not implemented yet; instead, force the (expensive)
                # reorthogonalization explicitly from time to time, by setting reorth = True
                logging.debug("re-orthogonalizing singular vectors")
                uQ, uR = numpy.linalg.qr(self.u)
                vQ, vR = numpy.linalg.qr(self.v)
                uK, sK, vK = numpy.linalg.svd(uR * self.s * vR.T, full_matrices=False)
                uK = numpy.matrix(uK[:, :k])
                sK = numpy.matrix(numpy.diag(sK[:k]))
                vK = numpy.matrix(vK.T[:, :k])

                logging.debug("adjusting singular values by %f%%",
                              100.0 * numpy.sum(numpy.abs(self.s - sK)) / numpy.sum(numpy.abs(self.s)))
                self.u = uQ * uK
                self.s = sK
                self.v = vQ * vK
        logging.debug("added %i documents", len(docs))
예제 #6
0
파일: lsimodel.py 프로젝트: sxjzwq/gensim
    def merge(self, other, decay=1.0):
        """
        Merge this Projection with another.

        The content of `other` is destroyed in the process (its `u` matrix is
        overwritten in place and then handed to `matutils.qr_destroy`), so pass
        this function a copy of `other` if you need it further.

        If `other.u` is None the call does nothing. If `self.u` is None, copies
        of `other.u` and `other.s` are adopted as they are and `decay` is not
        used. Otherwise the singular values of `self` are multiplied by `decay`
        before the two projections are combined.

        Raises `ValueError` if `self.m` and `other.m` differ.
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            self.u = other.u.copy()
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update is using %s features, expected %s"
                % (other.m, self.m))
        logger.info("merging projections: %s + %s",
                    str(self.u.shape), str(other.u.shape))
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operations ever needed are basis^T*basis and basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # find component of u2 orthogonal to u1
        logger.debug("constructing orthogonal component")
        # NOTE(review): asfarray/ascarray presumably switch the memory layout
        # (Fortran/C order) to suit the following product -- confirm in their definitions
        self.u = asfarray(self.u, 'self.u')
        c = numpy.dot(self.u.T, other.u)
        self.u = ascarray(self.u, 'self.u')
        other.u -= numpy.dot(self.u, c)

        # wrap the array in a list so qr_destroy can take over the only reference, to save RAM
        other.u = [other.u]
        q, r = matutils.qr_destroy(other.u)  # q, r = QR(component)
        assert not other.u

        # find the rotation that diagonalizes r
        stacked = numpy.bmat([
            [numpy.diag(decay * self.s), numpy.multiply(c, other.s)],
            [matutils.pad(numpy.array([]).reshape(0, 0), min(m, n2), n1), numpy.multiply(r, other.s)],
        ])
        logger.debug("computing SVD of %s dense matrix", str(stacked.shape))
        try:
            # In numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge".
            # For these early versions of numpy, catch the error and try to compute
            # SVD again, but over k*k^T.
            # See the numpy-discussion mailing list (msg07224 on www.mail-archive.com) and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            # sdoering: replaced numpy's linalg.svd with scipy's linalg.svd

            # TODO *ugly overkill*!! only the first self.k SVD factors are needed... but there is no
            # LAPACK wrapper for partial svd/eigendecomp in numpy :( //sdoering: maybe there is one in scipy?
            u_k, s_k, _ = scipy.linalg.svd(stacked, full_matrices=False)
        except scipy.linalg.LinAlgError:
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            # if this fails too, give up with an exception
            u_k, s_k, _ = scipy.linalg.svd(numpy.dot(stacked, stacked.T), full_matrices=False)
            s_k = numpy.sqrt(s_k)  # go back from eigenvalues to singular values

        # clip_spectrum's result is used as the number of leading factors to keep
        keep = clip_spectrum(s_k ** 2, self.k)
        u1_k, u2_k, s_k = numpy.array(u_k[:n1, :keep]), numpy.array(u_k[n1:, :keep]), s_k[:keep]

        # update & rotate current basis U = [U, U']*[U1_k, U2_k]
        logger.debug("updating orthonormal basis U")
        self.s = s_k
        self.u = ascarray(self.u, 'self.u')
        self.u = numpy.dot(self.u, u1_k)

        q = ascarray(q, 'q')
        q = numpy.dot(q, u2_k)
        self.u += q

        # make each column of U start with a non-negative number (to force canonical decomposition)
        if self.u.shape[0] > 0:
            # `range` instead of `xrange`: the latter does not exist on Python 3
            for i in range(self.u.shape[1]):
                if self.u[0, i] < 0.0:
                    self.u[:, i] *= -1.0
예제 #7
0
    def merge(self, other, decay=1.0):
        """Combine the current :class:`~gensim.models.lsimodel.Projection` with a second one.

        Warnings
        --------
        The merge consumes `other`: `other.u` is modified in place and afterwards passed to
        `matutils.qr_destroy`. Hand in a copy of `other` if you need to keep using it.
        `other` is expected to cover the same number of features as `self`.

        Parameters
        ----------
        other : :class:`~gensim.models.lsimodel.Projection`
            Projection that is merged into the current one. Nothing happens if `other.u` is None.
        decay : float, optional
            Weight of the existing observations relative to the new ones; the singular values of `self`
            are multiplied by it. With `decay` < 1.0 old observations get less emphasis, which lets LSA
            gradually "forget" old documents. Ignored while `self.u` is None, where copies of
            `other.u` and `other.s` are taken over unchanged.

        Raises
        ------
        ValueError
            If `self.m` and `other.m` are not equal.

        """
        if other.u is None:
            # empty update, keep the current state
            return
        if self.u is None:
            # empty target: the merge result is a copy of the other projection
            self.u = other.u.copy()
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update is using %s features, expected %s" % (other.m, self.m)
            )
        logger.info("merging projections: %s + %s", str(self.u.shape), str(other.u.shape))
        n_features, n_own = self.u.shape[0], self.u.shape[1]
        n_other = other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operations ever needed are basis^T*basis and basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # subtract from other.u its projection onto the span of self.u
        logger.debug("constructing orthogonal component")
        # NOTE(review): asfarray/ascarray presumably switch the memory layout
        # (Fortran/C order) to suit the following product -- confirm in their definitions
        self.u = asfarray(self.u, 'self.u')
        shared = np.dot(self.u.T, other.u)
        self.u = ascarray(self.u, 'self.u')
        other.u -= np.dot(self.u, shared)

        # passing the array inside a list lets qr_destroy take over the only reference (saves RAM)
        other.u = [other.u]
        ortho, upper = matutils.qr_destroy(other.u)  # q, r = QR(component)
        assert not other.u

        # block matrix whose SVD yields the rotation that diagonalizes r
        top_row = [np.diag(decay * self.s), np.multiply(shared, other.s)]
        bottom_row = [
            matutils.pad(np.array([]).reshape(0, 0), min(n_features, n_other), n_own),
            np.multiply(upper, other.s),
        ]
        block = np.bmat([top_row, bottom_row])
        del top_row, bottom_row  # do not keep the building blocks alive longer than needed
        logger.debug("computing SVD of %s dense matrix", block.shape)
        try:
            # In numpy < 1.1.0, SVD sometimes fails with "LinAlgError: SVD did not converge".
            # For these early versions, catch the error and retry on the product with the transpose.
            # See the numpy-discussion mailing list (msg07224 on www.mail-archive.com) and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            # sdoering: replaced numpy's linalg.svd with scipy's linalg.svd

            # TODO *ugly overkill*!! only the first self.k SVD factors are needed... but there is no
            # LAPACK wrapper for partial svd/eigendecomp in numpy :( //sdoering: maybe there is one in scipy?
            left, sigma, _ = scipy.linalg.svd(block, full_matrices=False)
        except scipy.linalg.LinAlgError:
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            # no second safety net: an error here reaches the caller
            left, sigma, _ = scipy.linalg.svd(np.dot(block, block.T), full_matrices=False)
            sigma = np.sqrt(sigma)  # eigenvalues -> singular values

        # clip_spectrum's result is used as the number of leading factors to keep
        n_keep = clip_spectrum(sigma ** 2, self.k)
        left_own = np.array(left[:n_own, :n_keep])
        left_other = np.array(left[n_own:, :n_keep])
        sigma = sigma[:n_keep]

        # update & rotate current basis U = [U, U']*[U1_k, U2_k]
        logger.debug("updating orthonormal basis U")
        self.s = sigma
        self.u = ascarray(self.u, 'self.u')
        self.u = np.dot(self.u, left_own)

        ortho = ascarray(ortho, 'q')
        ortho = np.dot(ortho, left_other)
        self.u += ortho

        # force a canonical decomposition: no column of U may start with a negative number
        if self.u.shape[0] > 0:
            for column in range(self.u.shape[1]):
                if self.u[0, column] < 0.0:
                    self.u[:, column] *= -1.0
예제 #8
0
    def merge(self, other, decay=1.0):
        """
        Fold the projection `other` into this one, in place.

        `other` is consumed by the operation (its `u` is overwritten and then
        deleted), so hand in a copy if it is still needed afterwards.

        `decay` multiplies the singular values of `self` before they are
        combined with those of `other`. A `ValueError` is raised when the two
        projections disagree on the number of features (`m`).

        This is the optimized merge described in algorithm 5.
        """
        if other.u is None:
            # nothing to merge in
            return

        if self.u is None:
            # this projection is still empty => simply take over `other`
            if other.s is not None:
                self.u = other.u.copy('F')
                self.s = other.s.copy()
                return

            # no singular values yet: `other.u` holds a raw document chunk
            # that has to be decomposed first
            chunk = other.u
            assert scipy.sparse.issparse(chunk)
            if self.m * self.k < 10000:
                # SVDLIBC gives spurious results for small matrices, so small
                # inputs go through full (dense) LAPACK instead
                logger.info("computing dense SVD of %s matrix" %
                            (chunk.shape, ))
                left, sigma, right_t = numpy.linalg.svd(chunk.todense(),
                                                        full_matrices=False)
            else:
                try:
                    import sparsesvd
                except ImportError:
                    raise ImportError(
                        "for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`"
                    )
                logger.info("computing sparse SVD of %s matrix" %
                            (chunk.shape, ))
                # request a few spare factors; SVDLIBC sometimes returns
                # fewer factors than it was asked for
                left_t, sigma, right_t = sparsesvd.sparsesvd(
                    chunk, self.k + 30)
                left = left_t.T
                del left_t
            del right_t  # the right singular vectors are not kept
            rank = clipSpectrum(sigma, self.k)
            self.u = left[:, :rank].copy('F')
            self.s = sigma[:rank]
            return

        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update has %s features, expected %s" %
                (other.m, self.m))
        logger.info("merging projections: %s + %s" %
                    (self.u.shape, other.u.shape))
        n_features = self.u.shape[0]
        n_self = self.u.shape[1]
        n_other = other.u.shape[1]
        if other.s is None:
            # raw document chunk: densify it and use unit singular values
            other.u = other.u.todense()
            other.s = 1.0  # broadcasting will promote this to eye(n_other) where needed
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with gorgqr.
        # The only operations ever needed are basis^T*basis and basis*component.
        # But how to do that in numpy? And is it fast(er)?

        # Step 1: component of other.u orthogonal to self.u.
        # IMPORTANT: keep both operands in fortran order; a wrong memory
        # layout makes the matrix products about 8x slower.
        self.u = numpy.asfortranarray(self.u)  # no-op when already fortran-ordered
        other.u = numpy.asfortranarray(other.u)
        gemm = get_blas_funcs(('gemm', ), (self.u, ))[0]
        logger.debug("constructing orthogonal component")
        proj = gemm(1.0, self.u, other.u, trans_a=True)
        # other.u <- other.u - self.u * proj, written into other.u itself
        gemm(-1.0, self.u, proj, beta=1.0, c=other.u, overwrite_c=True)

        # Step 2: q, r = QR(component); code hacked out of scipy.linalg.qr
        logger.debug("computing QR of %s dense matrix" % (other.u.shape, ))
        geqrf = get_lapack_funcs(('geqrf', ), (other.u, ))[0]
        # first call (lwork=-1) is a workspace-size query, second call does the
        # factorization; NOTE: sometimes segfaults with overwrite_a=True...?
        packed, tau, work, info = geqrf(other.u, lwork=-1, overwrite_a=True)
        packed, tau, work, info = geqrf(other.u,
                                        lwork=work[0],
                                        overwrite_a=True)
        del other.u
        assert info >= 0
        tri = triu(packed[:n_other, :n_other])
        if n_features < n_other:  # rare case, #features < #topics
            packed = packed[:, :n_features]  # retains fortran order
        gorgqr = get_lapack_funcs(('orgqr', ), (packed, ))[0]
        ortho, work, info = gorgqr(packed, tau, lwork=-1, overwrite_a=True)
        ortho, work, info = gorgqr(packed,
                                   tau,
                                   lwork=work[0],
                                   overwrite_a=True)
        assert info >= 0, "qr failed"
        assert ortho.flags.f_contiguous

        # Step 3: find the rotation that diagonalizes the small block matrix
        upper_left = numpy.diag(decay * self.s)
        upper_right = proj * other.s
        lower_left = matutils.pad(
            numpy.matrix([]).reshape(0, 0), min(n_features, n_other), n_self)
        lower_right = tri * other.s
        core = numpy.bmat([[upper_left, upper_right],
                           [lower_left, lower_right]])
        logger.debug("computing SVD of %s dense matrix" % (core.shape, ))
        # TODO *ugly overkill*!! only the first self.k SVD factors are needed,
        # but numpy has no LAPACK wrapper for partial svd/eigendecomp :(
        rot, spectrum, _ = numpy.linalg.svd(core, full_matrices=False)

        rank = clipSpectrum(spectrum, self.k)
        rot = rot[:, :rank]
        spectrum = spectrum[:rank]

        # Step 4: update & rotate the current basis, u = [u, q] * rot
        logger.debug("updating orthonormal basis U")
        # TODO temporarily creates an extra (m, k) dense array in memory.
        # find a way to avoid this!
        self.u = gemm(1.0, self.u, rot[:n_self])
        gemm(1.0, ortho, rot[n_self:], beta=1.0, c=self.u, overwrite_c=True)
        self.s = spectrum
    def merge(self, other, decay = 1.0):
        """
        Merge this Projection with another. 
        
        Content of `other` is destroyed in the process (`other.u` is overwritten
        in place and then deleted), so pass this function a copy if you need it
        further.
        
        `decay` multiplies the singular values of `self` before they are combined
        with those of `other`.
        
        Raises `ValueError` if the two projections have a different number of
        features (`self.m != other.m`).
        
        This is the optimized merge described in algorithm 5.
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            self.u = other.u.copy()
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError("vector space mismatch: update has %s features, expected %s" %
                             (other.m, self.m))
        logger.info("merging projections: %s + %s", self.u.shape, other.u.shape)
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without 
        # forming explicit matrices with gorgqr.
        # The only operation we ever need is basis^T*basis and basis*component.
        # But how to do that in numpy? And is it fast(er)?
        
        # find component of u2 orthogonal to u1
        # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
        self.u = numpy.asfortranarray(self.u) # does nothing if input already fortran-order array
        other.u = numpy.asfortranarray(other.u)
        gemm, = get_blas_funcs(('gemm',), (self.u,))
        logger.debug("constructing orthogonal component")
        c = gemm(1.0, self.u, other.u, trans_a = True)
        # other.u <- other.u - self.u * c, written into other.u itself
        gemm(-1.0, self.u, c, beta = 1.0, c = other.u, overwrite_c = True)
        
        # perform q, r = QR(component); code hacked out of scipy.linalg.qr
        logger.debug("computing QR of %s dense matrix", other.u.shape)
        geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
        # the lwork=-1 call only queries the workspace size; the second call factorizes
        qr, tau, work, info = geqrf(other.u, lwork = -1, overwrite_a = True) # sometimes segfaults with overwrite_a=True...
        qr, tau, work, info = geqrf(other.u, lwork = work[0], overwrite_a = True) # sometimes segfaults with overwrite_a=True...
        del other.u
        assert info >= 0
        r = triu(qr[:n2, :n2]) # has only min(m, n2) rows when m < n2
        if m < n2: # rare case, #features < #topics
            qr = qr[:,:m] # retains fortran order
        gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
        q, work, info = gorgqr(qr, tau, lwork = -1, overwrite_a = True)
        q, work, info = gorgqr(qr, tau, lwork = work[0], overwrite_a = True)
        assert info >= 0, "qr failed"
        assert q.flags.f_contiguous
        
        # find rotation that diagonalizes r
        # The zero block must have as many rows as `r`, i.e. min(m, n2); padding
        # it to n2 rows makes bmat fail with a shape mismatch whenever m < n2.
        zeros = matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1)
        k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s], [zeros, r * other.s]])
        logger.debug("computing SVD of %s dense matrix", k.shape)
        u_k, s_k, _ = numpy.linalg.svd(k, full_matrices = False) # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
        
        k = clipSpectrum(s_k, self.k)
        u_k, s_k = u_k[:, :k], s_k[:k]
        
        # update & rotate current basis U
        logger.debug("updating orthonormal basis U")
        self.u = gemm(1.0, self.u, u_k[:n1]) # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
        gemm(1.0, q, u_k[n1:], beta = 1.0, c = self.u, overwrite_c = True) # u = [u,u']*u_k
        self.s = s_k
예제 #10
0
파일: lsimodel.py 프로젝트: andremi/gensim
    def merge(self, other, decay=1.0):
        """
        Merge this Projection with another. 
        
        The content of `other` is destroyed in the process, so pass this function a 
        copy of `other` if you need it further.

        `decay` multiplies the singular values of `self` before they are combined
        with those of `other`.

        Raises `ValueError` if the two projections have a different number of
        features (`self.m != other.m`). If the dense SVD of the combined matrix
        fails with `LinAlgError`, it is retried once over `k * k^T`; a failure
        of that retry propagates to the caller.
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            self.u = other.u.copy('F')
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update is using %s features, expected %s"
                % (other.m, self.m))
        logger.info("merging projections: %s + %s", self.u.shape,
                    other.u.shape)
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with ORGQR.
        # The only operation we ever need is basis^T*basis and basis*component.
        # But how to do that in scipy? And is it fast(er)?

        # find component of u2 orthogonal to u1
        # IMPORTANT: keep matrices in memory suitable order for matrix products; failing to do so gives 8x lower performance :(
        self.u = numpy.asfortranarray(
            self.u)  # does nothing if input already fortran-order array
        other.u = numpy.asfortranarray(other.u)
        gemm = matutils.blas('gemm', self.u)
        logger.debug("constructing orthogonal component")
        c = gemm(1.0, self.u, other.u, trans_a=True)
        # other.u <- other.u - self.u * c, written into other.u itself
        gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

        # wrap the array in a list so that qr_destroy can take the reference
        # away from us (the assert below checks the list comes back empty)
        other.u = [other.u]
        q, r = matutils.qr_destroy(other.u)  # q, r = QR(component)
        assert not other.u

        # find the rotation that diagonalizes r
        k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                        [
                            matutils.pad(
                                numpy.matrix([]).reshape(0, 0), min(m, n2),
                                n1), r * other.s
                        ]])
        logger.debug("computing SVD of %s dense matrix", k.shape)
        try:
            # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
            # for these early versions of numpy, catch the error and try to compute
            # SVD again, but over k*k^T.
            # see http://www.mail-archive.com/[email protected]/msg07224.html and
            # bug ticket http://projects.scipy.org/numpy/ticket/706
            u_k, s_k, _ = numpy.linalg.svd(
                k, full_matrices=False
            )  # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
        except numpy.linalg.LinAlgError:
            # report through the module-level `logger`, like every other
            # message in this method, rather than through the root logger
            logger.error("SVD(A) failed; trying SVD(A * A^T)")
            u_k, s_k, _ = numpy.linalg.svd(
                numpy.dot(k, k.T), full_matrices=False
            )  # if this fails too, give up with an exception
            s_k = numpy.sqrt(
                s_k)  # go back from eigen values to singular values

        # note: the spectrum is clipped on the squared singular values
        k = clipSpectrum(s_k**2, self.k)
        u1_k, u2_k, s_k = u_k[:n1, :k].copy('F'), u_k[n1:, :k].copy(
            'F'), s_k[:k]

        # update & rotate current basis U = [U, U']*[U1_k, U2_k]
        logger.debug("updating orthonormal basis U")
        self.u = gemm(
            1.0, self.u, u1_k
        )  # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
        gemm(1.0, q, u2_k, beta=1.0, c=self.u, overwrite_c=True)
        self.s = s_k
    def merge(self, other, decay=1.0):
        """
        Merge this Projection with another. 
        
        Content of `other` is destroyed in the process (`other.u` is overwritten
        in place and then deleted), so pass this function a copy if you need it
        further.

        `decay` multiplies the singular values of `self` before they are combined
        with those of `other`.

        Raises `ValueError` if the two projections have a different number of
        features (`self.m != other.m`).
        
        This is the optimized merge described in algorithm 5.
        """
        if other.u is None:
            # the other projection is empty => do nothing
            return
        if self.u is None:
            # we are empty => result of merge is the other projection, whatever it is
            self.u = other.u.copy()
            self.s = other.s.copy()
            return
        if self.m != other.m:
            raise ValueError(
                "vector space mismatch: update has %s features, expected %s" %
                (other.m, self.m))
        logger.info("merging projections: %s + %s", self.u.shape,
                    other.u.shape)
        m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
        # TODO Maybe keep the bases as elementary reflectors, without
        # forming explicit matrices with gorgqr.
        # The only operation we ever need is basis^T*basis and basis*component.
        # But how to do that in numpy? And is it fast(er)?

        # find component of u2 orthogonal to u1
        # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
        self.u = numpy.asfortranarray(
            self.u)  # does nothing if input already fortran-order array
        other.u = numpy.asfortranarray(other.u)
        gemm, = get_blas_funcs(('gemm', ), (self.u, ))
        logger.debug("constructing orthogonal component")
        c = gemm(1.0, self.u, other.u, trans_a=True)
        # other.u <- other.u - self.u * c, written into other.u itself
        gemm(-1.0, self.u, c, beta=1.0, c=other.u, overwrite_c=True)

        # perform q, r = QR(component); code hacked out of scipy.linalg.qr
        logger.debug("computing QR of %s dense matrix", other.u.shape)
        geqrf, = get_lapack_funcs(('geqrf', ), (other.u, ))
        # the lwork=-1 call only queries the workspace size; the second call factorizes
        qr, tau, work, info = geqrf(
            other.u, lwork=-1,
            overwrite_a=True)  # sometimes segfaults with overwrite_a=True...
        qr, tau, work, info = geqrf(
            other.u, lwork=work[0],
            overwrite_a=True)  # sometimes segfaults with overwrite_a=True...
        del other.u
        assert info >= 0
        r = triu(qr[:n2, :n2])  # has only min(m, n2) rows when m < n2
        if m < n2:  # rare case, #features < #topics
            qr = qr[:, :m]  # retains fortran order
        gorgqr, = get_lapack_funcs(('orgqr', ), (qr, ))
        q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
        q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
        assert info >= 0, "qr failed"
        assert q.flags.f_contiguous

        # find rotation that diagonalizes r
        # The zero block must have as many rows as `r`, i.e. min(m, n2); padding
        # it to n2 rows makes bmat fail with a shape mismatch whenever m < n2.
        zeros = matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1)
        k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s],
                        [zeros, r * other.s]])
        logger.debug("computing SVD of %s dense matrix", k.shape)
        u_k, s_k, _ = numpy.linalg.svd(
            k, full_matrices=False
        )  # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(

        k = clipSpectrum(s_k, self.k)
        u_k, s_k = u_k[:, :k], s_k[:k]

        # update & rotate current basis U
        logger.debug("updating orthonormal basis U")
        self.u = gemm(
            1.0, self.u, u_k[:n1]
        )  # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
        gemm(1.0, q, u_k[n1:], beta=1.0, c=self.u,
             overwrite_c=True)  # u = [u,u']*u_k
        self.s = s_k
 def merge(self, other, decay = 1.0):
     """
     Merge this Projection with another. 
     
     Content of `other` is destroyed in the process (`other.u` is overwritten
     in place and then deleted), so pass this function a copy if you need it
     further.
     
     `other.s` may be `None`, in which case `other.u` is treated as a raw
     document chunk rather than as a set of left singular vectors.
     
     `decay` multiplies the singular values of `self` before they are combined
     with those of `other`.
     
     Raises `ValueError` if the two projections have a different number of
     features (`self.m != other.m`). If the dense SVD of the combined matrix
     fails with `LinAlgError`, it is retried once over `k * k^T`; a failure
     of that retry propagates to the caller.
     
     This is the optimized merge described in algorithm 5.
     """
     if other.u is None:
         # the other projection is empty => do nothing
         return
     if self.u is None:
         # we are empty => result of merge is the other projection, whatever it is
         if other.s is None:
             # other.u contains a direct document chunk, not svd => perform svd
             docs = other.u
             assert scipy.sparse.issparse(docs)
             if self.m * self.k < 10000:
                 # SVDLIBC gives spurious results for small matrices.. run full
                 # LAPACK on them instead
                 logger.info("computing dense SVD of %s matrix", docs.shape)
                 u, s, vt = numpy.linalg.svd(docs.todense(), full_matrices = False)
             else:
                 try:
                     import sparsesvd
                 except ImportError:
                     raise ImportError("for LSA, the `sparsesvd` module is needed but not found; run `easy_install sparsesvd`")
                 logger.info("computing sparse SVD of %s matrix", docs.shape)
                 ut, s, vt = sparsesvd.sparsesvd(docs, self.k + 30) # ask for a few extra factors, because for some reason SVDLIBC sometimes returns fewer factors than requested
                 u = ut.T
                 del ut
             del vt # the right singular vectors are not kept
             k = clipSpectrum(s, self.k)
             self.u = u[:, :k].copy('F')
             self.s = s[:k]
         else:
             self.u = other.u.copy('F')
             self.s = other.s.copy()
         return
     if self.m != other.m:
         raise ValueError("vector space mismatch: update has %s features, expected %s" %
                          (other.m, self.m))
     logger.info("merging projections: %s + %s", self.u.shape, other.u.shape)
     m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
     if other.s is None:
         # raw document chunk: densify it and use unit singular values
         other.u = other.u.todense()
         other.s = 1.0 # broadcasting will promote this to eye(n2) where needed
     # TODO Maybe keep the bases as elementary reflectors, without 
     # forming explicit matrices with gorgqr.
     # The only operation we ever need is basis^T*basis and basis*component.
     # But how to do that in numpy? And is it fast(er)?
     
     # find component of u2 orthogonal to u1
     # IMPORTANT: keep matrices in suitable order for matrix products; failing to do so gives 8x lower performance :(
     self.u = numpy.asfortranarray(self.u) # does nothing if input already fortran-order array
     other.u = numpy.asfortranarray(other.u)
     gemm, = get_blas_funcs(('gemm',), (self.u,))
     logger.debug("constructing orthogonal component")
     c = gemm(1.0, self.u, other.u, trans_a = True)
     # other.u <- other.u - self.u * c, written into other.u itself
     gemm(-1.0, self.u, c, beta = 1.0, c = other.u, overwrite_c = True)
     
     # perform q, r = QR(component); code hacked out of scipy.linalg.qr
     logger.debug("computing QR of %s dense matrix", other.u.shape)
     geqrf, = get_lapack_funcs(('geqrf',), (other.u,))
     # the lwork=-1 call only queries the workspace size; the second call factorizes
     qr, tau, work, info = geqrf(other.u, lwork = -1, overwrite_a = True) # sometimes segfaults with overwrite_a=True...?
     qr, tau, work, info = geqrf(other.u, lwork = work[0], overwrite_a = True) # sometimes segfaults with overwrite_a=True...?
     del other.u
     assert info >= 0
     r = triu(qr[:n2, :n2])
     if m < n2: # rare case, #features < #topics
         qr = qr[:, :m] # retains fortran order
     gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
     q, work, info = gorgqr(qr, tau, lwork = -1, overwrite_a = True)
     q, work, info = gorgqr(qr, tau, lwork = work[0], overwrite_a = True)
     assert info >= 0, "qr failed"
     assert q.flags.f_contiguous
     
     # find rotation that diagonalizes r
     k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s], [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s]])
     logger.debug("computing SVD of %s dense matrix", k.shape)
     try:
         # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
         # for these early versions of numpy, catch the error and try to compute
         # SVD again, but over k*k^T.
         # see http://www.mail-archive.com/[email protected]/msg07224.html and
         # bug ticket http://projects.scipy.org/numpy/ticket/706
         u_k, s_k, _ = numpy.linalg.svd(k, full_matrices = False) # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
     except numpy.linalg.LinAlgError:
         # report through the module-level `logger`, like every other
         # message in this method, rather than through the root logger
         logger.error("SVD(A) failed; trying SVD(A * A^T)")
         u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices = False) # if this fails too, give up
         s_k = numpy.sqrt(s_k) # go back from eigen values to singular values
     
     k = clipSpectrum(s_k, self.k)
     u_k, s_k = u_k[:, :k], s_k[:k]
     
     # update & rotate current basis U
     logger.debug("updating orthonormal basis U")
     self.u = gemm(1.0, self.u, u_k[:n1]) # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
     gemm(1.0, q, u_k[n1:], beta = 1.0, c = self.u, overwrite_c = True) # u = [u,u']*u_k
     self.s = s_k
예제 #13
0
파일: lsimodel.py 프로젝트: andremi/gensim
 def merge(self, other, decay=1.0):
     """
     Merge this Projection with another. 
     
     The content of `other` is destroyed in the process, so pass this function a 
     copy of `other` if you need it further.
     
     `decay` multiplies the singular values of `self` before they are combined
     with those of `other`.
     
     Raises `ValueError` if the two projections have a different number of
     features (`self.m != other.m`). If the dense SVD of the combined matrix
     fails with `LinAlgError`, it is retried once over `k * k^T`; a failure
     of that retry propagates to the caller.
     """
     if other.u is None:
         # the other projection is empty => do nothing
         return
     if self.u is None:
         # we are empty => result of merge is the other projection, whatever it is
         self.u = other.u.copy('F')
         self.s = other.s.copy()
         return
     if self.m != other.m:
         raise ValueError("vector space mismatch: update is using %s features, expected %s" %
                          (other.m, self.m))
     logger.info("merging projections: %s + %s", self.u.shape, other.u.shape)
     m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
     # TODO Maybe keep the bases as elementary reflectors, without 
     # forming explicit matrices with ORGQR.
     # The only operation we ever need is basis^T*basis and basis*component.
     # But how to do that in scipy? And is it fast(er)?
     
     # find component of u2 orthogonal to u1
     # IMPORTANT: keep matrices in memory suitable order for matrix products; failing to do so gives 8x lower performance :(
     self.u = numpy.asfortranarray(self.u) # does nothing if input already fortran-order array
     other.u = numpy.asfortranarray(other.u)
     gemm = matutils.blas('gemm', self.u)
     logger.debug("constructing orthogonal component")
     c = gemm(1.0, self.u, other.u, trans_a = True)
     # other.u <- other.u - self.u * c, written into other.u itself
     gemm(-1.0, self.u, c, beta = 1.0, c = other.u, overwrite_c = True)
     
     # wrap the array in a list so that qr_destroy can take the reference
     # away from us (the assert below checks the list comes back empty)
     other.u = [other.u]
     q, r = matutils.qr_destroy(other.u) # q, r = QR(component)
     assert not other.u
     
     # find the rotation that diagonalizes r
     k = numpy.bmat([[numpy.diag(decay * self.s), c * other.s], [matutils.pad(numpy.matrix([]).reshape(0, 0), min(m, n2), n1), r * other.s]])
     logger.debug("computing SVD of %s dense matrix", k.shape)
     try:
         # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
         # for these early versions of numpy, catch the error and try to compute
         # SVD again, but over k*k^T.
         # see http://www.mail-archive.com/[email protected]/msg07224.html and
         # bug ticket http://projects.scipy.org/numpy/ticket/706
         u_k, s_k, _ = numpy.linalg.svd(k, full_matrices = False) # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :(
     except numpy.linalg.LinAlgError:
         # report through the module-level `logger`, like every other
         # message in this method, rather than through the root logger
         logger.error("SVD(A) failed; trying SVD(A * A^T)")
         u_k, s_k, _ = numpy.linalg.svd(numpy.dot(k, k.T), full_matrices = False) # if this fails too, give up with an exception
         s_k = numpy.sqrt(s_k) # go back from eigen values to singular values
     
     # note: the spectrum is clipped on the squared singular values
     k = clipSpectrum(s_k ** 2, self.k)
     u1_k, u2_k, s_k = u_k[:n1, :k].copy('F'), u_k[n1:, :k].copy('F'), s_k[:k]
     
     # update & rotate current basis U = [U, U']*[U1_k, U2_k]
     logger.debug("updating orthonormal basis U")
     self.u = gemm(1.0, self.u, u1_k) # TODO temporarily creates an extra (m,k) dense array in memory. find a way to avoid this!
     gemm(1.0, q, u2_k, beta = 1.0, c = self.u, overwrite_c = True)
     self.s = s_k
    def svdAddCols(self, docs, decay = 1.0, reorth = False):
        """
        Append the observations in `docs` as new columns of the decomposed matrix.
        
        With `X = self.u * self.s * self.v^T` being the decomposition held so far,
        the factors are updated in place so that afterwards
        `self.u * self.s * self.v^T = [X docs.T]`.
        
        `docs` is a **dense** matrix with one new observation per row. The current
        singular values are multiplied by `decay` before the update. With `reorth`
        set, the updated factors are re-orthogonalized at the end; this needs the
        right singular vectors, so `TypeError` is raised when `self.v` is `None`.
        When `self.v` is `None`, only `self.u` and `self.s` are updated.
        """
        haveV = self.v is not None
        if not haveV:
            if reorth:
                raise TypeError("cannot reorthogonalize without the right singular vectors (v must not be None)")
        newCols = numpy.asmatrix(numpy.asarray(docs)).T # one column per new document
        numTerms, rank = self.u.shape
        if haveV:
            _, rankV = self.v.shape
            assert rank == rankV, "left/right singular vectors shape mismatch!"
        numTermsNew, numNew = newCols.shape
        assert numTerms == numTermsNew, "new documents must be in the same term-space as the original documents (old %s, new %s)" % (self.u.shape, newCols.shape)
        
        # construct orthogonal basis for (I - U * U^T) * A
        logging.debug("constructing orthogonal component")
        projected = self.u.T * newCols # documents expressed in the eigenspace; (k, m) * (m, c) = (k, c)
        logging.debug("computing orthogonal basis")
        residual = newCols - self.u * projected
        basis, basisR = numpy.linalg.qr(residual) # equation (2)

        # down-weight the old singular values, so that the decomposition can
        # re-orient towards new trends in the document stream
        self.s *= decay
        
        # Build K, equation (4). It is mostly diagonal and sparse, and only of
        # shape (k + c, k + c), so a direct SVD ought to be fast as long as the
        # number of documents added at a time stays reasonably small (tens or
        # hundreds).
        zeroBlock = matutils.pad(numpy.matrix([]).reshape(0, 0), numNew, rank)
        topRow = [self.s, projected]
        bottomRow = [zeroBlock, basisR]
        core = numpy.bmat([topRow, bottomRow])
        logging.debug("computing %s SVD" % (core.shape,))
        # no python linalg wrapper for a partial svd => all k + c factors are requested :(
        rotU, sigma, rotVt = numpy.linalg.svd(core, full_matrices = False)
        keptShare = numpy.sum(sigma[: rank]) / numpy.sum(sigma)
        lost = 1.0 - keptShare
        logging.debug("discarding %.1f%% of data variation" % (100 * lost))
        
        # cut the full decomposition back down to the requested rank
        rotU = numpy.matrix(rotU[:, :rank])
        sigma = numpy.matrix(numpy.diag(sigma[: rank]))
        rotV = numpy.matrix(rotVt.T[:, :rank]) # numpy returns V transposed, so transpose it back: V.T.T = V
        
        # and finally update the left/right singular vectors
        logging.debug('rotating subspaces')
        self.s = sigma
        
        # U is updated in separate steps rather than in one big expression, to
        # avoid (huge) temporary arrays and running out of memory
        basis = basis * rotU[rank:]
        self.u = self.u * rotU[:rank]
        self.u += basis # (m, k) * (k, k) + (m, c) * (c, k) = (m, k), equation (5)
        del basis # free up memory
        
        if haveV:
            self.v = self.v * rotV[:rank, :] # rotate the old rows: (n, k) * (k, k) = (n, k)
            newRows = rotV[rank:, :] # (c, k), one row per added document
            self.v = numpy.bmat([[self.v], [newRows]]) # stacked: (n + c, k)
            
            if reorth:
                logging.debug("re-orthogonalizing the decomposition")
                qU, rU = numpy.linalg.qr(self.u)
                qV, rV = numpy.linalg.qr(self.v)
                rotU, sigma, rotVt = numpy.linalg.svd(rU * self.s * rV.T, full_matrices = False)
                rotU = numpy.matrix(rotU[:, :rank])
                sigma = numpy.matrix(numpy.diag(sigma[: rank]))
                rotV = numpy.matrix(rotVt.T[:, :rank])
                
                change = 100.0 * numpy.sum(numpy.abs(self.s - sigma)) / numpy.sum(numpy.abs(self.s))
                logging.debug("adjusting singular values by %f%%" % 
                              (change))
                self.u = qU * rotU
                self.s = sigma
                self.v = qV * rotV
        logging.debug("added %i documents" % len(docs))