def initialize(self, corpus, chunks = 100, keepDecomposition = False):
        """
        Run SVD decomposition on the corpus. This will define the latent space into 
        which terms and documents will be mapped.
        
        The SVD is created incrementally, in blocks of `chunks` documents. In the
        end, a `self.projection` matrix is constructed that can be used to transform 
        documents into the latent space. The `U, S, V` decomposition itself is 
        discarded, unless `keepDecomposition` is True, in which case it is stored 
        in `self.u`, `self.s` and `self.v`.
        
        The algorithm is adapted from:
        **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
        """
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            maxId = -1
            for document in corpus:
                maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
            self.numTerms = 1 + maxId
            self.id2word = dict(zip(xrange(self.numTerms), xrange(self.numTerms)))
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        # initialize decomposition (zero documents so far)
        self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics))) # leave default numeric type (=double)
        self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)))
        #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)))
        self.v = None
        
        # do the actual work -- perform iterative singular value decomposition
        # this is done by sequentially updating SVD with `chunks` new documents
        chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
        for chunkNo, (key, group) in enumerate(chunker):
            # convert the chunk of documents to vectors
            docs = [matutils.doc2vec(doc, self.numTerms) for docNo, doc in group]
#            self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
            self.svdAddCols(docs, reorth = False)
            logging.info("processed documents up to #%s" % docNo)
        

        # calculate projection needed to get document-topic matrix from term-document matrix.
        #
        # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u.T * x
        # (`self.u` has orthonormal columns, so its left inverse is simply its transpose),
        # i.e. the projection would be self.s^-1 * self.u.T.
        #
        # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
        # we pre-multiply v * s (i.e., scale the axes by the singular values) and return
        # that directly as the representation of `x` in LSI space.
        #
        # this conveniently simplifies to lsi[x] = self.s * self.s^-1 * self.u.T * x = self.u.T * x,
        # so the projection is just self.u.T
        #
        # note that neither `v` (the right singular vectors) nor `s` (the singular
        # values) is used at all in the transformation
        self.projection = self.u.T
        
        if not keepDecomposition:
            # once we have the projection stored in self, discard u and v to free up memory (the singular values in `self.s` are kept)
            del self.u, self.v
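
A quick standalone sanity check of the reasoning in the comments above (not part of the original module; the random toy matrix and `numpy.linalg.svd` are used purely for illustration): for a thin SVD A = U * S * V^T, folding a document column x back in via u.T gives its right-singular representation scaled by the singular values.

import numpy

A = numpy.random.rand(50, 10)                        # toy term-document matrix (terms x documents)
U, s, Vt = numpy.linalg.svd(A, full_matrices=False)  # thin SVD: A = U * diag(s) * Vt
x = A[:, 0]                                          # first document, as a dense column

scaled = numpy.dot(U.T, x)     # what `self.projection * vec` computes above: u.T * x = s * v
nonscaled = scaled / s         # s^-1 * u.T * x = v, the plain right-singular representation

assert numpy.allclose(nonscaled, Vt[:, 0])   # matches the document's column of V^T
assert numpy.allclose(scaled, s * Vt[:, 0])  # u.T * x is the same vector, scaled by s
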
 def __getitem__(self, bow):
     """
     Return topic distribution, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding the input document into the latent topic space.
     """
     vec = matutils.doc2vec(bow, self.numTerms)
     vec.shape = (self.numTerms, 1)
     topicDist = self.projection * vec
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
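
A hypothetical usage sketch for the fold-in above (the tiny corpus, document and printed values are made up for illustration; it assumes `lsimodel` is importable, as in the test snippets later in this listing):

corpus = [[(0, 1.0), (3, 2.0)], [(1, 1.0), (2, 1.0), (3, 1.0)]]  # tiny bag-of-words corpus
model = lsimodel.LsiModel(corpus, numTopics=2)                   # builds the projection during initialization
doc = corpus[0]
print(model[doc])                                                # e.g. [(0, 0.84), (1, -0.13)] -- one entry per latent topic
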
 def testTransform(self):
     # create the transformation model
     model = ldamodel.LdaModel(self.corpus, numTopics = 2)
     
     # transform one document
     doc = list(self.corpus)[0]
     transformed = model[doc]
     
     vec = matutils.doc2vec(transformed, 2) # convert to dense vector, for easier equality tests
     expected = [0.0, 1.0]
     self.assertTrue(numpy.allclose(sorted(vec), sorted(expected))) # must contain the same values, up to re-ordering
 def testTransform(self):
     # create the transformation model
     model = lsimodel.LsiModel(self.corpus, numTopics = 2)
     
     # transform one document
     doc = list(self.corpus)[0]
     transformed = model[doc]
     
     vec = matutils.doc2vec(transformed, 2) # convert to dense vector, for easier equality tests
     expected = [0.1973928, 0.05591352]
     self.assertTrue(numpy.allclose(abs(vec), expected)) # transformed entries must be equal up to sign
 def testTransform(self):
     # create the transformation model
     model = lsimodel.LsiModel(self.corpus, numTopics = 2)
     
     # transform one document
     doc = list(self.corpus)[0]
     transformed = model[doc]
     vec = matutils.doc2vec(transformed, 2) # convert to dense vector, for easier equality tests
     
     expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
     # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
     
     self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign
    def initialize(self, corpus, chunks = 100, keepDecomposition = False):
        """
        Run SVD decomposition on the corpus. This will define the latent space into 
        which terms and documents will be mapped.
        
        The SVD is created incrementally, in blocks of `chunks` documents.
        
        The algorithm is adapted from:
        M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition
        """
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            maxId = -1
            for document in corpus:
                maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
            self.numTerms = 1 + maxId
            self.id2word = dict(zip(xrange(self.numTerms), xrange(self.numTerms)))
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        # initialize decomposition (zero documents so far)
        self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics))) # leave default numeric type (=double)
        self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)))
        #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)))
        self.v = None
        
        # do the actual work -- perform iterative singular value decomposition
        # this is done by sequentially updating SVD with `chunks` new documents
        chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
        for chunkNo, (key, group) in enumerate(chunker):
            # convert the chunk of documents to vectors
            docs = [matutils.doc2vec(doc, self.numTerms) for docNo, doc in group]
#            self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
            self.svdAddCols(docs, reorth = False)
            logging.info("processed documents up to #%s" % docNo)
        

        # calculate projection needed to get document-topic matrix from term-document matrix.
        # note that v (topics of the training corpus) are not used at all for the transformation
        invS = numpy.diag(numpy.diag(1.0 / self.s))
        self.projection = numpy.dot(invS, self.u.T) # s^-1 * u^-1; (k, k) * (k, m) = (k, m)
        if not keepDecomposition:
            # once we have the projection stored in self, discard u*s*v decomposition to free up memory
            del self.u, self.s, self.v
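
The incremental update above streams the corpus in fixed-size blocks via `itertools.groupby`. A minimal standalone sketch of just that chunking idiom (the toy corpus and block size of 3 are made up; `//` replaces the Python 2 integer division `val[0] / chunks` used in the method):

import itertools

corpus = ['doc%i' % i for i in range(7)]  # stand-in for any streamed corpus
chunker = itertools.groupby(enumerate(corpus), key=lambda val: val[0] // 3)
blocks = [[doc for docNo, doc in group] for chunkNo, (key, group) in enumerate(chunker)]
assert blocks == [['doc0', 'doc1', 'doc2'], ['doc3', 'doc4', 'doc5'], ['doc6']]
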
 def __getitem__(self, bow, scaled = True):
     """
     Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.
     
     This is done by folding the input document into the latent topic space.
     
     Note that this function returns the latent space representation **scaled by the
     singular values**. To return non-scaled embedding, set `scaled` to False.
     """
     # if the input vector is in fact a corpus, return a transformed corpus as result
     if utils.isCorpus(bow):
         return self._apply(bow)
     
     vec = matutils.doc2vec(bow, self.numTerms)
     vec.shape = (self.numTerms, 1)
     topicDist = self.projection * vec
     if not scaled:
         topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
     return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
             if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
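
One usage note for the method above (the `model` and `doc` names are illustrative, carried over from the earlier sketch): the indexing syntax `model[doc]` always uses the default `scaled=True`, so the non-scaled embedding has to be requested through an explicit method call.

scaled_vec = model[doc]                           # representation scaled by the singular values
plain_vec = model.__getitem__(doc, scaled=False)  # same directions, without the scaling
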