def saveCorpus(fname, corpus, id2word = None):
     """
     Save a corpus in the List-of-words format.
     """
     if id2word is None:
         logging.info("no word id mapping provided; initializing from corpus")
         id2word = utils.dictFromCorpus(corpus)
     
     logging.info("storing corpus in List-Of-Words format: %s" % fname)
     truncated = 0
     fout = open(fname, 'w')
     fout.write('%i\n' % len(corpus))
     for doc in corpus:
         words = []
         for wordId, value in doc:
             if abs(int(value) - value) > 1e-6:
                 truncated += 1
             words.extend([str(id2word[wordId])] * int(value))
         fout.write('%s\n' % ' '.join(words))
     fout.close()
     
     if truncated:
         logging.warning("List-of-words format can only save vectors with \
         integer entries; %i float entries were truncated to integer value" % 
         truncated)
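
For reference, a minimal standalone sketch (not gensim itself; the toy `corpus` and `id2word` below are made up) of what the List-of-words writer above produces: a header line with the document count, then one line per document with each word repeated `count` times.

    # Toy illustration of the List-of-words output written by saveCorpus above.
    corpus = [[(0, 2), (1, 1)], [(1, 3)]]      # two bag-of-words documents
    id2word = {0: 'apple', 1: 'banana'}

    lines = ['%i' % len(corpus)]               # header: number of documents
    for doc in corpus:
        words = []
        for wordId, value in doc:
            words.extend([str(id2word[wordId])] * int(value))
        lines.append(' '.join(words))
    print('\n'.join(lines))
    # 2
    # apple apple banana
    # banana banana banana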
Example #2
    def saveCorpus(fname, corpus, id2word=None):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.
        
        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logging.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dictFromCorpus(corpus)
            numTerms = len(id2word)
        else:
            numTerms = 1 + max([-1] + id2word.keys())

        logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
        offsets = []
        with open(fname, 'w') as fout:
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                fout.write("%i %s\n" % (len(doc), ' '.join("%i:%s" % p for p in doc)))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fnameVocab = fname + '.vocab'
        logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
        with open(fnameVocab, 'w') as fout:
            for featureId in xrange(numTerms):
                fout.write("%s\n" % utils.toUtf8(id2word.get(featureId, '---')))

        return offsets
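
A quick sketch of the LDA-C line format written above (toy values only, not gensim code): each document becomes one line that starts with the number of unique terms, followed by `termId:count` pairs.

    # Toy illustration of one LDA-C corpus line, as written by saveCorpus above.
    doc = [(0, 1), (3, 2), (7, 1)]             # bag-of-words: (termId, count) pairs
    line = "%i %s" % (len(doc), ' '.join("%i:%s" % p for p in doc))
    print(line)                                # -> "3 0:1 3:2 7:1"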
Example #3
    def saveCorpus(fname, corpus, id2word=None):
        """
        Save a corpus in the List-of-words format.
        
        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logging.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dictFromCorpus(corpus)

        logging.info("storing corpus in List-Of-Words format: %s" % fname)
        truncated = 0
        offsets = []
        with open(fname, 'w') as fout:
            fout.write('%i\n' % len(corpus))
            for doc in corpus:
                words = []
                for wordId, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([str(id2word[wordId])] * int(value))
                offsets.append(fout.tell())
                fout.write('%s\n' % ' '.join(words))

        if truncated:
            logging.warning(
                "List-of-words format can only save vectors with "
                "integer elements; %i float entries were truncated to integer value"
                % truncated)
        return offsets
Example #4
 def saveCorpus(fname, corpus, id2word = None):
     """
     Save a corpus in the LDA-C format.
     
     There are actually two files saved: fname and fname.vocab, where
     fname.vocab is the vocabulary file.
     """
     if id2word is None:
         logging.info("no word id mapping provided; initializing from corpus")
         id2word = utils.dictFromCorpus(corpus)
         numTerms = len(id2word)
     else:
         numTerms = 1 + max([-1] + id2word.keys())
     
     logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
     fout = open(fname, 'w')
     for doc in corpus:
         fout.write("%i %s\n" % (len(doc), ' '.join("%i:%f" % p for p in doc)))
     fout.close()
     
     # write out vocabulary, in a format compatible with Blei's topics.py script
     fnameVocab = fname + '.vocab'
     logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
     fout = open(fnameVocab, 'w')
     for featureId in xrange(numTerms):
         fout.write("%s\n" % id2word.get(featureId, '---'))
     fout.close()
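
The `1 + max([-1] + id2word.keys())` idiom used above (and throughout these examples) deserves a note: prepending -1 keeps `max` well-defined even for an empty mapping, so the vocabulary size comes out as 0 instead of raising. A tiny illustration with hypothetical mappings (under Python 3, `keys()` must be wrapped in `list()` as shown):

    id2word = {0: 'apple', 5: 'banana'}
    print(1 + max([-1] + list(id2word.keys())))   # -> 6, i.e. highest word id + 1
    print(1 + max([-1] + list({}.keys())))        # -> 0 for an empty mapping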
Example #5
 def saveCorpus(fname, corpus, id2word = None):
     """
      Save a corpus in the LDA-C format.
     
     There are actually two files saved: `fname` and `fname.vocab`, where
     `fname.vocab` is the vocabulary file.
     """
     if id2word is None:
         logging.info("no word id mapping provided; initializing from corpus")
         id2word = utils.dictFromCorpus(corpus)
         numTerms = len(id2word)
     else:
         numTerms = 1 + max([-1] + id2word.keys())
     
     logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
     fout = open(fname, 'w')
     for doc in corpus:
         doc = list(doc)
         fout.write("%i %s\n" % (len(doc), ' '.join("%i:%s" % p for p in doc)))
     fout.close()
     
     # write out vocabulary, in a format compatible with Blei's topics.py script
     fnameVocab = fname + '.vocab'
     logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
     fout = open(fnameVocab, 'w')
     for featureId in xrange(numTerms):
         fout.write("%s\n" % utils.toUtf8(id2word.get(featureId, '---')))
     fout.close()
 def initialize(self, corpus):
     """
     Compute inverse document weights, which will be used to modify term 
     frequencies for documents.
     """
     if self.id2word is None:
         logging.info("no word id mapping provided; initializing from corpus, assuming identity")
         self.id2word = utils.dictFromCorpus(corpus)
         self.numTerms = len(self.id2word)
     else:
         self.numTerms = 1 + max([-1] + self.id2word.keys())
     
     logging.info("calculating IDF weights over %i documents" % len(corpus))
     idfs = {}
     numNnz = 0
     for docNo, bow in enumerate(corpus):
         if docNo % 5000 == 0:
             logging.info("PROGRESS: processing document %i/%i" % 
                          (docNo, len(corpus)))
         numNnz += len(bow)
         for termId, termCount in bow:
             idfs[termId] = idfs.get(termId, 0) + 1
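     # note: after the loop, docNo + 1 equals the total number of documents (the loop variable docNo is reused below)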
     idfs = dict((termId, math.log(1.0 * (docNo + 1) / docFreq, 2)) # the IDF weight formula 
                 for termId, docFreq in idfs.iteritems())
     
     self.idfs = idfs
     
     # keep some stats about the training corpus
     self.numDocs = len(corpus)
     self.numNnz = numNnz
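
The IDF weight computed above is log2(numDocs / docFreq). A small worked example, assuming 8 documents and a term that appears in 2 of them:

    import math
    numDocs, docFreq = 8, 2
    idf = math.log(1.0 * numDocs / docFreq, 2)    # same formula as in initialize() above
    print(idf)                                    # -> 2.0; rarer terms get higher weight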
    def initialize(self, corpus, chunks = 100, keepDecomposition = False, dtype = numpy.float64):
        """
        Run SVD decomposition on the corpus. This will define the latent space into 
        which terms and documents will be mapped.
        
        The SVD is created incrementally, in blocks of `chunks` documents. In the
        end, a `self.projection` matrix is constructed that can be used to transform 
        documents into the latent space. The `U, S, V` decomposition itself is 
        discarded, unless `keepDecomposition` is True, in which case it is stored 
        in `self.u`, `self.s` and `self.v`.
        
        `dtype` dictates precision used for intermediate computations; the final 
        projection will however always be of type numpy.float32.
        
        The algorithm is adapted from:
        **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
        """
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        # initialize decomposition (zero documents so far)
        self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics)), dtype = dtype)
        self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)), dtype = dtype)
        #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)), dtype = dtype)
        self.v = None
        
        # do the actual work -- perform iterative singular value decomposition.
        # this is done by sequentially updating SVD with `chunks` new documents
        chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
        for chunkNo, (key, group) in enumerate(chunker):
            # convert the chunk of sparse documents to full vectors
            docs = [matutils.sparse2full(doc, self.numTerms) for docNo, doc in group]
#            self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
            self.svdAddCols(docs, reorth = False)
            logging.info("processed documents up to #%s" % docNo)
        

        # calculate projection needed to get document-topic matrix from term-document matrix.
        #
        # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u^-1 * x,
        # so the projection is self.s^-1 * self.u^-1.
        #
        # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
        # we pre-multiply v * s (ie., scale axes by singular values), and return
        # that directly as the representation of `x` in LSI space.
        #
        # this conveniently simplifies to lsi[x] = self.u.T * x, so the projection is 
        # just self.u.T
        # 
        # note that neither `v` (the right singular vectors) nor `s` (the singular 
        # values) are used at all in the transformation
        self.projection = self.u.T.astype(numpy.float32).copy('C')
        
        if not keepDecomposition:
            # once we have the projection stored in self, discard u*s*v decomposition to free up memory
            del self.u, self.v
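
The comment block above boils the fold-in down to lsi[x] = u.T * x. A minimal numpy sketch of that projection step (the random orthonormal `u` here is purely a stand-in for the left singular vectors, and the sizes are toy values):

    import numpy
    numTerms, numTopics = 6, 2
    # illustrative orthonormal columns standing in for the left singular vectors
    u, _ = numpy.linalg.qr(numpy.random.rand(numTerms, numTopics))
    x = numpy.random.rand(numTerms)               # a dense document vector
    topics = numpy.dot(u.T, x)                    # representation of x in the latent space
    print(topics.shape)                           # -> (2,)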
Example #8
    def saveCorpus(fname, corpus, id2word=None):
        """
        Save a corpus in the List-of-words format.
        
        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logging.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dictFromCorpus(corpus)

        logging.info("storing corpus in List-Of-Words format: %s" % fname)
        truncated = 0
        offsets = []
        with open(fname, "w") as fout:
            fout.write("%i\n" % len(corpus))
            for doc in corpus:
                words = []
                for wordId, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([str(id2word[wordId])] * int(value))
                offsets.append(fout.tell())
                fout.write("%s\n" % " ".join(words))

        if truncated:
            logging.warning(
                "List-of-words format can only save vectors with "
                "integer elements; %i float entries were truncated to integer value" % truncated
            )
        return offsets
    def initialize(self, corpus, chunks=100, keepDecomposition=False, dtype=numpy.float64):
        """
        Run SVD decomposition on the corpus. This will define the latent space into 
        which terms and documents will be mapped.
        
        The SVD is created incrementally, in blocks of `chunks` documents. In the
        end, a `self.projection` matrix is constructed that can be used to transform 
        documents into the latent space. The `U, S, V` decomposition itself is 
        discarded, unless `keepDecomposition` is True, in which case it is stored 
        in `self.u`, `self.s` and `self.v`.
        
        `dtype` dictates precision used for intermediate computations; the final 
        projection will however always be of type numpy.float32.
        
        The algorithm is adapted from:
        **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
        """
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        # initialize decomposition (zero documents so far)
        self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics)), dtype=dtype)
        self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)), dtype=dtype)
        # self.v = numpy.matrix(numpy.zeros((0, self.numTopics)), dtype = dtype)
        self.v = None

        # do the actual work -- perform iterative singular value decomposition.
        # this is done by sequentially updating SVD with `chunks` new documents
        chunker = itertools.groupby(enumerate(corpus), key=lambda val: val[0] / chunks)
        for chunkNo, (key, group) in enumerate(chunker):
            # convert the chunk of sparse documents to full vectors
            docs = [matutils.sparse2full(doc, self.numTerms) for docNo, doc in group]
            #            self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
            self.svdAddCols(docs, reorth=False)
            logging.info("processed documents up to #%s" % docNo)

        # calculate projection needed to get document-topic matrix from term-document matrix.
        #
        # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u^-1 * x,
        # so the projection is self.s^-1 * self.u^-1.
        #
        # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
        # we pre-multiply v * s (ie., scale axes by singular values), and return
        # that directly as the representation of `x` in LSI space.
        #
        # this conveniently simplifies to lsi[x] = self.u.T * x, so the projection is
        # just self.u.T
        #
        # note that neither `v` (the right singular vectors) nor `s` (the singular
        # values) are used at all in the transformation
        self.projection = self.u.T.astype(numpy.float32).copy("C")

        if not keepDecomposition:
            # once we have the projection stored in self, discard u*s*v decomposition to free up memory
            del self.u, self.v
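
The `itertools.groupby(enumerate(corpus), key=lambda val: val[0] / chunks)` idiom above splits a document stream into consecutive fixed-size blocks (it relies on Python 2's integer division; use `//` on Python 3, as in this sketch). A standalone illustration with toy documents:

    import itertools
    corpus = ['doc%i' % i for i in range(7)]      # any iterable of documents
    chunks = 3
    chunker = itertools.groupby(enumerate(corpus), key=lambda val: val[0] // chunks)
    for chunkNo, (key, group) in enumerate(chunker):
        docs = [doc for docNo, doc in group]
        print(docs)
    # -> three blocks of up to `chunks` documents: indices 0-2, 3-5, then 6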
    def __init__(self, corpus = None, id2word = None, numTopics = 200, extraDims = 10, 
                 chunks = 100, dtype = numpy.float64):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        `extraDims` is the number of extra dimensions that will be internally 
        computed (ie. `numTopics + extraDims`) to improve numerical properties of 
        the SVD algorithm. These extra dimensions will be eventually chopped off
        for the final projection. Set to 0 to save memory; set to a value between
        ~10 and 2*numTopics for increased SVD precision.
        
        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        The algorithm is based on
        **Brand, 2006: Fast low-rank modifications of the thin singular value decomposition**.
    
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics = 10)
        >>> print lsi[doc_tfidf]
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        """
        self.id2word = id2word
        self.numTopics = numTopics # number of latent topics
        self.extraDims = extraDims
        self.dtype = dtype
        
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
        
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        self.projection = numpy.asmatrix(numpy.zeros((self.numTopics, self.numTerms), dtype = dtype))
        self.u = None
        self.s = numpy.asmatrix(numpy.zeros((self.numTopics + self.extraDims, self.numTopics + self.extraDims)), dtype = dtype)
        self.v = None

        if corpus is not None:
            self.addDocuments(corpus, chunks = chunks, updateProjection = True)
 def initialize(self, corpus):
     """
     Initialize the random projection matrix.
     """
     if self.id2word is None:
         logging.info("no word id mapping provided; initializing from corpus, assuming identity")
         self.id2word = utils.dictFromCorpus(corpus)
         self.numTerms = len(self.id2word)
     else:
         self.numTerms = 1 + max([-1] + self.id2word.keys())
         
     # Now construct the projection matrix itself.
     # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
     # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
     randmat = 1 - 2 * numpy.random.binomial(1, 0.5, (self.numTopics, self.numTerms)) # convert from 0/1 to +1/-1
     self.projection = numpy.asmatrix(randmat, dtype = numpy.float32) # convert from int32 to floats, for faster multiplications
 def initialize(self, corpus):
     """
     Initialize the random projection matrix.
     """
     if self.id2word is None:
         logging.info("no word id mapping provided; initializing from corpus, assuming identity")
         self.id2word = utils.dictFromCorpus(corpus)
         self.numTerms = len(self.id2word)
     else:
         self.numTerms = 1 + max([-1] + self.id2word.keys())
         
     # Now construct the projection matrix itself.
     #
     # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
     # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
     tmp = numpy.random.binomial(1, 0.5, (self.numTopics, self.numTerms)) # FIXME temporary array unnecessarily big (int32 -> int8)
     self.projection = numpy.asmatrix(1 - 2 * tmp.astype(numpy.int8)) # convert from 0/1 to +1/-1
Example #13
    def initialize(self, corpus):
        """
        Initialize the random projection matrix.
        """
        if self.id2word is None:
            logging.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        shape = self.numTopics, self.numTerms
        logger.info("constructing %s random matrix" % str(shape))
        # Now construct the projection matrix itself.
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape) # convert from 0/1 to +1/-1
        self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32) # convert from int32 to floats, for faster multiplications
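
As the comments say, the projection matrix follows Achlioptas' ±1 construction. A minimal sketch of building such a matrix and projecting a dense document vector with it (toy sizes, not the class above):

    import numpy
    numTopics, numTerms = 3, 10
    randmat = 1 - 2 * numpy.random.binomial(1, 0.5, (numTopics, numTerms))  # entries are +1/-1
    projection = randmat.astype(numpy.float32)
    x = numpy.random.rand(numTerms)               # a dense document vector
    print(numpy.dot(projection, x).shape)         # -> (3,), the reduced representation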
Example #14
    def __init__(self, corpus, id2word = None, numTopics = 200, alpha = None, initMode = 'random'):
        """
        Initialize the model based on corpus.
        
        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic 
        printing.
        
        `numTopics` is the number of requested topics.
        
        `alpha` is either None (to be estimated during training) or a number 
        between (0.0, 1.0).
        """
        # store user-supplied parameters
        self.id2word = id2word

        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        self.numTopics = numTopics # number of latent topics
        
        # internal constants; can be manually changed after having called this constructor and before calling `initialize()`
        self.ESTIMATE_ALPHA = alpha is None
        if alpha is None: # no alpha supplied by user => get some initial estimate
            alpha = 10.0 / numTopics # initial estimate (Steyvers&Griffiths: Probabilistic Topic Models suggest 50 / numTopics)
        self.alpha = min(0.99999, max(0.00001, alpha)) # dirichlet prior; make sure it's within bounds

        # set EM training constants
        self.EM_MAX_ITER = 50 # maximum number of EM iterations; usually converges much earlier
        self.EM_CONVERGED = 0.0001 # relative difference between two iterations; if lower than this, stop the EM training 
        self.VAR_MAX_ITER = 20 # maximum number of document inference iterations
        self.VAR_CONVERGED = 0.000001 # relative difference between document inference iterations needed to stop sooner than VAR_MAX_ITER
        
        if corpus is not None:
            self.initialize(corpus, initMode)
Example #15
    def saveCorpus(fname, corpus, id2word=None):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.
        
        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logging.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dictFromCorpus(corpus)
            numTerms = len(id2word)
        else:
            numTerms = 1 + max([-1] + id2word.keys())

        logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
        offsets = []
        with open(fname, 'w') as fout:
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                fout.write("%i %s\n" % (len(doc), ' '.join("%i:%s" % p for p in doc)))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fnameVocab = fname + '.vocab'
        logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
        with open(fnameVocab, 'w') as fout:
            for featureId in xrange(numTerms):
                fout.write("%s\n" % utils.toUtf8(id2word.get(featureId, '---')))

        return offsets
    def initialize(self, corpus):
        """
        Initialize the random projection matrix.
        """
        if self.id2word is None:
            logging.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        # Now construct the projection matrix itself.
        #
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        tmp = numpy.random.binomial(
            1, 0.5,
            (self.numTopics, self.numTerms
             ))  # FIXME temporary array unnecessarily big (int32 -> int8)
        self.projection = numpy.asmatrix(
            1 - 2 * tmp.astype(numpy.int8))  # convert from 0/1 to +1/-1
Example #17
    def __init__(self,
                 corpus=None,
                 numTopics=200,
                 id2word=None,
                 chunks=20000,
                 decay=1.0,
                 distributed=False,
                 onepass=False):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        If your document stream is one-pass only (the stream cannot be repeated),
        turn on `onepass` to force a single pass SVD algorithm (slower).

        Turn on `distributed` to enforce distributed computing (only makes sense 
        if `onepass` is set at the same time, too).
        
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics=10)
        >>> print lsi[doc_tfidf] # project some document into LSI space
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        """
        self.id2word = id2word
        self.numTopics = int(numTopics)
        self.chunks = int(chunks)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning(
                    "forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        self.docs_processed = 0
        self.projection = Projection(self.numTerms, self.numTopics)

        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError(
                    "distributed stochastic LSA not implemented yet; "
                    "run either distributed one-pass, or serial randomized.")
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.dispatcher@%s' %
                                             ns._pyroUri.location)
                logger.debug("looking for dispatcher at %s" %
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      numTopics=numTopics,
                                      chunks=chunks,
                                      decay=decay,
                                      distributed=False,
                                      onepass=onepass)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" %
                            len(dispatcher.getworkers()))
            except Exception, err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)" % err)
                raise RuntimeError(
                    "failed to initialize distributed LSI (%s)" % err)
Example #18
    def __init__(self, corpus=None, numTopics=100, id2word=None, distributed=False, 
                 chunks=10000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
        """
        `numTopics` is the number of requested latent topics to be extracted from
        the training corpus. 
        
        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic 
        printing.
        
        `alpha` and `eta` are hyperparameters on document-topic (theta) and 
        topic-word (lambda) distributions. Both default to a symmetric 1.0/numTopics 
        (but can be set to a vector, for asymmetric priors).
        
        Turn on `distributed` to force distributed computing (see the web tutorial
        on how to set up a cluster of machines for gensim).
        
        Example:
        
        >>> lda = LdaModel(corpus, numTopics=100)
        >>> print lda[doc_bow] # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print lda[doc_bow]
        
        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
        
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        if self.numTerms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")
        
        self.distributed = bool(distributed)
        self.numTopics = int(numTopics)
        self.chunks = chunks
        self.decay = decay
        self.num_updates = 0
        
        self.passes = passes
        self.update_every = update_every
        
        if alpha is None:
            self.alpha = 1.0 / numTopics
        else:
            self.alpha = alpha
        if eta is None:
            self.eta = 1.0 / numTopics
        else:
            self.eta = eta
        
        # VB constants
        self.VAR_MAXITER = 50
        self.VAR_THRESH = 0.001

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            # set up distributed version
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=id2word, numTopics=numTopics,
                                      chunks=chunks, alpha=alpha, eta=eta, distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception, err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
    def __init__(self,
                 corpus=None,
                 id2word=None,
                 numTopics=200,
                 extraDims=10,
                 chunks=100,
                 dtype=numpy.float64):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        `extraDims` is the number of extra dimensions that will be internally 
        computed (ie. `numTopics + extraDims`) to improve numerical properties of 
        the SVD algorithm. These extra dimensions will be eventually chopped off
        for the final projection. Set to 0 to save memory; set to a value between
        ~10 and 2*numTopics for increased SVD precision.
        
        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        The algorithm is based on
        **Brand, 2006: Fast low-rank modifications of the thin singular value decomposition**.
    
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics = 10)
        >>> print lsi[doc_tfidf]
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        """
        self.id2word = id2word
        self.numTopics = numTopics  # number of latent topics
        self.extraDims = extraDims
        self.dtype = dtype

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logging.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        self.projection = numpy.asmatrix(
            numpy.zeros((self.numTopics, self.numTerms), dtype=dtype))
        self.u = None
        self.s = numpy.asmatrix(numpy.zeros((self.numTopics + self.extraDims,
                                             self.numTopics + self.extraDims)),
                                dtype=dtype)
        self.v = None

        if corpus is not None:
            self.addDocuments(corpus, chunks=chunks, updateProjection=True)
    def __init__(self, corpus=None, numTopics=200, id2word=None, chunks=20000, 
                 decay=1.0, distributed=False, onepass=False):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        If your document stream is one-pass only (the stream cannot be repeated),
        turn on `onepass` to force a single pass SVD algorithm (slower).

        Turn on `distributed` to force distributed computing.
        
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics=10)
        >>> print lsi[doc_tfidf] # project some document into LSI space
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        """
        self.id2word = id2word
        self.numTopics = int(numTopics)
        self.chunks = int(chunks)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning("forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
        
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        self.docs_processed = 0
        self.projection = Projection(self.numTerms, self.numTopics)
        
        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError("distributed stochastic LSA not implemented yet; "
                                          "run either distributed one-pass, or serial randomized.")
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lsi_dispatcher@%s' % ns._pyroUri.location)
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word = self.id2word, numTopics = numTopics, 
                                      chunks = chunks, decay = decay,
                                      distributed = False, onepass = onepass)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
            except Exception, err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)" % err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
Example #21
    def __init__(self, corpus=None, numTopics=100, id2word=None, distributed=False,
                 chunks=1000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
        """
        `numTopics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters on document-topic (theta) and
        topic-word (lambda) distributions. Both default to a symmetric 1.0/numTopics
        (but can be set to a vector, for asymmetric priors).

        Turn on `distributed` to force distributed computing (see the web tutorial
        on how to set up a cluster of machines for gensim).

        Example:

        >>> lda = LdaModel(corpus, numTopics=100)
        >>> print lda[doc_bow] # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print lda[doc_bow]

        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        if self.numTerms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.numTopics = int(numTopics)
        self.chunks = chunks
        self.decay = decay
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every

        if alpha is None:
            self.alpha = 1.0 / numTopics
        else:
            self.alpha = alpha
        if eta is None:
            self.eta = 1.0 / numTopics
        else:
            self.eta = eta

        # VB constants
        self.VAR_MAXITER = 50
        self.VAR_THRESH = 0.001

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            # set up distributed version
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=id2word, numTopics=numTopics,
                                      chunks=chunks, alpha=alpha, eta=eta, distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception, err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
Example #22
    def __init__(self,
                 corpus=None,
                 id2word=None,
                 numTopics=200,
                 chunks=None,
                 decay=1.0,
                 serial_only=None):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        The algorithm will automatically try to find active nodes on other computers
        and run in a distributed manner; if this fails, it falls back to serial mode
        (single core). To suppress distributed computing, set the `serial_only`
        constructor parameter to True.
        
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics = 10)
        >>> print lsi[doc_tfidf]
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        """
        self.id2word = id2word
        self.numTopics = numTopics  # number of latent topics
        if chunks is None:
            # by default, proceed in chunks as big as number of topics, to improve accuracy
            self.chunks = max(numTopics, 100)
        else:
            self.chunks = chunks
        self.decay = decay

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        self.docs_processed = 0
        self.projection = Projection(self.numTerms, self.numTopics)

        if serial_only:
            logger.info("using slave LSI version on this node")
            self.dispatcher = None
        else:
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.dispatcher@%s' %
                                             ns._pyroUri.location)
                logger.debug("looking for dispatcher at %s" %
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      numTopics=numTopics,
                                      chunks=chunks,
                                      decay=decay,
                                      serial_only=True)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" %
                            len(dispatcher.getworkers()))
            except Exception, err:
                if serial_only is not None:
                    # distributed version was specifically requested, so this is an error state
                    logger.error("failed to initialize distributed LSI (%s)" %
                                 err)
                    raise RuntimeError(
                        "failed to initialize distributed LSI (%s)" % err)
                else:
                    # user didn't request distributed specifically; just let him know we're running in serial
                    logger.info(
                        "distributed LSI not available, running LSI in serial mode (%s)"
                        % err)
                self.dispatcher = None
    def __init__(self, corpus = None, id2word = None, numTopics = 200, 
                 chunks = 10000, decay = 1.0, serial_only = None):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        The algorithm will automatically try to find active nodes on other computers
        and run in a distributed manner; if this fails, it falls back to serial mode
        (single core). To suppress distributed computing, set the `serial_only`
        constructor parameter to True.
        
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics = 10)
        >>> print lsi[doc_tfidf]
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        """
        self.id2word = id2word
        self.numTopics = numTopics # number of latent topics
        self.chunks = chunks
        self.decay = decay
        
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
        
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        
        self.docs_processed = 0
        self.projection = Projection(self.numTerms, self.numTopics)

        if serial_only:
            logger.info("using slave LSI version on this node")
            self.dispatcher = None
        else:
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.dispatcher@%s' % ns._pyroUri.location)
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word = self.id2word, numTopics = numTopics, 
                                      chunks = chunks, decay = decay, 
                                      serial_only = True)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
            except Exception, err:
                if serial_only is not None: 
                    # distributed version was specifically requested, so this is an error state
                    logger.error("failed to initialize distributed LSI (%s)" % err)
                    raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
                else:
                    # user didn't request distributed specifically; just let him know we're running in serial
                    logger.info("distributed LSI not available, running LSI in serial mode (%s)" % err)
                self.dispatcher = None
    def __init__(self,
                 corpus=None,
                 numTopics=200,
                 id2word=None,
                 distributed=False,
                 chunks=None,
                 alpha=None,
                 initMode='random',
                 dtype=numpy.float64):
        """
        `numTopics` is the number of requested latent topics to be extracted from
        the training corpus. 
        
        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic 
        printing.
        
        `initMode` can be either 'random', for a fast random initialization of 
        the model parameters, or 'seeded', for an initialization based on a handful
        of real documents. The 'seeded' mode requires an extra sweep over the entire 
        input corpus, and is thus much slower.

        `alpha` is either None (to be estimated during training) or a number 
        between (0.0, 1.0).
        
        Turn on `distributed` to force distributed computing (see the web tutorial
        on how to set up a cluster).
        
        Example:
        
        >>> lda = LdaModel(corpus, numTopics=100)
        >>> print lda[doc_tfidf] # get topic probability distribution for a document
        >>> lda.addDocuments(corpus2) # update LDA with additional documents
        >>> print lda[doc_tfidf]
        
        """
        # store user-supplied parameters
        self.id2word = id2word
        if self.id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        if self.numTerms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.numTopics = int(numTopics)
        self.state = LdaState()
        self.chunks = chunks

        # initialize wordtype/topic counts
        if initMode == 'seeded':  # init from corpus (slow)
            self.state.classWord = self.countsFromCorpus(corpus, numInitDocs=2)
        elif initMode == 'random':  # init with 1/k+noise
            self.state.classWord = 1.0 / self.numTerms + numpy.random.rand(
                self.numTopics, self.numTerms)  # add noise from <0, 1)
        else:
            raise NotImplementedError(
                "LDA initialization mode '%s' not supported" % str(initMode))
        self.state.classWord = self.state.classWord.astype(dtype)

        # internal algorithm constants
        self.estimate_alpha = alpha is None
        if self.estimate_alpha:  # no alpha supplied by user => get some initial estimate
            alpha = 10.0 / numTopics  # initial estimate (Steyvers&Griffiths: Probabilistic Topic Models suggest 50 / numTopics)
        self.alpha = min(0.99999, max(
            0.00001, alpha))  # dirichlet prior; make sure it's within bounds

        # EM training constants
        self.EM_MAX_ITER = 50  # maximum number of EM iterations; usually converges earlier
        self.EM_CONVERGED = 1e-4  # relative difference between two iterations; if lower than this, stop the EM training
        self.VAR_MAX_ITER = 20  # maximum number of document inference iterations
        self.VAR_CONVERGED = 1e-6  # relative difference between document inference iterations needed to stop sooner than VAR_MAX_ITER

        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
        else:
            # set up distributed version
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy(
                    'PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" %
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      numTopics=numTopics,
                                      chunks=chunks,
                                      alpha=alpha,
                                      distributed=False)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" %
                            len(dispatcher.getworkers()))
            except Exception, err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError(
                    "failed to initialize distributed LDA (%s)" % err)
    def __init__(self, corpus=None, numTopics=200, id2word=None, distributed=False, 
                 chunks=None, alpha=None, initMode='random', dtype=numpy.float64):
        """
        `numTopics` is the number of requested latent topics to be extracted from
        the training corpus. 
        
        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic 
        printing.
        
        `initMode` can be either 'random', for a fast random initialization of 
        the model parameters, or 'seeded', for an initialization based on a handful
        of real documents. The 'seeded' mode requires an extra sweep over the entire 
        input corpus, and is thus much slower.

        `alpha` is either None (to be estimated during training) or a number 
        between (0.0, 1.0).
        
        Turn on `distributed` to force distributed computing (see the web tutorial
        on how to set up a cluster).
        
        Example:
        
        >>> lda = LdaModel(corpus, numTopics=100)
        >>> print lda[doc_tfidf] # get topic probability distribution for a document
        >>> lda.addDocuments(corpus2) # update LDA with additional documents
        >>> print lda[doc_tfidf]
        
        """
        # store user-supplied parameters
        self.id2word = id2word
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())
        if self.numTerms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")
        
        self.distributed = bool(distributed)
        self.numTopics = int(numTopics)
        self.state = LdaState()
        self.chunks = chunks
        
        # initialize wordtype/topic counts
        if initMode == 'seeded': # init from corpus (slow)
            self.state.classWord = self.countsFromCorpus(corpus, numInitDocs=2)
        elif initMode == 'random': # init with 1/k+noise
            self.state.classWord = 1.0 / self.numTerms + numpy.random.rand(self.numTopics, self.numTerms) # add noise drawn uniformly from [0, 1)
        else:
            raise NotImplementedError("LDA initialization mode '%s' not supported" % str(initMode))
        self.state.classWord = self.state.classWord.astype(dtype)
        
        # internal algorithm constants
        self.estimate_alpha = alpha is None
        if self.estimate_alpha: # no alpha supplied by user => get some initial estimate
            alpha = 10.0 / numTopics # n / numTopics, as suggested in Steyvers&Griffiths: Probabilistic Topic Models
        self.alpha = min(0.99999, max(0.00001, alpha)) # dirichlet prior; make sure it's within bounds

        # EM training constants
        self.EM_MAX_ITER = 50 # maximum number of EM iterations; usually converges earlier
        self.EM_CONVERGED = 1e-4 # relative difference between two iterations; if lower than this, stop the EM training 
        self.VAR_MAX_ITER = 20 # maximum number of document inference iterations
        self.VAR_CONVERGED = 1e-6 # relative difference between document inference iterations needed to stop sooner than VAR_MAX_ITER
        
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
        else:
            # set up distributed version
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word, numTopics=numTopics, 
                                      chunks=chunks, alpha=alpha, distributed=False)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
            except Exception, err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
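
A minimal usage sketch for the constructor above (an editor's addition, not part of the scraped source). The toy corpus and id2word mapping are invented for illustration and the import path is assumed; LdaModel(corpus, numTopics=...), lda[doc] and lda.addDocuments() come straight from the docstring:

from gensim.models.ldamodel import LdaModel   # assumed import path

id2word = {0: 'cat', 1: 'dog', 2: 'fish'}      # invented toy vocabulary
corpus = [[(0, 2.0), (1, 1.0)],                # three toy documents as sparse
          [(1, 3.0)],                          # (wordId, count) vectors
          [(0, 1.0), (2, 2.0)]]

lda = LdaModel(corpus, numTopics=2, id2word=id2word)   # serial training (distributed=False by default)
print(lda[corpus[0]])                                  # topic distribution of the first document
lda.addDocuments(corpus)                               # online update with more documents

Passing distributed=True instead makes the constructor look up a gensim.lda_dispatcher proxy via Pyro, as in the else branch above, and raise RuntimeError if no dispatcher is reachable.
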
Example #26
0
File: lsimodel.py Project: andremi/gensim
    def __init__(self,
                 corpus=None,
                 numTopics=200,
                 id2word=None,
                 chunks=20000,
                 decay=1.0,
                 distributed=False,
                 onepass=True,
                 power_iters=P2_EXTRA_ITERS,
                 extra_samples=P2_EXTRA_DIMS):
        """
        `numTopics` is the number of requested factors (latent dimensions). 
        
        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary 
        notation. You can also add new training documents, with ``self.addDocuments``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the 
        method `addDocuments` for a description of the `chunks` and `decay` parameters.
        
        Turn `onepass` off to force a multi-pass stochastic algorithm.
        
        `power_iters` and `extra_samples` affect the accuracy of the stochastic
        multi-pass algorithm, which is used either internally (`onepass=True`) or
        as the front-end algorithm (`onepass=False`). Increasing the number of 
        power iterations improves accuracy, but lowers performance. See [2]_ for 
        some hard numbers.

        Turn on `distributed` to enable distributed computing.
        
        Example:
        
        >>> lsi = LsiModel(corpus, numTopics=10)
        >>> print lsi[doc_tfidf] # project some document into LSI space
        >>> lsi.addDocuments(corpus2) # update LSI on additional documents
        >>> print lsi[doc_tfidf]
        
        .. [2] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf
        
        """
        self.id2word = id2word
        self.numTopics = int(numTopics)
        self.chunks = int(chunks)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning(
                    "forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        self.extra_samples, self.power_iters = extra_samples, power_iters

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dictFromCorpus(corpus)
            self.numTerms = len(self.id2word)
        else:
            self.numTerms = 1 + max([-1] + self.id2word.keys())

        self.docs_processed = 0
        self.projection = Projection(self.numTerms, self.numTopics)

        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError(
                    "distributed stochastic LSA not implemented yet; "
                    "run either distributed one-pass, or serial randomized.")
            try:
                import Pyro
                ns = Pyro.naming.locateNS()
                dispatcher = Pyro.core.Proxy(
                    'PYRONAME:gensim.lsi_dispatcher@%s' % ns._pyroUri.location)
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" %
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      numTopics=numTopics,
                                      chunks=chunks,
                                      decay=decay,
                                      distributed=False,
                                      onepass=onepass)
                self.dispatcher = dispatcher
                logger.info("using distributed version with %i workers" %
                            len(dispatcher.getworkers()))
            except Exception, err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)" % err)
                raise RuntimeError(
                    "failed to initialize distributed LSI (%s)" % err)
def stochasticSvd(corpus, rank, num_terms=None, chunks=20000, extra_dims=None, dtype=numpy.float64, eps=1e-6):
    """
    Return U, S -- the left singular vectors and the singular values of the streamed 
    input corpus.
    
    This may actually return fewer than the requested number of top `rank` factors,
    in case the input is of lower rank. Also note that the decomposition is unique
    up to the sign of the left singular vectors (columns of U).
    
    This is a streamed, two-pass algorithm, without power-iterations. In case you can 
    only afford a single pass over the input corpus, set `onepass=True` in LsiModel and 
    avoid using this algorithm.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    
    if num_terms is None:
        logger.warning("number of terms not provided; will scan the corpus (ONE EXTRA PASS, MAY BE SLOW) to determine it")
        num_terms = len(utils.dictFromCorpus(corpus))
        logger.info("found %i terms" % num_terms)
    else:
        num_terms = int(num_terms)
    
    eps = max(float(eps), 1e-9) # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage
    
    # first pass: construct the orthonormal action matrix Q = orth(Y) = orth(A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one 
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype = dtype, shape = (num_terms, samples))
    logger.info("1st pass: constructing %s action matrix" % str(y.shape))
    
    chunker = itertools.groupby(enumerate(corpus), key = lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        # construct the chunk as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
        m, n = chunk.shape
        assert m == num_terms
        assert n <= chunks # the very last chunk of A may be smaller
        logger.debug("multiplying chunk * gauss")
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
        sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr, # y = y + chunk * o
                                chunk.indices, chunk.data, o.ravel(), y.ravel())
        del chunk, o
    
    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    q, r = numpy.linalg.qr(y) # orthonormalize the range
    del y # Y not needed anymore, free up mem
    samples = clipSpectrum(numpy.diag(r), samples, discard = eps)
    qt = q[:, :samples].T.copy() # discard bogus columns, in case Y was rank-deficient
    del q
    
    # second pass: construct the covariance matrix X = B * B.T, where B = Q.T * A
    # again, construct X incrementally, in chunks of `chunks` documents from the streaming 
    # input corpus A, to avoid using O(number of documents) memory
    x = numpy.zeros(shape = (samples, samples), dtype = dtype)
    logger.info("2nd pass: constructing %s covariance matrix" % str(x.shape))
    chunker = itertools.groupby(enumerate(corpus), key = lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
        b = qt * chunk # dense * sparse matrix multiply
        x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
        del chunk, b
    
    # now we're ready to compute decomposition of the small matrix X
    logger.info("computing decomposition of the %s covariance matrix" % str(x.shape))
    u, s, vt = numpy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
    keep = clipSpectrum(s, rank, discard = eps)
    
    logger.info("computing the final decomposition")
    s = numpy.sqrt(s[:keep]) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    u = numpy.dot(qt.T, u[:, :keep]) # go back from left singular vectors of B to left singular vectors of the corpus
    return u.astype(dtype), s.astype(dtype)
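
A short sketch of calling stochasticSvd directly (an editor's addition; the toy corpus is invented and the import path is assumed, while the call signature is the one defined above). Supplying num_terms avoids the extra vocabulary-scanning pass, so only the two streaming passes are made:

from gensim.models.lsimodel import stochasticSvd   # assumed import path

toy_corpus = [[(0, 1.0), (1, 2.0)],    # three documents as sparse (termId, weight) vectors
              [(1, 1.0), (2, 1.0)],
              [(0, 2.0), (2, 1.0)]]
u, s = stochasticSvd(toy_corpus, rank=2, num_terms=3, chunks=2)
print(u.shape)   # (3, k): left singular vectors; k <= rank if the spectrum gets clipped
print(s.shape)   # (k,): the corresponding singular values, largest first
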
Example #28
0
def stochasticSvd(corpus,
                  rank,
                  num_terms=None,
                  chunks=20000,
                  extra_dims=None,
                  dtype=numpy.float64,
                  eps=1e-6):
    """
    Return U, S -- the left singular vectors and the singular values of the streamed 
    input corpus.
    
    This may actually return fewer than the requested number of top `rank` factors,
    in case the input is of lower rank. Also note that the decomposition is unique
    up to the sign of the left singular vectors (columns of U).
    
    This is a streamed, two-pass algorithm, without power-iterations. In case you can 
    only afford a single pass over the input corpus, set `onepass=True` in LsiModel and 
    avoid using this algorithm.

    The decomposition algorithm is based on 
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(
            10, 2 * rank
        )  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)

    if num_terms is None:
        logger.warning(
            "number of terms not provided; will scan the corpus (ONE EXTRA PASS, MAY BE SLOW) to determine it"
        )
        num_terms = len(utils.dictFromCorpus(corpus))
        logger.info("found %i terms" % num_terms)
    else:
        num_terms = int(num_terms)

    eps = max(
        float(eps), 1e-9
    )  # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage

    # first pass: construct the orthonormal action matrix Q = orth(Y) = orth(A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st pass: constructing %s action matrix" % str(y.shape))

    chunker = itertools.groupby(enumerate(corpus),
                                key=lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        # construct the chunk as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
        chunk = matutils.corpus2csc(
            (doc for _, doc in group), num_terms=num_terms,
            dtype=dtype)  # documents = columns of sparse CSC
        m, n = chunk.shape
        assert m == num_terms
        assert n <= chunks  # the very last chunk of A may be smaller
        logger.debug("multiplying chunk * gauss")
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
            dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(
            num_terms,
            n,
            samples,
            chunk.indptr,  # y = y + chunk * o
            chunk.indices,
            chunk.data,
            o.ravel(),
            y.ravel())
        del chunk, o

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    q, r = numpy.linalg.qr(y)  # orthonormalize the range
    del y  # Y not needed anymore, free up mem
    samples = clipSpectrum(numpy.diag(r), samples, discard=eps)
    qt = q[:, :samples].T.copy(
    )  # discard bogus columns, in case Y was rank-deficient
    del q

    # second pass: construct the covariance matrix X = B * B.T, where B = Q.T * A
    # again, construct X incrementally, in chunks of `chunks` documents from the streaming
    # input corpus A, to avoid using O(number of documents) memory
    x = numpy.zeros(shape=(samples, samples), dtype=dtype)
    logger.info("2nd pass: constructing %s covariance matrix" % str(x.shape))
    chunker = itertools.groupby(enumerate(corpus),
                                key=lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        chunk = matutils.corpus2csc((doc for _, doc in group),
                                    num_terms=num_terms,
                                    dtype=dtype)
        b = qt * chunk  # dense * sparse matrix multiply
        x += numpy.dot(
            b, b.T
        )  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
        del chunk, b

    # now we're ready to compute decomposition of the small matrix X
    logger.info("computing decomposition of the %s covariance matrix" %
                str(x.shape))
    u, s, vt = numpy.linalg.svd(
        x
    )  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
    keep = clipSpectrum(s, rank, discard=eps)

    logger.info("computing the final decomposition")
    s = numpy.sqrt(
        s[:keep]
    )  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    u = numpy.dot(
        qt.T, u[:, :keep]
    )  # go back from left singular vectors of B to left singular vectors of the corpus
    return u.astype(dtype), s.astype(dtype)
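
Since the oversampled action matrix captures the full range of a tiny, low-rank corpus, the streamed result can be sanity-checked against a plain dense SVD (an editor's sketch; the toy corpus is invented, matutils.corpus2csc and numpy.linalg.svd are the same routines used in the code above, and the import paths are assumptions):

import numpy
from gensim import matutils                         # module used by the code above
from gensim.models.lsimodel import stochasticSvd    # assumed import path

toy_corpus = [[(0, 1.0), (1, 2.0)],
              [(1, 1.0), (2, 1.0)],
              [(0, 2.0), (2, 1.0)]]
u, s = stochasticSvd(toy_corpus, rank=3, num_terms=3, chunks=10)

# dense reference: the same 3x3 term-document matrix, decomposed exactly
dense = matutils.corpus2csc(toy_corpus, num_terms=3).toarray()
s_ref = numpy.linalg.svd(dense, compute_uv=False)   # singular values only, sorted descending
print(numpy.allclose(s, s_ref[:len(s)]))            # should print True (up to numerical noise)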