def saveCorpus(fname, corpus, id2word = None):
    """
    Save a corpus in the List-of-words format.
    """
    if id2word is None:
        logging.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dictFromCorpus(corpus)

    logging.info("storing corpus in List-Of-Words format: %s" % fname)
    truncated = 0
    fout = open(fname, 'w')
    fout.write('%i\n' % len(corpus))
    for doc in corpus:
        words = []
        for wordId, value in doc:
            if abs(int(value) - value) > 1e-6:
                truncated += 1
            words.extend([str(id2word[wordId])] * int(value))
        fout.write('%s\n' % ' '.join(words))
    fout.close()

    if truncated:
        logging.warning("List-of-words format can only save vectors with "
                        "integer entries; %i float entries were truncated to integer value" % truncated)

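# --- illustration only, not part of the original module ---
# A minimal sketch of what saveCorpus above produces, using a hypothetical toy
# corpus and id2word mapping: the first line holds the number of documents, and
# each following line repeats every word `int(value)` times.
toy_id2word = {0: 'human', 1: 'interface', 2: 'computer'}
toy_corpus = [[(0, 1), (2, 2)], [(1, 1)]]  # bag-of-words: (wordId, count) pairs
saveCorpus('/tmp/toy_corpus.low', toy_corpus, id2word=toy_id2word)
# resulting file contents:
#   2
#   human computer computer
#   interface
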
def saveCorpus(fname, corpus, id2word=None):
    """
    Save a corpus in the LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logging.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dictFromCorpus(corpus)
        numTerms = len(id2word)
    else:
        numTerms = 1 + max([-1] + id2word.keys())

    logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
    with open(fname, 'w') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            fout.write("%i %s\n" % (len(doc), ' '.join("%i:%s" % p for p in doc)))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fnameVocab = fname + '.vocab'
    logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
    with open(fnameVocab, 'w') as fout:
        for featureId in xrange(numTerms):
            fout.write("%s\n" % utils.toUtf8(id2word.get(featureId, '---')))

    return offsets

def saveCorpus(fname, corpus, id2word=None):
    """
    Save a corpus in the List-of-words format.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logging.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dictFromCorpus(corpus)

    logging.info("storing corpus in List-Of-Words format: %s" % fname)
    truncated = 0
    offsets = []
    with open(fname, 'w') as fout:
        fout.write('%i\n' % len(corpus))
        for doc in corpus:
            words = []
            for wordId, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([str(id2word[wordId])] * int(value))
            offsets.append(fout.tell())
            fout.write('%s\n' % ' '.join(words))

    if truncated:
        logging.warning("List-of-words format can only save vectors with "
                        "integer elements; %i float entries were truncated to integer value" % truncated)

    return offsets

def saveCorpus(fname, corpus, id2word = None):
    """
    Save a corpus in Blei's LDA-C format.

    There are actually two files saved: fname and fname.vocab, where
    fname.vocab is the vocabulary file.
    """
    if id2word is None:
        logging.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dictFromCorpus(corpus)
        numTerms = len(id2word)
    else:
        numTerms = 1 + max([-1] + id2word.keys())

    logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
    fout = open(fname, 'w')
    for doc in corpus:
        fout.write("%i %s\n" % (len(doc), ' '.join("%i:%f" % p for p in doc)))
    fout.close()

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fnameVocab = fname + '.vocab'
    logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
    fout = open(fnameVocab, 'w')
    for featureId in xrange(numTerms):
        fout.write("%s\n" % id2word.get(featureId, '---'))
    fout.close()

def saveCorpus(fname, corpus, id2word = None):
    """
    Save a corpus in Blei's LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.
    """
    if id2word is None:
        logging.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dictFromCorpus(corpus)
        numTerms = len(id2word)
    else:
        numTerms = 1 + max([-1] + id2word.keys())

    logging.info("storing corpus in Blei's LDA-C format: %s" % fname)
    fout = open(fname, 'w')
    for doc in corpus:
        doc = list(doc)
        fout.write("%i %s\n" % (len(doc), ' '.join("%i:%s" % p for p in doc)))
    fout.close()

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fnameVocab = fname + '.vocab'
    logging.info("saving vocabulary of %i words to %s" % (numTerms, fnameVocab))
    fout = open(fnameVocab, 'w')
    for featureId in xrange(numTerms):
        fout.write("%s\n" % utils.toUtf8(id2word.get(featureId, '---')))
    fout.close()

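# --- illustration only, not part of the original module ---
# The LDA-C line written by saveCorpus above for a hypothetical document
# [(0, 1), (2, 2), (5, 1)]: the number of distinct terms, followed by
# wordId:count pairs.
doc = [(0, 1), (2, 2), (5, 1)]
line = "%i %s" % (len(doc), ' '.join("%i:%s" % p for p in doc))
assert line == "3 0:1 2:2 5:1"
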
def initialize(self, corpus):
    """
    Compute inverse document weights, which will be used to modify term
    frequencies for documents.
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    logging.info("calculating IDF weights over %i documents" % len(corpus))
    idfs = {}
    numNnz = 0
    for docNo, bow in enumerate(corpus):
        if docNo % 5000 == 0:
            logging.info("PROGRESS: processing document %i/%i" % (docNo, len(corpus)))
        numNnz += len(bow)
        for termId, termCount in bow:
            idfs[termId] = idfs.get(termId, 0) + 1
    idfs = dict((termId, math.log(1.0 * (docNo + 1) / docFreq, 2))  # the IDF weight formula
                for termId, docFreq in idfs.iteritems())

    self.idfs = idfs

    # keep some stats about the training corpus
    self.numDocs = len(corpus)
    self.numNnz = numNnz

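# --- illustration only, not part of the original module ---
# The IDF weight formula used above, spelled out on toy numbers: a term that
# appears in docFreq documents out of numDocs gets weight log2(numDocs / docFreq).
# (In initialize, numDocs corresponds to docNo + 1 after the loop finishes.)
import math
numDocs, docFreq = 8, 2
idf = math.log(1.0 * numDocs / docFreq, 2)  # log2(8 / 2) == 2.0
assert abs(idf - 2.0) < 1e-9
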
def initialize(self, corpus, chunks = 100, keepDecomposition = False, dtype = numpy.float64):
    """
    Run SVD decomposition on the corpus. This will define the latent space into
    which terms and documents will be mapped.

    The SVD is created incrementally, in blocks of `chunks` documents. In the end,
    a `self.projection` matrix is constructed that can be used to transform
    documents into the latent space. The `U, S, V` decomposition itself is
    discarded, unless `keepDecomposition` is True, in which case it is stored
    in `self.u`, `self.s` and `self.v`.

    `dtype` dictates precision used for intermediate computations; the final
    projection will however always be of type numpy.float32.

    The algorithm is adapted from:
    **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    # initialize decomposition (zero documents so far)
    self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics)), dtype = dtype)
    self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)), dtype = dtype)
    #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)), dtype = dtype)
    self.v = None

    # do the actual work -- perform iterative singular value decomposition.
    # this is done by sequentially updating SVD with `chunks` new documents
    chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
    for chunkNo, (key, group) in enumerate(chunker):
        # convert the chunk of sparse documents to full vectors
        docs = [matutils.sparse2full(doc, self.numTerms) for docNo, doc in group]
        # self.svdAddCols(docs, reorth = chunkNo % 100 == 99)  # reorthogonalize once in every "100*chunks" documents
        self.svdAddCols(docs, reorth = False)
        logging.info("processed documents up to #%s" % docNo)

    # calculate projection needed to get document-topic matrix from term-document matrix.
    #
    # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u^-1 * x,
    # so the projection is self.s^-1 * self.u^-1.
    #
    # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
    # we pre-multiply v * s (ie., scale axes by singular values), and return
    # that directly as the representation of `x` in LSI space.
    #
    # this conveniently simplifies to lsi[x] = self.u.T * x, so the projection is
    # just self.u.T
    #
    # note that neither `v` (the right singular vectors) nor `s` (the singular
    # values) are used at all in the transformation
    self.projection = self.u.T.astype(numpy.float32).copy('C')

    if not keepDecomposition:
        # once we have the projection stored in self, discard u*s*v decomposition to free up memory
        del self.u, self.v

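# --- illustration only, not part of the original module ---
# A sketch of how the `self.projection` computed above is applied to a new
# bag-of-words document. It assumes a trained model `lsi` from the class above
# and a sparse vector `bow`; the helper name `lsi_vector` is made up for this sketch.
import numpy
from gensim import matutils

def lsi_vector(lsi, bow):
    full = matutils.sparse2full(bow, lsi.numTerms)  # densify the sparse document
    proj = numpy.asarray(lsi.projection)            # shape (numTopics, numTerms)
    topic_weights = numpy.dot(proj, full)           # lsi[x] = u.T * x
    return list(enumerate(topic_weights))
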
def __init__(self, corpus = None, id2word = None, numTopics = 200, extraDims = 10, chunks = 100, dtype = numpy.float64):
    """
    `numTopics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.addDocuments``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    `extraDims` is the number of extra dimensions that will be internally
    computed (ie. `numTopics + extraDims`) to improve numerical properties of
    the SVD algorithm. These extra dimensions will be eventually chopped off
    for the final projection. Set to 0 to save memory; set to ~10 to 2*numTopics
    for increased SVD precision.

    If you specify a `corpus`, it will be used to train the model. See the
    method `addDocuments` for a description of the `chunks` and `decay` parameters.

    The algorithm is based on
    **Brand, 2006: Fast low-rank modifications of the thin singular value decomposition**.

    Example:

    >>> lsi = LsiModel(corpus, numTopics = 10)
    >>> print lsi[doc_tfidf]
    >>> lsi.addDocuments(corpus2)  # update LSI on additional documents
    >>> print lsi[doc_tfidf]

    """
    self.id2word = id2word
    self.numTopics = numTopics  # number of latent topics
    self.extraDims = extraDims
    self.dtype = dtype

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    self.projection = numpy.asmatrix(numpy.zeros((self.numTopics, self.numTerms), dtype = dtype))

    self.u = None
    self.s = numpy.asmatrix(numpy.zeros((self.numTopics + self.extraDims, self.numTopics + self.extraDims)), dtype = dtype)
    self.v = None

    if corpus is not None:
        self.addDocuments(corpus, chunks = chunks, updateProjection = True)

def initialize(self, corpus):
    """
    Initialize the random projection matrix.
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    # Now construct the projection matrix itself.
    # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
    # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
    randmat = 1 - 2 * numpy.random.binomial(1, 0.5, (self.numTopics, self.numTerms))  # convert from 0/1 to +1/-1
    self.projection = numpy.asmatrix(randmat, dtype = numpy.float32)  # convert from int32 to floats, for faster multiplications

def initialize(self, corpus):
    """
    Initialize the random projection matrix.
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    # Now construct the projection matrix itself.
    #
    # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
    # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
    tmp = numpy.random.binomial(1, 0.5, (self.numTopics, self.numTerms))  # FIXME temporary array unnecessarily big (int32 -> int8)
    self.projection = numpy.asmatrix(1 - 2 * tmp.astype(numpy.int8))  # convert from 0/1 to +1/-1

def initialize(self, corpus):
    """
    Initialize the random projection matrix.
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    shape = self.numTopics, self.numTerms
    logger.info("constructing %s random matrix" % str(shape))
    # Now construct the projection matrix itself.
    # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
    # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
    randmat = 1 - 2 * numpy.random.binomial(1, 0.5, shape)  # convert from 0/1 to +1/-1
    self.projection = numpy.asfortranarray(randmat, dtype=numpy.float32)  # convert from int32 to floats, for faster multiplications

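# --- illustration only, not part of the original module ---
# A standalone sketch of the Achlioptas-style +1/-1 projection built above, for
# a hypothetical 2 topics x 5 terms shape, applied to one dense document vector.
import numpy
numpy.random.seed(42)                                    # reproducible toy example
randmat = 1 - 2 * numpy.random.binomial(1, 0.5, (2, 5))  # entries drawn uniformly from {+1, -1}
projection = numpy.asfortranarray(randmat, dtype=numpy.float32)
doc = numpy.array([1.0, 0.0, 2.0, 0.0, 1.0], dtype=numpy.float32)  # dense 5-term document
topics = numpy.dot(projection, doc)                      # project down to 2 dimensions
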
def __init__(self, corpus, id2word = None, numTopics = 200, alpha = None, initMode = 'random'):
    """
    Initialize the model based on corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `numTopics` is the number of requested topics.

    `alpha` is either None (to be estimated during training) or a number
    between (0.0, 1.0).
    """
    # store user-supplied parameters
    self.id2word = id2word
    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())
    self.numTopics = numTopics  # number of latent topics

    # internal constants; can be manually changed after having called this constructor and before calling `initialize()`
    self.ESTIMATE_ALPHA = alpha is None
    if alpha is None:  # no alpha supplied by user => get some initial estimate
        alpha = 10.0 / numTopics  # initial estimate; cf. the 50 / numTopics suggested in Steyvers & Griffiths: Probabilistic Topic Models
    self.alpha = min(0.99999, max(0.00001, alpha))  # dirichlet prior; make sure it's within bounds

    # set EM training constants
    self.EM_MAX_ITER = 50  # maximum number of EM iterations; usually converges much earlier
    self.EM_CONVERGED = 0.0001  # relative difference between two iterations; if lower than this, stop the EM training
    self.VAR_MAX_ITER = 20  # maximum number of document inference iterations
    self.VAR_CONVERGED = 0.000001  # relative difference between document inference iterations needed to stop sooner than VAR_MAX_ITER

    if corpus is not None:
        self.initialize(corpus, initMode)

def __init__(self, corpus=None, numTopics=200, id2word=None, chunks=20000,
             decay=1.0, distributed=False, onepass=False):
    """
    `numTopics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.addDocuments``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `addDocuments` for a description of the `chunks` and `decay` parameters.

    If your document stream is one-pass only (the stream cannot be repeated),
    turn on `onepass` to force a single pass SVD algorithm (slower).

    Turn on `distributed` to enforce distributed computing (only makes sense
    if `onepass` is set at the same time, too).

    Example:

    >>> lsi = LsiModel(corpus, numTopics=10)
    >>> print lsi[doc_tfidf]  # project some document into LSI space
    >>> lsi.addDocuments(corpus2)  # update LSI on additional documents
    >>> print lsi[doc_tfidf]

    """
    self.id2word = id2word
    self.numTopics = int(numTopics)
    self.chunks = int(chunks)
    self.decay = float(decay)
    if distributed:
        if not onepass:
            logger.warning("forcing the one-pass algorithm for distributed LSA")
            onepass = True
    self.onepass = onepass

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    self.docs_processed = 0
    self.projection = Projection(self.numTerms, self.numTopics)

    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        if not onepass:
            raise NotImplementedError("distributed stochastic LSA not implemented yet; "
                                      "run either distributed one-pass, or serial randomized.")
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.dispatcher@%s' % ns._pyroUri.location)
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, numTopics=numTopics, chunks=chunks,
                                  decay=decay, distributed=False, onepass=onepass)
            self.dispatcher = dispatcher
            logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
        except Exception, err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)" % err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

def __init__(self, corpus=None, numTopics=100, id2word=None, distributed=False,
             chunks=10000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
    """
    `numTopics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters on document-topic (theta) and
    topic-word (lambda) distributions. Both default to a symmetric 1.0/numTopics
    (but can be set to a vector, for asymmetric priors).

    Turn on `distributed` to force distributed computing (see the web tutorial
    on how to set up a cluster of machines for gensim).

    Example:

    >>> lda = LdaModel(corpus, numTopics=100)
    >>> print lda[doc_bow]  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print lda[doc_bow]

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    if self.numTerms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.numTopics = int(numTopics)
    self.chunks = chunks
    self.decay = decay
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every

    if alpha is None:
        self.alpha = 1.0 / numTopics
    else:
        self.alpha = alpha
    if eta is None:
        self.eta = 1.0 / numTopics
    else:
        self.eta = eta

    # VB constants
    self.VAR_MAXITER = 50
    self.VAR_THRESH = 0.001

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        # set up distributed version
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=id2word, numTopics=numTopics, chunks=chunks,
                                  alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception, err:
            logger.error("failed to initialize distributed LDA (%s)" % err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

def __init__(self, corpus=None, numTopics=200, id2word=None, chunks=20000,
             decay=1.0, distributed=False, onepass=False):
    """
    `numTopics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.addDocuments``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `addDocuments` for a description of the `chunks` and `decay` parameters.

    If your document stream is one-pass only (the stream cannot be repeated),
    turn on `onepass` to force a single pass SVD algorithm (slower).

    Turn on `distributed` to force distributed computing.

    Example:

    >>> lsi = LsiModel(corpus, numTopics=10)
    >>> print lsi[doc_tfidf]  # project some document into LSI space
    >>> lsi.addDocuments(corpus2)  # update LSI on additional documents
    >>> print lsi[doc_tfidf]

    """
    self.id2word = id2word
    self.numTopics = int(numTopics)
    self.chunks = int(chunks)
    self.decay = float(decay)
    if distributed:
        if not onepass:
            logger.warning("forcing the one-pass algorithm for distributed LSA")
            onepass = True
    self.onepass = onepass

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    self.docs_processed = 0
    self.projection = Projection(self.numTerms, self.numTopics)

    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        if not onepass:
            raise NotImplementedError("distributed stochastic LSA not implemented yet; "
                                      "run either distributed one-pass, or serial randomized.")
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lsi_dispatcher@%s' % ns._pyroUri.location)
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word = self.id2word, numTopics = numTopics, chunks = chunks,
                                  decay = decay, distributed = False, onepass = onepass)
            self.dispatcher = dispatcher
            logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
        except Exception, err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)" % err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

def __init__(self, corpus=None, numTopics=100, id2word=None, distributed=False,
             chunks=1000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
    """
    `numTopics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters on document-topic (theta) and
    topic-word (lambda) distributions. Both default to a symmetric 1.0/numTopics
    (but can be set to a vector, for asymmetric priors).

    Turn on `distributed` to force distributed computing (see the web tutorial
    on how to set up a cluster of machines for gensim).

    Example:

    >>> lda = LdaModel(corpus, numTopics=100)
    >>> print lda[doc_bow]  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print lda[doc_bow]

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    if self.numTerms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.numTopics = int(numTopics)
    self.chunks = chunks
    self.decay = decay
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every

    if alpha is None:
        self.alpha = 1.0 / numTopics
    else:
        self.alpha = alpha
    if eta is None:
        self.eta = 1.0 / numTopics
    else:
        self.eta = eta

    # VB constants
    self.VAR_MAXITER = 50
    self.VAR_THRESH = 0.001

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        # set up distributed version
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=id2word, numTopics=numTopics, chunks=chunks,
                                  alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception, err:
            logger.error("failed to initialize distributed LDA (%s)" % err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

def __init__(self, corpus=None, id2word=None, numTopics=200, chunks=None, decay=1.0, serial_only=None):
    """
    `numTopics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.addDocuments``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `addDocuments` for a description of the `chunks` and `decay` parameters.

    The algorithm will automatically try to find active nodes on other
    computers and run in a distributed manner; if this fails, it falls back
    to serial mode (single core). To suppress distributed computing, set the
    `serial_only` constructor parameter to True.

    Example:

    >>> lsi = LsiModel(corpus, numTopics = 10)
    >>> print lsi[doc_tfidf]
    >>> lsi.addDocuments(corpus2)  # update LSI on additional documents
    >>> print lsi[doc_tfidf]

    """
    self.id2word = id2word
    self.numTopics = numTopics  # number of latent topics
    if chunks is None:
        # by default, proceed in chunks as big as number of topics, to improve accuracy
        self.chunks = max(numTopics, 100)
    else:
        self.chunks = chunks
    self.decay = decay

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    self.docs_processed = 0
    self.projection = Projection(self.numTerms, self.numTopics)

    if serial_only:
        logger.info("using slave LSI version on this node")
        self.dispatcher = None
    else:
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.dispatcher@%s' % ns._pyroUri.location)
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, numTopics=numTopics,
                                  chunks=chunks, decay=decay, serial_only=True)
            self.dispatcher = dispatcher
            logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
        except Exception, err:
            if serial_only is not None:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)" % err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
            else:
                # user didn't request distributed specifically; just let him know we're running in serial
                logger.info("distributed LSI not available, running LSI in serial mode (%s)" % err)
                self.dispatcher = None

def __init__(self, corpus = None, id2word = None, numTopics = 200, chunks = 10000, decay = 1.0, serial_only = None):
    """
    `numTopics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.addDocuments``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `addDocuments` for a description of the `chunks` and `decay` parameters.

    The algorithm will automatically try to find active nodes on other
    computers and run in a distributed manner; if this fails, it falls back
    to serial mode (single core). To suppress distributed computing, set the
    `serial_only` constructor parameter to True.

    Example:

    >>> lsi = LsiModel(corpus, numTopics = 10)
    >>> print lsi[doc_tfidf]
    >>> lsi.addDocuments(corpus2)  # update LSI on additional documents
    >>> print lsi[doc_tfidf]

    """
    self.id2word = id2word
    self.numTopics = numTopics  # number of latent topics
    self.chunks = chunks
    self.decay = decay

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    self.docs_processed = 0
    self.projection = Projection(self.numTerms, self.numTopics)

    if serial_only:
        logger.info("using slave LSI version on this node")
        self.dispatcher = None
    else:
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.dispatcher@%s' % ns._pyroUri.location)
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word = self.id2word, numTopics = numTopics, chunks = chunks,
                                  decay = decay, serial_only = True)
            self.dispatcher = dispatcher
            logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
        except Exception, err:
            if serial_only is not None:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)" % err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)
            else:
                # user didn't request distributed specifically; just let him know we're running in serial
                logger.info("distributed LSI not available, running LSI in serial mode (%s)" % err)
                self.dispatcher = None

def __init__(self, corpus=None, numTopics=200, id2word=None, distributed=False,
             chunks=None, alpha=None, initMode='random', dtype=numpy.float64):
    """
    `numTopics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `initMode` can be either 'random', for a fast random initialization of
    the model parameters, or 'seeded', for an initialization based on a handful
    of real documents. The 'seeded' mode requires an extra sweep over the entire
    input corpus, and is thus much slower.

    `alpha` is either None (to be estimated during training) or a number
    between (0.0, 1.0).

    Turn on `distributed` to force distributed computing (see the web tutorial
    on how to set up a cluster).

    Example:

    >>> lda = LdaModel(corpus, numTopics=100)
    >>> print lda[doc_tfidf]  # get topic probability distribution for a document
    >>> lda.addDocuments(corpus2)  # update LDA with additional documents
    >>> print lda[doc_tfidf]

    """
    # store user-supplied parameters
    self.id2word = id2word
    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())
    if self.numTerms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")
    self.distributed = bool(distributed)
    self.numTopics = int(numTopics)
    self.state = LdaState()
    self.chunks = chunks

    # initialize wordtype/topic counts
    if initMode == 'seeded':  # init from corpus (slow)
        self.state.classWord = self.countsFromCorpus(corpus, numInitDocs=2)
    elif initMode == 'random':  # init with 1/k+noise
        self.state.classWord = 1.0 / self.numTerms + numpy.random.rand(self.numTopics, self.numTerms)  # add noise from <0, 1)
    else:
        raise NotImplementedError("LDA initialization mode '%s' not supported" % str(initMode))
    self.state.classWord = self.state.classWord.astype(dtype)

    # internal algorithm constants
    self.estimate_alpha = alpha is None
    if self.estimate_alpha:  # no alpha supplied by user => get some initial estimate
        alpha = 10.0 / numTopics  # n / numTopics, as suggested in Steyvers&Griffiths: Probabilistic Topic Models
    self.alpha = min(0.99999, max(0.00001, alpha))  # dirichlet prior; make sure it's within bounds

    # EM training constants
    self.EM_MAX_ITER = 50  # maximum number of EM iterations; usually converges earlier
    self.EM_CONVERGED = 1e-4  # relative difference between two iterations; if lower than this, stop the EM training
    self.VAR_MAX_ITER = 20  # maximum number of document inference iterations
    self.VAR_CONVERGED = 1e-6  # relative difference between document inference iterations needed to stop sooner than VAR_MAX_ITER

    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
    else:
        # set up distributed version
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lda_dispatcher@%s' % ns._pyroUri.location)
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, numTopics=numTopics, chunks=chunks,
                                  alpha=alpha, distributed=False)
            self.dispatcher = dispatcher
            logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
        except Exception, err:
            logger.error("failed to initialize distributed LDA (%s)" % err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

def __init__(self, corpus=None, numTopics=200, id2word=None, chunks=20000,
             decay=1.0, distributed=False, onepass=True,
             power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS):
    """
    `numTopics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.addDocuments``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `addDocuments` for a description of the `chunks` and `decay` parameters.

    Turn `onepass` off to force a multi-pass stochastic algorithm.

    `power_iters` and `extra_samples` affect the accuracy of the stochastic
    multi-pass algorithm, which is used either internally (`onepass=True`) or
    as the front-end algorithm (`onepass=False`). Increasing the number of
    power iterations improves accuracy, but lowers performance. See [2]_ for
    some hard numbers.

    Turn on `distributed` to enable distributed computing.

    Example:

    >>> lsi = LsiModel(corpus, numTopics=10)
    >>> print lsi[doc_tfidf]  # project some document into LSI space
    >>> lsi.addDocuments(corpus2)  # update LSI on additional documents
    >>> print lsi[doc_tfidf]

    .. [2] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf

    """
    self.id2word = id2word
    self.numTopics = int(numTopics)
    self.chunks = int(chunks)
    self.decay = float(decay)
    if distributed:
        if not onepass:
            logger.warning("forcing the one-pass algorithm for distributed LSA")
            onepass = True
    self.onepass = onepass
    self.extra_samples, self.power_iters = extra_samples, power_iters

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dictFromCorpus(corpus)
        self.numTerms = len(self.id2word)
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    self.docs_processed = 0
    self.projection = Projection(self.numTerms, self.numTopics)

    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        if not onepass:
            raise NotImplementedError("distributed stochastic LSA not implemented yet; "
                                      "run either distributed one-pass, or serial randomized.")
        try:
            import Pyro
            ns = Pyro.naming.locateNS()
            dispatcher = Pyro.core.Proxy('PYRONAME:gensim.lsi_dispatcher@%s' % ns._pyroUri.location)
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, numTopics=numTopics, chunks=chunks,
                                  decay=decay, distributed=False, onepass=onepass)
            self.dispatcher = dispatcher
            logger.info("using distributed version with %i workers" % len(dispatcher.getworkers()))
        except Exception, err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)" % err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

def stochasticSvd(corpus, rank, num_terms=None, chunks=20000, extra_dims=None, dtype=numpy.float64, eps=1e-6):
    """
    Return U, S -- the left singular vectors and the singular values of the
    streamed input corpus.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. Also note that the decomposition is unique
    up to the sign of the left singular vectors (columns of U).

    This is a streamed, two-pass algorithm, without power-iterations. In case you
    can only afford a single pass over the input corpus, set `onepass=True` in
    LsiModel and avoid using this algorithm.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)

    if num_terms is None:
        logger.warning("number of terms not provided; will scan the corpus (ONE EXTRA PASS, MAY BE SLOW) to determine it")
        num_terms = len(utils.dictFromCorpus(corpus))
        logger.info("found %i terms" % num_terms)
    else:
        num_terms = int(num_terms)

    eps = max(float(eps), 1e-9)  # must ignore near-zero eigenvalues (probably numerical error); the associated eigenvectors are typically unstable/garbage

    # first pass: construct the orthonormal action matrix Q = orth(Y) = orth(A * O)
    # build Y in blocks of `chunks` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype = dtype, shape = (num_terms, samples))
    logger.info("1st pass: constructing %s action matrix" % str(y.shape))

    chunker = itertools.groupby(enumerate(corpus), key = lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        # construct the chunk as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense (num_terms x chunks) matrix!
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
        m, n = chunk.shape
        assert m == num_terms
        assert n <= chunks  # the very last chunk of A may be smaller
        logger.debug("multiplying chunk * gauss")
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(num_terms, n, samples, chunk.indptr,  # y = y + chunk * o
                                chunk.indices, chunk.data, o.ravel(), y.ravel())
        del chunk, o

    logger.info("orthonormalizing %s action matrix" % str(y.shape))
    q, r = numpy.linalg.qr(y)  # orthonormalize the range
    del y  # Y not needed anymore, free up mem
    samples = clipSpectrum(numpy.diag(r), samples, discard = eps)
    qt = q[:, :samples].T.copy()  # discard bogus columns, in case Y was rank-deficient
    del q

    # second pass: construct the covariance matrix X = B * B.T, where B = Q.T * A
    # again, construct X incrementally, in chunks of `chunks` documents from the streaming
    # input corpus A, to avoid using O(number of documents) memory
    x = numpy.zeros(shape = (samples, samples), dtype = dtype)
    logger.info("2nd pass: constructing %s covariance matrix" % str(x.shape))
    chunker = itertools.groupby(enumerate(corpus), key = lambda (docno, doc): docno / chunks)
    for chunk_no, (key, group) in enumerate(chunker):
        logger.info('PROGRESS: at document #%i' % (chunk_no * chunks))
        chunk = matutils.corpus2csc((doc for _, doc in group), num_terms=num_terms, dtype=dtype)
        b = qt * chunk  # dense * sparse matrix multiply
        x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
        del chunk, b

    # now we're ready to compute decomposition of the small matrix X
    logger.info("computing decomposition of the %s covariance matrix" % str(x.shape))
    u, s, vt = numpy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
    keep = clipSpectrum(s, rank, discard = eps)

    logger.info("computing the final decomposition")
    s = numpy.sqrt(s[:keep])  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    u = numpy.dot(qt.T, u[:, :keep])  # go back from left singular vectors of B to left singular vectors of the corpus
    return u.astype(dtype), s.astype(dtype)

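# --- illustration only, not part of the original module ---
# A usage sketch of stochasticSvd above on a tiny bag-of-words corpus (5 terms),
# asking for 2 factors. `num_terms` is passed explicitly, which avoids the extra
# vocabulary-scanning pass warned about above; `extra_dims` keeps the oversampling
# small for this toy input. The toy corpus is made up for this sketch.
toy_corpus = [
    [(0, 1.0), (1, 1.0)],
    [(1, 1.0), (2, 1.0), (3, 1.0)],
    [(0, 1.0), (3, 2.0), (4, 1.0)],
]
u, s = stochasticSvd(toy_corpus, rank=2, num_terms=5, chunks=2, extra_dims=2)
# u: left singular vectors of the term-document matrix, shape (5, k) with k <= 2;
# s: the corresponding singular values.
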