def initialize(self, corpus, chunks = 100, keepDecomposition = False):
    """
    Run SVD decomposition on the corpus. This will define the latent space into
    which terms and documents will be mapped.

    The SVD is created incrementally, in blocks of `chunks` documents. In the end,
    a `self.projection` matrix is constructed that can be used to transform documents
    into the latent space. The `U, S, V` decomposition itself is discarded, unless
    `keepDecomposition` is True, in which case it is stored in `self.u`, `self.s`
    and `self.v`.

    The algorithm is adapted from:
    **M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition**
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        maxId = -1
        for document in corpus:
            maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
        self.numTerms = 1 + maxId
        self.id2word = dict(zip(xrange(self.numTerms), xrange(self.numTerms)))
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    # initialize decomposition (zero documents so far)
    self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics))) # leave default numeric type (=double)
    self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)))
    #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)))
    self.v = None

    # do the actual work -- perform iterative singular value decomposition
    # this is done by sequentially updating SVD with `chunks` new documents
    chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
    for chunkNo, (key, group) in enumerate(chunker):
        # convert the chunk of documents to vectors
        docs = [matutils.doc2vec(doc, self.numTerms) for docNo, doc in group]
#        self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
        self.svdAddCols(docs, reorth = False)
        logging.info("processed documents up to #%s" % docNo)

    # calculate projection needed to get document-topic matrix from term-document matrix.
    #
    # the way to represent a vector `x` in latent space is lsi[x] = v = self.s^-1 * self.u^-1 * x,
    # so the projection is self.s^-1 * self.u^-1.
    #
    # the way to compare two documents `x1`, `x2` is to compute v1 * self.s^2 * v2.T, so
    # we pre-multiply v * s (ie., scale axes by singular values), and return
    # that directly as the representation of `x` in LSI space.
    #
    # this conveniently simplifies to lsi[x] = self.u.T * x, so the projection is
    # just self.u.T
    #
    # note that neither `v` (the right singular vectors) nor `s` (the singular
    # values) are used at all in the transformation
    self.projection = self.u.T

    if not keepDecomposition:
        # once we have the projection stored in self, discard u*s*v decomposition to free up memory
        del self.u, self.v
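# --- usage sketch (illustration only, not part of the original module) -----
# Shows how initialize() is typically reached through the LsiModel constructor,
# mirroring the tests further below. The tiny bag-of-words corpus is made up,
# and the plain `import lsimodel` is an assumption about how the module is laid
# out on the path.
import lsimodel

corpus = [[(0, 1.0), (1, 1.0)],   # each document is a list of (termId, weight) 2-tuples
          [(1, 1.0), (2, 2.0)],
          [(0, 1.0), (2, 1.0)]]
model = lsimodel.LsiModel(corpus, numTopics = 2)   # builds the incremental SVD above
print model[corpus[0]]                             # fold one document into the latent space
# -> list of (topicId, topicValue) 2-tuples; see __getitem__ below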
def __getitem__(self, bow):
    """
    Return topic distribution, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.
    """
    vec = matutils.doc2vec(bow, self.numTerms)
    vec.shape = (self.numTerms, 1)
    topicDist = self.projection * vec
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
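# --- worked example of the fold-in above (illustration only) ----------------
# With the projection from initialize() set to u.T, folding a document `x` into
# the latent space is just a (numTopics x numTerms) * (numTerms x 1) matrix
# product. The numbers below are made up purely to show the shapes involved.
import numpy

u = numpy.matrix([[0.6, 0.1],
                  [0.5, -0.7],
                  [0.6, 0.7]])     # term-topic matrix, shape (numTerms=3, numTopics=2)
projection = u.T                   # shape (2, 3), as set at the end of initialize()

vec = numpy.matrix([[1.0], [0.0], [2.0]])   # a dense document vector, shape (3, 1)
topicDist = projection * vec                # shape (2, 1): one value per topic
print [(topicId, float(val)) for topicId, val in enumerate(topicDist)
       if numpy.isfinite(val) and not numpy.allclose(val, 0.0)]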
def testTransform(self):
    # create the transformation model
    model = ldamodel.LdaModel(self.corpus, numTopics = 2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]

    vec = matutils.doc2vec(transformed, 2) # convert to dense vector, for easier equality tests
    expected = [0.0, 1.0]
    self.assertTrue(numpy.allclose(sorted(vec), sorted(expected))) # must contain the same values, up to re-ordering
def testTransform(self):
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, numTopics = 2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]

    vec = matutils.doc2vec(transformed, 2) # convert to dense vector, for easier equality tests
    expected = [0.1973928, 0.05591352]
    self.assertTrue(numpy.allclose(abs(vec), expected)) # transformed entries must be equal up to sign
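# --- why the tests compare absolute values (illustration only) ---------------
# The SVD is only determined up to a sign flip of matching singular-vector
# pairs: flipping u_i together with v_i leaves u * s * v unchanged, so a correct
# LSI implementation may legitimately return either sign for each topic
# coordinate. The toy matrix below is made up just to demonstrate this.
import numpy

a = numpy.array([[1.0, 0.0, 2.0],
                 [0.0, 3.0, 1.0]])
u, s, vt = numpy.linalg.svd(a, full_matrices = False)

flip = numpy.diag([-1.0, 1.0])                 # flip the sign of the first singular pair
u2, vt2 = numpy.dot(u, flip), numpy.dot(flip, vt)

# both factorizations reconstruct `a` exactly
assert numpy.allclose(numpy.dot(u * s, vt), a)
assert numpy.allclose(numpy.dot(u2 * s, vt2), a)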
def testTransform(self):
    # create the transformation model
    model = lsimodel.LsiModel(self.corpus, numTopics = 2)

    # transform one document
    doc = list(self.corpus)[0]
    transformed = model[doc]

    vec = matutils.doc2vec(transformed, 2) # convert to dense vector, for easier equality tests
    expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
#    expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
    self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign
def initialize(self, corpus, chunks = 100, keepDecomposition = False):
    """
    Run SVD decomposition on the corpus. This will define the latent space into
    which terms and documents will be mapped.

    The SVD is created incrementally, in blocks of `chunks` documents.

    The algorithm is adapted from:
    M. Brand. 2006. Fast low-rank modifications of the thin singular value decomposition
    """
    if self.id2word is None:
        logging.info("no word id mapping provided; initializing from corpus, assuming identity")
        maxId = -1
        for document in corpus:
            maxId = max(maxId, max([-1] + [fieldId for fieldId, _ in document]))
        self.numTerms = 1 + maxId
        self.id2word = dict(zip(xrange(self.numTerms), xrange(self.numTerms)))
    else:
        self.numTerms = 1 + max([-1] + self.id2word.keys())

    # initialize decomposition (zero documents so far)
    self.u = numpy.matrix(numpy.zeros((self.numTerms, self.numTopics))) # leave default numeric type (=double)
    self.s = numpy.matrix(numpy.zeros((self.numTopics, self.numTopics)))
    #self.v = numpy.matrix(numpy.zeros((0, self.numTopics)))
    self.v = None

    # do the actual work -- perform iterative singular value decomposition
    # this is done by sequentially updating SVD with `chunks` new documents
    chunker = itertools.groupby(enumerate(corpus), key = lambda val: val[0] / chunks)
    for chunkNo, (key, group) in enumerate(chunker):
        # convert the chunk of documents to vectors
        docs = [matutils.doc2vec(doc, self.numTerms) for docNo, doc in group]
#        self.svdAddCols(docs, reorth = chunkNo % 100 == 99) # reorthogonalize once in every "100*chunks" documents
        self.svdAddCols(docs, reorth = False)
        logging.info("processed documents up to #%s" % docNo)

    # calculate projection needed to get document-topic matrix from term-document matrix.
    # note that v (topics of the training corpus) are not used at all for the transformation
    invS = numpy.diag(numpy.diag(1.0 / self.s))
    self.projection = numpy.dot(invS, self.u.T) # s^-1 * u^-1; (k, k) * (k, m) = (k, m)

    if not keepDecomposition:
        # once we have the projection stored in self, discard u*s*v decomposition to free up memory
        del self.u, self.s, self.v
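# --- shape/identity check for the projection above (illustration only) -------
# With orthonormal columns in u (so u.T acts as u^-1 on the column space), the
# projection s^-1 * u.T maps a document x = u * s * v_col back to its latent
# coordinates v_col. The toy factors below are made up; plain ndarrays are used
# instead of numpy.matrix to keep the sketch self-contained.
import numpy

u = numpy.array([[0.8, 0.0],
                 [0.0, 1.0],
                 [0.6, 0.0]])                 # (numTerms=3, numTopics=2), orthonormal columns
singularValues = numpy.array([2.0, 0.5])
s = numpy.diag(singularValues)                # (2, 2)
invS = numpy.diag(1.0 / singularValues)       # s^-1, the same role as invS in initialize()
projection = numpy.dot(invS, u.T)             # (k, k) * (k, m) = (k, m), here (2, 3)

vCol = numpy.array([[0.3], [-0.4]])           # latent coordinates of one document
x = numpy.dot(numpy.dot(u, s), vCol)          # the document in term space, (3, 1)
assert numpy.allclose(numpy.dot(projection, x), vCol)   # s^-1 * u.T * x recovers vCol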
def __getitem__(self, bow, scaled = True):
    """
    Return latent distribution, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    Note that this function returns the latent space representation **scaled by the
    singular values**. To return non-scaled embedding, set `scaled` to False.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    if utils.isCorpus(bow):
        return self._apply(bow)

    vec = matutils.doc2vec(bow, self.numTerms)
    vec.shape = (self.numTerms, 1)
    topicDist = self.projection * vec
    if not scaled:
        topicDist = numpy.diag(numpy.diag(1.0 / self.s)) * topicDist
    return [(topicId, float(topicValue)) for topicId, topicValue in enumerate(topicDist)
            if numpy.isfinite(topicValue) and not numpy.allclose(topicValue, 0.0)]
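# --- usage sketch for the scaled vs. non-scaled fold-in (illustration only) ---
# `model[doc]` returns the representation scaled by the singular values; calling
# __getitem__ with scaled = False divides that scaling back out (compare the
# scaled/non-scaled expected vectors in the test above). Assumptions: `lsimodel`
# is importable, the corpus is made up, and `self.s` was kept around by
# initialize() (as in the first initialize() version above, which deletes only
# u and v).
import lsimodel

corpus = [[(0, 1.0), (1, 1.0)],
          [(1, 1.0), (2, 2.0)],
          [(0, 1.0), (2, 1.0)]]
model = lsimodel.LsiModel(corpus, numTopics = 2)
doc = corpus[0]
scaledVec = model[doc]                             # default: scaled by the singular values
plainVec = model.__getitem__(doc, scaled = False)  # singular values divided back out
print scaledVec
print plainVec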