def test_simple_lists_of_tuples(self):
    """Check that `utils.is_corpus` accepts plain lists of (id, weight) tuples.

    Every candidate below must come back as `(True, candidate)`.
    """
    candidates = (
        # one document, one word
        [[(0, 4.)]],
        # one document, several words
        [[(0, 4.), (1, 2.)]],
        [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]],
        # several documents, one word each
        [[(0, 4.)], [(1, 2.)]],
        [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]],
    )
    for potentialCorpus in candidates:
        outcome = utils.is_corpus(potentialCorpus)
        self.assertEqual((True, potentialCorpus), outcome)
def __getitem__(self, bow, eps=1e-12):
    """Return the ESA representation of a weighted vector, or of a whole corpus.

    `bow` should already carry weights (e.g. TF-IDF). When `bow` is a corpus,
    the result of `self._apply` on it is returned instead. Entries whose
    absolute weight does not exceed `eps` are left out of the result.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        # a corpus was passed in: hand back a transformed corpus
        return self._apply(bow)

    # The stored corpus acts as the interpreter matrix: multiplying the dense
    # input vector with it yields one weight per concept.
    dense_input = matutils.sparse2full(bow, self.num_features)
    concept_weights = matutils.unitvec(numpy.dot(dense_input, self.corpus))

    # keep the output sparse: no explicit (near-)zero entries
    sparse_result = []
    for concept_id, weight in enumerate(concept_weights):
        if abs(weight) > eps:
            sparse_result.append((concept_id, weight))
    return sparse_result
def __getitem__(self, bow):
    """Get log entropy representation of the input vector and/or corpus.

    Parameters
    ----------
    bow : list of (int, int)
        Document in BoW format.

    Returns
    -------
    list of (int, float)
        Log-entropy vector for passed `bow`.

    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        # the input is a corpus: return a transformed corpus
        return self._apply(bow)

    # Terms without a stored entropy are skipped entirely, i.e. unknown (new)
    # terms get zero weight rather than an infinite/huge one.
    weighted = []
    for term_id, tf in bow:
        if term_id in self.entr:
            weighted.append((term_id, math.log(tf + 1) * self.entr.get(term_id)))

    if not self.normalize:
        return weighted
    return matutils.unitvec(weighted)
def top_topics(self, corpus, num_words=20):
    """Calculate the UMass topic coherence for each topic.

    Algorithm from **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing
    Semantic Coherence in Topic Models, EMNLP 2011.**

    For the `num_words` top words v_1..v_M of a topic the score is
    ``sum_{m=2..M} sum_{l=1..m-1} log((D(v_m, v_l) + 1) / D(v_l))`` where
    ``D`` counts the documents containing the given word(s).

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Corpus in BoW format; it is iterated once.
    num_words : int, optional
        Number of top words per topic that enter the coherence score.

    Returns
    -------
    list of (list of (float, str), float)
        One ``(top words with probabilities, coherence)`` pair per topic,
        sorted by decreasing coherence. `None` is returned (after a warning)
        when `corpus` is not recognised as a corpus.

    """
    is_corpus, corpus = utils.is_corpus(corpus)
    if not is_corpus:
        logger.warning("LdaModel.top_topics() called with an empty corpus")
        return

    topics = []
    str_topics = []
    for topic in self.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn=num_words, reverse=True)
        topics.append(bestn)
        str_topics.append([(topic[word_id], self.id2word[word_id]) for word_id in bestn])

    # top_ids is limited to every topic's top words, so it cannot exceed the
    # vocabulary size.
    top_ids = set(chain.from_iterable(topics))

    # For each top word, the set of documents it occurs in. Built in a single
    # pass over the corpus instead of one pass per word.
    doc_word_list = {word_id: set() for word_id in top_ids}
    for doc_no, document in enumerate(corpus):
        for word_id in top_ids.intersection(x[0] for x in document):
            doc_word_list[word_id].add(doc_no)

    coherence_scores = []
    for t, top_words in enumerate(topics):
        coherence = 0.0
        # m runs over ranks 2..M; m_index is the 0-based rank of word m.
        for m_index, m in enumerate(top_words[1:], start=1):
            m_docs = doc_word_list[m]  # v_m^(t)
            # l runs over all words ranked higher than m. The slice must use
            # the rank of m, not its word id.
            for l in top_words[:m_index]:
                l_docs = doc_word_list[l]  # v_l^(t)
                # skip words that appear in no document (avoids division by zero)
                if l_docs:
                    co_doc_frequency = len(m_docs.intersection(l_docs))
                    coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))
        coherence_scores.append((str_topics[t], coherence))

    return sorted(coherence_scores, key=lambda t: t[1], reverse=True)
def get_similarities(self, query):
    """Get similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document or collection of documents. A `numpy.ndarray` that is not a
        corpus is treated as an array of document indexes into `self.corpus`.

    Return
    ------
    :class:`numpy.ndarray`
        Similarity matrix; an empty array when the index holds no documents.

    """
    if not self.corpus:
        # numpy.array() without an argument raises TypeError, so an empty
        # array has to be requested explicitly.
        return numpy.array([])

    is_corpus, query = utils.is_corpus(query)
    if not is_corpus and isinstance(query, numpy.ndarray):
        # convert document indexes to actual documents
        query = [self.corpus[i] for i in query]

    result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)

    if scipy.sparse.issparse(result):
        return numpy.asarray(result.todense())
    if numpy.isscalar(result):
        return numpy.array(result)
    return numpy.asarray(result)[0]
def __getitem__(self, bow, eps=1e-12):
    """Return the ESA representation of a weighted vector, or of a whole corpus.

    `bow` should already carry weights (e.g. TF-IDF). When `bow` is a corpus,
    the result of `self._apply` on it is returned instead. Entries whose
    absolute weight does not exceed `eps` are left out of the result.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        # a corpus was passed in: hand back a transformed corpus
        return self._apply(bow)

    # similarity of the input with each vector of the indexed corpus
    similarities = self.similarity_index[bow]

    # Cosine similarity lies in [-1, 1]; shift and scale it to [0, 1].
    # NOTE(review): both steps update the array returned by the index in place.
    similarities += 1
    similarities /= 2

    normalized = matutils.unitvec(similarities)

    # keep the output sparse: no explicit (near-)zero entries
    sparse_result = []
    for concept_id, weight in enumerate(normalized):
        if abs(weight) > eps:
            sparse_result.append((concept_id, weight))
    return sparse_result
def _getbow(self, doc):
    """Convert `doc` to bag-of-words, or wrap a whole transformed corpus.

    A single document goes through `self.dict.doc2bow` with
    `allow_update=True`; a corpus is passed to `self._apply` and the result is
    wrapped in a `SimpleCorpus`.
    """
    is_corpus, doc = utils.is_corpus(doc)
    if not is_corpus:
        return self.dict.doc2bow(doc, allow_update=True)
    # an iterable of documents: apply to all of them
    return SimpleCorpus(self._apply(doc))
def __getitem__(self, bow, iterations=100):
    """Get vector for document(s).

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document (or corpus) in BoW format.
    iterations : int, optional
        Number of iterations that will be used for inferring.

    Returns
    -------
    list of (int, float)
        LDA vector for document as sequence of (topic_id, topic_probability) **OR**
    list of list of (int, float)
        LDA vectors for corpus in same format.

    """
    is_corpus, _ = utils.is_corpus(bow)
    # a single document is wrapped so that the tool always sees a corpus
    documents = bow if is_corpus else [bow]

    self.convert_input(documents, infer=True)

    template = (
        self.mallet_path + ' infer-topics --input %s --inferencer %s '
        '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    )
    cmd = template % (
        self.fcorpusmallet() + '.infer',
        self.finferencer(),
        self.fdoctopics() + '.infer',
        iterations,
        self.topic_threshold,
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    # NOTE(review): the command is a formatted string run through the shell;
    # paths containing shell metacharacters would be interpreted by it.
    check_output(args=cmd, shell=True)

    inferred = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    if is_corpus:
        return inferred
    return inferred[0]
def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
    """Return the topic distribution of document `bow`.

    The distribution is a list of (topic_id, topic_probability) 2-tuples;
    topics with probability below `minimum_probability` are ignored. Both
    `minimum_probability` and `minimum_phi_value` default to
    `self.minimum_probability` and are never allowed below 1e-8.

    If `per_word_topics` is True, a 3-tuple
    ``(document_topics, word_topic, word_phi)`` is returned instead:

    * `word_topic` pairs each word id with its topic ids, most likely first;
    * `word_phi` pairs each word id with its (topic_id, phi_value) list, the
      phi values being scaled by feature length (i.e. word count).

    If `bow` is a corpus, a transformed corpus is returned.
    """
    # never allow zero values in sparse output
    if minimum_probability is None:
        minimum_probability = self.minimum_probability
    minimum_probability = max(minimum_probability, 1e-8)

    if minimum_phi_value is None:
        minimum_phi_value = self.minimum_probability
    minimum_phi_value = max(minimum_phi_value, 1e-8)

    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(
            corpus,
            per_word_topics=per_word_topics,
            minimum_probability=minimum_probability,
            minimum_phi_value=minimum_phi_value
        )

    gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
    topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

    document_topics = []
    for topicid, topicvalue in enumerate(topic_dist):
        if topicvalue >= minimum_probability:
            document_topics.append((topicid, topicvalue))

    if not per_word_topics:
        return document_topics

    word_topic = []  # (word_id, [topic ids, most probable first])
    word_phi = []  # (word_id, [(topic_id, phi_value), ...])
    for word_type, weight in bow:
        # 'raw' (topic, phi) pairs above the threshold, in topic order
        phi_topic = [
            (topic_id, phis[topic_id][word_type])
            for topic_id in range(0, self.num_topics)
            if phis[topic_id][word_type] >= minimum_phi_value
        ]
        word_phi.append((word_type, phi_topic))

        # the same pairs flipped to (phi, topic) so sorting ranks by phi value
        phi_values = [(phi_value, topic_id) for topic_id, phi_value in phi_topic]
        topics_sorted = [topic_id for _, topic_id in sorted(phi_values, reverse=True)]
        word_topic.append((word_type, topics_sorted))

    return (document_topics, word_topic, word_phi)
def __getitem__(self, bow, eps=0.01):
    """Convert document or corpus in BoW format to LDA vectors in BoW format

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Document or corpus in BoW format.
    eps : float
        Threshold value (topics whose value is not greater than `eps` are ignored).

    Returns
    -------
    list of (int, float)
        LDA vector for document **OR**
    list of list of (int, float)
        LDA vectors for corpus.

    """
    is_corpus, _ = utils.is_corpus(bow)
    # a single document is predicted as a one-document batch
    documents = bow if is_corpus else [bow]

    predictions = self._predict(documents)[0]
    topics = [
        [(topic_id, val) for topic_id, val in enumerate(row) if val > eps]
        for row in predictions
    ]

    if is_corpus:
        return topics
    return topics[0]
def __getitem__(self, bow, scaled=False, chunksize=512):
    """Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.

    Parameters
    ----------
    bow : {list of (int, number), iterable of list of (int, number)}
        Document or corpus in BoW format.
    scaled : bool, optional
        If set, scale each topic by the inverse of its singular value.
    chunksize : int, optional
        When `bow` is a corpus and `chunksize` is truthy, the work is
        delegated to `self._apply(bow, chunksize=chunksize)`.

    Returns
    -------
    Sparse vector for a single document, `matutils.Dense2Corpus` for a chunk
    of documents, or the result of `self._apply` for a chunked corpus.

    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # By default `chunksize` documents are transformed at once when called
        # as `lsi[corpus]`. The chunking is transparent to the user, but one
        # mat * mat multiplication is faster than many mat * vec ones.
        return self._apply(bow, chunksize=chunksize)

    documents = bow if is_corpus else [bow]
    vec = matutils.corpus2csc(documents, num_terms=self.num_terms)
    # (x^T * u).T = u^-1 * x ; result has one row per topic, one column per document
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T

    if scaled:
        # s^-1 * u^-1 * x. The scale is shaped as a column so that it applies
        # per topic (row); a flat vector would broadcast along the document
        # axis and blow a single document up to a (topics x topics) matrix.
        # assumes self.projection.s is a 1-d array of singular values
        inv_s = 1.0 / self.projection.s[:self.num_topics]
        topic_dist = inv_s.reshape(-1, 1) * topic_dist

    # convert a numpy array to gensim sparse vector = tuples of
    # (feature_id, feature_weight), with no zero weights
    if not is_corpus:
        # lsi[single_document]
        return matutils.full2sparse(topic_dist.flat)
    # lsi[chunk of documents]
    return matutils.Dense2Corpus(topic_dist)
def get_similarities(self, query):
    """Return similarity of sparse vector `query` to all documents in the corpus.

    The result is a numpy array. If `query` is a collection of documents, a
    2D array of similarities of each document in `query` to all documents in
    the corpus is returned (=batch query, faster than processing each document
    in turn).

    `query` may be a corpus, a scipy sparse matrix, a numpy array or a single
    vector in sparse gensim format. The caller's object is not modified.

    **Do not use this function directly; use the self[query] syntax instead.**

    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
    elif scipy.sparse.issparse(query):
        query = query.T  # convert documents=rows to documents=columns
    elif isinstance(query, numpy.ndarray):
        if query.ndim == 1:
            # reshape() leaves the passed-in array untouched, whereas assigning
            # to `query.shape` would turn the caller's vector into a matrix.
            query = query.reshape(1, -1)
        query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
    else:
        # default case: query is a single vector, in sparse gensim format
        query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

    # compute cosine similarity against every other document in the collection
    result = self.index * query.tocsc()  # N x T * T x C = N x C
    if result.shape[1] == 1:
        # for queries of one document, return a 1d array
        return result.toarray().flatten()
    # otherwise, return a 2d matrix (#queries x #index)
    return result.toarray().T
def __getitem__(self, bow, chunksize=10000):
    """Compute the hidden representation of a document or of a corpus.

    The input is processed in chunks of `chunksize` documents (a falsy
    `chunksize` is treated as 1). For a corpus a generator over the
    transformed documents is returned; for a single document the one
    transformed vector is returned directly.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if not is_corpus:
        bow = [bow]

    ln.info("Computing hidden representation for %s documents..." % len(bow))

    # todo I think could be removed altogether
    chunksize = chunksize or 1

    def transformed_corpus():
        chunks = utils.grouper(bow, chunksize)
        for chunk_no, doc_chunk in enumerate(chunks):
            ln.debug("Converting chunk %s to csc format.." % chunk_no)
            chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)

            ln.debug("Computing hidden representation for chunk.. ")
            hidden = self._get_hidden_representations(chunk)

            docs_done = chunk_no * chunksize + len(doc_chunk)
            ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                    (chunk_no, docs_done))

            # one column of `hidden` per document
            for column in hidden.T:
                yield matutils.dense2vec(column.T)
            ln.debug("Done yielding chunk %s" % chunk_no)
        ln.info("Finished computing representations for all chunks.")

    if is_corpus:
        return transformed_corpus()
    # single document: exhaust the generator and take its only (last) item
    return list(transformed_corpus()).pop()
def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
    """Run `current_representation` through `layer` chunk by chunk and save each result.

    Every transformed chunk is written with `np.save` to
    ``<filename_prefix>_<chunk number zero-padded to 15 digits>``.

    If `current_representation` is a gensim corpus, it is grouped into chunks
    of `chunksize` documents and each chunk is converted to CSC format first,
    which requires `num_terms`. Otherwise it is assumed to already be an
    iterable of chunks that `layer` accepts as they are.
    """
    def chunk_filename(chunk_no):
        # str.zfill pads to 15 digits and leaves longer numbers untouched
        return "%s_%s" % (filename_prefix, str(chunk_no).zfill(15))

    is_corpus, current_representation = utils.is_corpus(current_representation)

    if not is_corpus:
        ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
        ln.debug("Type of current_representation is %s" % type(current_representation))
        for chunk_no, chunk in enumerate(current_representation):
            ln.debug("converting chunk (%s documents)..." % chunksize)
            chunk_trans = layer.__getitem__(chunk)
            ln.debug("Serializing hidden representation..")
            np.save(chunk_filename(chunk_no), chunk_trans)
            ln.debug("finished serializing chunk.")
    else:
        grouped = utils.grouper(current_representation, chunksize)
        for chunk_no, chunk in enumerate(grouped):
            ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
            assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
            chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)
            ln.debug("Chunk converted to csc, running through layer..")
            chunk_trans = layer.__getitem__(chunk_as_csc)
            ln.debug("Serializing hidden representation..")
            np.save(chunk_filename(chunk_no), chunk_trans)
            ln.debug("Finished serializing chunk. Processed %s documents so far." %
                     (chunk_no * chunksize + len(chunk)))

    ln.info("Finished serializing all chunks.")
def get_similarities(self, query):
    """Return similarity of sparse vector `query` to all documents in the corpus.

    The result is a numpy array. If `query` is a collection of documents, a
    2D array of similarities of each document in `query` to all documents in
    the corpus is returned (=batch query, faster than processing each document
    in turn).

    **Do not use this function directly; use the self[query] syntax instead.**

    """
    index_dtype = self.index.dtype

    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        dense_docs = [matutils.sparse2full(vec, self.num_features) for vec in query]
        query = numpy.asarray(dense_docs, dtype=index_dtype)
    else:
        if scipy.sparse.issparse(query):
            query = query.toarray()  # convert sparse to dense
        elif not isinstance(query, numpy.ndarray):
            # default case: query is a single vector in sparse gensim format
            query = matutils.sparse2full(query, self.num_features)
        query = numpy.asarray(query, dtype=index_dtype)

    # Do a little transposition dance to stop numpy from making a copy of
    # self.index internally in numpy.dot (very slow).
    # The result is #queries x #index.
    return numpy.dot(self.index, query.T).T
def __getitem__(self, bow, eps=1e-12):
    """Return tf-idf representation of the input vector and/or corpus.

    Entries whose absolute weight does not exceed `eps` are left out. If
    `bow` is a corpus, a transformed corpus is returned.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # Terms with no (or zero) idf are dropped: unknown (new) terms get zero
    # weight, NOT the infinite/huge weight a strict application of the IDF
    # formula would dictate.
    vector = []
    for termid, tf in bow:
        if self.idfs.get(termid, 0.0) != 0.0:
            vector.append((termid, self.wlocal(tf) * self.idfs.get(termid)))

    # normalize either to unit length, or with a user-defined function
    normalizer = self.normalize
    if normalizer is True:
        vector = matutils.unitvec(vector)
    elif normalizer:
        vector = normalizer(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    return [(termid, weight) for termid, weight in vector if abs(weight) > eps]
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of
    similarities of all query documents vs. all corpus document. This batch
    query is more efficient than computing the similarities one document
    after another.

    When `self.num_best` is set, only the `num_best` most similar hits are
    returned: a list of hits for a single document, or one such list per
    query document for a batch.
    """
    self.close_shard()  # no-op if no documents added to index since last query

    results = []
    for shard in self.shards:
        shard.num_best = self.num_best
        shard.normalize = self.normalize
        results.append(shard[query])

    if self.num_best is None:
        return numpy.hstack(results)

    def merge(parts):
        # Combine the per-shard hit lists and keep the best `num_best`.
        # chain.from_iterable avoids the quadratic copying of sum(parts, []).
        hits = itertools.chain.from_iterable(parts)
        return sorted(hits, key=lambda item: -item[1])[:self.num_best]

    # only top-n most similars requested; merge the partial results from all shards
    is_corpus, results = utils.is_corpus(results)
    if is_corpus:
        # one flat hit list per shard: the query was a single document
        return merge(results)

    # Batch query: regroup the shard results per query document. Built-in zip
    # is used because itertools.izip does not exist on Python 3.
    return [merge(parts) for parts in zip(*results)]
def __getitem__(self, bow, eps=0.01):
    """Return the normalized topic values of `bow` as a plain list.

    One value per topic is returned, in topic order; `eps` is accepted but
    not used. If `bow` is a corpus, a transformed corpus is returned.
    """
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(corpus)

    gamma, _ = self.inference([bow])
    doc_gamma = gamma[0]
    # normalize to proper distribution
    topic_dist = doc_gamma / sum(doc_gamma)
    return list(topic_dist)
def __getitem__(self, bow, eps=0.01):
    """Return the topic distribution of `bow` as (topic_id, value) pairs.

    Topics whose value is below `eps` are left out. If the inferred gamma
    sums to zero, an empty list is returned. If `bow` is a corpus, a
    transformed corpus is returned.
    """
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(corpus)

    gamma = self.inference([bow])[0]
    total = sum(gamma)
    if total != 0:
        topic_dist = gamma / total
    else:
        # nothing to normalize
        topic_dist = []

    kept = []
    for topicid, topicvalue in enumerate(topic_dist):
        if topicvalue >= eps:
            kept.append((topicid, topicvalue))
    return kept
def __getitem__(self, doc):
    """Apply the wrapped function to a document, or to every document of a corpus.

    A single document is passed to `self.funct` together with the stored
    `self.fargs` / `self.fkwargs`; a corpus is handed to `self._apply`.
    """
    is_corpus, doc = utils.is_corpus(doc)
    if not is_corpus:
        # return the doc transformed according to the function
        return self.funct(doc, *self.fargs, **self.fkwargs)
    return self._apply(doc)
def top_topics(self, corpus, num_words=20):
    """Calculate the UMass topic coherence for each topic.

    Algorithm from **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing
    Semantic Coherence in Topic Models, EMNLP 2011.**

    For the `num_words` top words v_1..v_M of a topic the score is
    ``sum_{m=2..M} sum_{l=1..m-1} log((D(v_m, v_l) + 1) / D(v_l))`` where
    ``D`` counts the documents containing the given word(s). Pairs whose
    higher-ranked word occurs in no document are skipped.

    Parameters
    ----------
    corpus : iterable of list of (int, number)
        Corpus in BoW format; it is iterated once.
    num_words : int, optional
        Number of top words per topic that enter the coherence score.

    Returns
    -------
    list of (list of (float, str), float)
        One ``(top words with probabilities, coherence)`` pair per topic,
        sorted by decreasing coherence. `None` is returned (after a warning)
        when `corpus` is not recognised as a corpus.

    """
    is_corpus, corpus = utils.is_corpus(corpus)
    if not is_corpus:
        logger.warning("LdaModel.top_topics() called with an empty corpus")
        return

    topics = []
    str_topics = []
    for topic in self.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn=num_words, reverse=True)
        topics.append(bestn)
        str_topics.append([(topic[word_id], self.id2word[word_id]) for word_id in bestn])

    # top_ids is limited to every topic's top words, so it cannot exceed the
    # vocabulary size.
    top_ids = set(chain.from_iterable(topics))

    # Document-occurrence sets per top word, filled in one corpus pass.
    doc_word_list = {word_id: set() for word_id in top_ids}
    for doc_no, document in enumerate(corpus):
        for word_id in top_ids.intersection(x[0] for x in document):
            doc_word_list[word_id].add(doc_no)

    coherence_scores = []
    for t, top_words in enumerate(topics):
        coherence = 0.0
        # rank is the 0-based position of word m among the topic's top words
        for rank, m in enumerate(top_words[1:], start=1):
            m_docs = doc_word_list[m]  # v_m^(t)
            # all words ranked higher than m; slicing by the word id `m`
            # instead of its rank would pick unrelated pairs
            for l in top_words[:rank]:
                l_docs = doc_word_list[l]  # v_l^(t)
                if not l_docs:
                    # word l is absent from the corpus: D(v_l) == 0 would
                    # raise ZeroDivisionError below
                    continue
                co_doc_frequency = len(m_docs.intersection(l_docs))
                coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))
        coherence_scores.append((str_topics[t], coherence))

    return sorted(coherence_scores, key=lambda t: t[1], reverse=True)
def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
    """Return the topic distribution of document `bow`.

    The distribution is a list of (topic_id, topic_probability) 2-tuples;
    topics with probability below `minimum_probability` are ignored. Both
    `minimum_probability` and `minimum_phi_value` default to
    `self.minimum_probability` and are never allowed below 1e-8.

    If `per_word_topics` is True, a 3-tuple
    ``(document_topics, word_topic, word_phi)`` is returned instead:

    * `word_topic` pairs each word id with its topic ids, most likely first;
    * `word_phi` pairs each word id with its (topic_id, phi_value) list, the
      phi values being scaled by feature length (i.e. word count).

    If `bow` is a corpus, a transformed corpus is returned.
    """
    # never allow zero values in sparse output
    if minimum_probability is None:
        minimum_probability = self.minimum_probability
    minimum_probability = max(minimum_probability, 1e-8)

    if minimum_phi_value is None:
        minimum_phi_value = self.minimum_probability
    minimum_phi_value = max(minimum_phi_value, 1e-8)

    # if the input vector is a corpus, return a transformed corpus
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        kwargs = dict(
            per_word_topics=per_word_topics,
            minimum_probability=minimum_probability,
            minimum_phi_value=minimum_phi_value
        )
        return self._apply(corpus, **kwargs)

    # The phi statistics are only read in the per-word branch below, so they
    # are only requested when that branch will run.
    # NOTE(review): assumes `inference` returns the same gamma regardless of
    # `collect_sstats` - confirm against its implementation.
    gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
    topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

    document_topics = [
        (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
        if topicvalue >= minimum_probability
    ]

    if not per_word_topics:
        return document_topics

    word_topic = []  # contains word and corresponding topics
    word_phi = []  # contains word and phi values
    for word_type, weight in bow:
        phi_values = []  # (phi_value, topic) pairs, to be sorted by phi value
        phi_topic = []  # (topic, phi_value) pairs, returned 'raw' to the user
        for topic_id in range(0, self.num_topics):
            phi_value = phis[topic_id][word_type]
            if phi_value >= minimum_phi_value:
                # these phi values are scaled by feature length
                phi_values.append((phi_value, topic_id))
                phi_topic.append((topic_id, phi_value))

        # (word_id, [(topic_0, phi_value), (topic_1, phi_value), ...])
        word_phi.append((word_type, phi_topic))

        # (word_id, [topic_id_most_probable, topic_id_second_most_probable, ...])
        topics_sorted = [x[1] for x in sorted(phi_values, reverse=True)]
        word_topic.append((word_type, topics_sorted))

    return (document_topics, word_topic, word_phi)  # returns 3-tuple
def __getitem__(self, bow, eps=1e-12):
    """Get the tf-idf representation of an input vector and/or corpus.

    Parameters
    ----------
    bow : {list of (int, int), iterable of iterable of (int, int)}
        Input document in the `sparse Gensim bag-of-words format
        <https://radimrehurek.com/gensim/intro.html#core-concepts>`_,
        or a streamed corpus of such documents.
    eps : float
        Threshold value, will remove all position that have tfidf-value less than `eps`.

    Returns
    -------
    vector : list of (int, float)
        TfIdf vector, if `bow` is a single document
    :class:`~gensim.interfaces.TransformedCorpus`
        TfIdf corpus, if `bow` is a corpus.

    Notes
    -----
    The call stores `eps` on `self.eps` and replaces a boolean
    `self.normalize` with the matching callable.

    """
    self.eps = eps

    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # split the document into parallel id / frequency sequences
    termid_array, tf_array = [], []
    for termid, tf in bow:
        termid_array.append(termid)
        tf_array.append(tf)
    tf_array = self.wlocal(np.array(tf_array))

    # Terms whose idf magnitude does not exceed eps are dropped: unknown (new)
    # terms get zero weight, NOT the infinite/huge weight a strict application
    # of the IDF formula would dictate.
    vector = []
    for termid, tf in zip(termid_array, tf_array):
        if abs(self.idfs.get(termid, 0.0)) > self.eps:
            vector.append((termid, tf * self.idfs.get(termid)))

    # resolve the boolean shorthand to an actual normalization function
    if self.normalize is True:
        self.normalize = matutils.unitvec
    elif self.normalize is False:
        self.normalize = utils.identity

    if self.pivot is not None:
        # pivoted normalization: divide by a norm interpolated between the
        # pivot and the vector's own norm
        _, old_norm = self.normalize(vector, return_norm=True)
        pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
        divisor = float(pivoted_norm)
        return [
            (termid, weight / divisor)
            for termid, weight in vector
            if abs(weight / divisor) > self.eps
        ]

    # normalize either to unit length, or with a user-defined function
    normalized = self.normalize(vector)
    return [(termid, weight) for termid, weight in normalized if abs(weight) > self.eps]
def __getitem__(self, bow, scaled=False, chunksize=512):
    """Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.

    If `scaled` is set, scale topics by the inverse of singular values
    (default: no scaling).

    If `bow` is a corpus and `chunksize` is truthy, the work is delegated to
    `self._apply(bow, chunksize=chunksize)`; a corpus with a falsy `chunksize`
    is transformed as one chunk and returned as `matutils.Dense2Corpus`.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # By default `chunksize` documents are transformed at once when called
        # as `lsi[corpus]`. The chunking is transparent to the user, but one
        # mat * mat multiplication is faster than `chunksize` mat * vec ones.
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # Convert the input to scipy.sparse CSC, then do a "sparse * dense = dense"
    # multiplication. (Earlier notes in this method recorded that dense * dense
    # and index-based variants ran at about the same speed, the dense one
    # using more memory.)
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    # (x^T * u).T = u^-1 * x ; one row per topic, one column per document
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T

    if scaled:
        # s^-1 * u^-1 * x. The scale is shaped as a column so that it applies
        # per topic (row) for any number of documents; a flat vector would
        # broadcast along the document axis of a multi-document chunk.
        # assumes self.projection.s is a 1-d array of singular values
        inv_s = 1.0 / self.projection.s[:self.num_topics]
        topic_dist = inv_s.reshape(-1, 1) * topic_dist

    # convert a numpy array to gensim sparse vector = tuples of
    # (feature_id, feature_weight), with no zero weights
    if not is_corpus:
        # lsi[single_document]: convert back from matrix into a 1d vec
        return matutils.full2sparse(topic_dist.reshape(-1))
    # lsi[chunk of documents]
    return matutils.Dense2Corpus(topic_dist)
def test_getitem_dense2gensim(self):
    """Check gensim-style output of a densely serialized `ShardedCorpus`.

    Single-item, slice and index-list retrieval must all yield documents as
    lists of tuples, and slice and index-list retrieval must agree.
    """
    corpus = ShardedCorpus(
        self.tmp_fname, self.data, shardsize=100, dim=self.dim,
        sparse_serialization=False, gensim=True
    )

    def check_documents(docs):
        # a materialized result must be a list of lists of tuples
        self.assertTrue(isinstance(docs, list))
        self.assertTrue(isinstance(docs[0], list))
        self.assertTrue(isinstance(docs[0][0], tuple))

    # single item
    item = corpus[3]
    self.assertTrue(isinstance(item, list))
    self.assertTrue(isinstance(item[0], tuple))

    # slice notation; the first document is consumed by next() before the
    # rest is materialized
    dslice = corpus[2:6]
    self.assertTrue(next(dslice) == corpus[2])
    dslice = list(dslice)
    check_documents(dslice)

    iscorp, _ = is_corpus(dslice)
    self.assertTrue(iscorp, "Is the object returned by slice notation "
                            "a gensim corpus?")

    # list-of-indexes notation, handled the same way
    ilist = corpus[[2, 3, 4, 5]]
    self.assertTrue(next(ilist) == corpus[2])
    ilist = list(ilist)
    check_documents(ilist)

    # From generators to lists
    self.assertEqual(len(ilist), len(dslice))
    for i, (irow, drow) in enumerate(zip(ilist, dslice)):
        self.assertEqual(len(irow), len(drow),
                         "Row %d: dims %d/%d" % (i, len(irow), len(drow)))
        for j, (ivalue, dvalue) in enumerate(zip(irow, drow)):
            self.assertEqual(ivalue, dvalue,
                             "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                 i, j, str(ivalue), i, j, str(dvalue)))

    iscorp, _ = is_corpus(ilist)
    self.assertTrue(iscorp, "Is the object returned by list notation "
                            "a gensim corpus?")
def __getitem__(self, doc):
    """Feed `doc` into `self.counter` and return it unchanged.

    If `doc` is a corpus, `self._apply` is used so that every document of it
    is handled.
    """
    is_corpus, doc = utils.is_corpus(doc)
    if not is_corpus:
        self.counter.update(doc)
        return doc
    return self._apply(doc)
def __getitem__(self, bow, scaled=False, chunksize=512):
    """Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding the input document into the latent topic space.

    If `scaled` is set, scale topics by the inverse of singular values
    (default: no scaling).

    If `bow` is a corpus and `chunksize` is truthy, the work is delegated to
    `self._apply(bow, chunksize=chunksize)`; a corpus with a falsy `chunksize`
    is transformed as one chunk and returned as `matutils.Dense2Corpus`.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # Transform `chunksize` documents at once when called as `lsi[corpus]`:
        # one mat * mat multiplication instead of `chunksize` smaller
        # mat * vec multiplications, transparently to the user.
        return self._apply(bow, chunksize=chunksize)

    documents = bow if is_corpus else [bow]

    # Sparse CSC input times dense projection gives a dense result. (Notes
    # previously kept here recorded dense * dense and advanced-indexing
    # variants as roughly equally fast, dense * dense consuming more memory.)
    u = self.projection.u
    vec = matutils.corpus2csc(documents, num_terms=self.num_terms, dtype=u.dtype)
    # (x^T * u).T = u^-1 * x ; rows are topics, columns are documents
    topic_dist = (vec.T * u[:, :self.num_topics]).T

    if scaled:
        # s^-1 * u^-1 * x. Scaling happens while topic_dist is still 2-d,
        # with the inverse singular values as a column, so that each topic
        # row is scaled no matter how many documents the chunk holds; a flat
        # vector would broadcast along the document axis instead.
        # assumes self.projection.s is a 1-d array of singular values
        inverse_s = 1.0 / self.projection.s[:self.num_topics]
        topic_dist = inverse_s.reshape(-1, 1) * topic_dist

    # convert a numpy array to gensim sparse vector = tuples of
    # (feature_id, feature_weight), with no zero weights
    if is_corpus:
        # lsi[chunk of documents]
        return matutils.Dense2Corpus(topic_dist)
    # lsi[single_document]: convert back from matrix into a 1d vec
    return matutils.full2sparse(topic_dist.reshape(-1))
def __getitem__(self, doc):
    """Return `doc` as a bag-of-bitokens list, or transform a whole corpus.

    `self.bidict` is only allowed to update itself while it is still empty.
    """
    is_corpus, doc = utils.is_corpus(doc)
    if is_corpus:
        # an iterable of documents: apply to all of them
        return self._apply(doc)

    # applying the transformation; updates are allowed only on an empty bidict
    allow_update = not (len(self.bidict) > 0)
    return self.bidict.doc2bob(doc, allow_update)
def __getitem__(self, bow, eps=0.01):
    """Return the normalized topic values of `bow` as a plain list.

    The list holds one value per topic, in topic order. `eps` is accepted
    but not used. If `bow` is a corpus, a transformed corpus is returned.
    """
    is_corpus, corpus = utils.is_corpus(bow)
    if not is_corpus:
        gamma, _ = self.inference([bow])
        # normalize to proper distribution
        normalized = gamma[0] / sum(gamma[0])
        return list(normalized)
    return self._apply(corpus)
def __getitem__(self, item):
    """Apply the transformation to a corpus or a `DatasetABC` instance.

    Raises
    ------
    ValueError
        If `item` is neither recognised by `is_corpus` nor a `DatasetABC`.
    """
    iscorpus, _ = is_corpus(item)
    if not iscorpus and not isinstance(item, DatasetABC):
        raise ValueError('Cannot apply flatten_composite to individual '
                         'documents.')
    return self._apply(item)
def __getitem__(self, vec, eps=1e-12):
    """Score a sparse vector by the mean of its weights (or squared weights).

    A corpus is delegated to `self._apply`. For a single vector the result is
    the average of the values when `self.L1` is truthy, otherwise the average
    of the squared values; an empty (falsy) vector scores 0. `eps` is unused.
    """
    is_corpus, vec = utils.is_corpus(vec)
    if is_corpus:
        return self._apply(vec)

    if self.L1:
        if not vec:
            return 0
        return sum(weight for _, weight in vec) / len(vec)

    if not vec:
        return 0
    return sum(weight * weight for _, weight in vec) / len(vec)
def __getitem__(self, items):
    """Return dense vector(s) of length `self.size` for a document or corpus.

    The sparse representation comes from `self._get_vector_representation`
    and is densified with `sparse2full`; a corpus yields a list with one dense
    vector per element of that representation.
    """
    is_corpus, items = utils.is_corpus(items)
    if is_corpus:
        return [
            sparse2full(sparse_vec, self.size)
            for sparse_vec in self._get_vector_representation(items)
        ]

    sparse_vec = self._get_vector_representation(items)
    return sparse2full(sparse_vec, self.size)
def __getitem__(self, bow):
    """Return `bow` with its ids remapped through `self.old2new`.

    A corpus is delegated to `self._apply`. For a single document, entries
    whose id is missing from `self.old2new` are dropped and the remaining
    (new_id, weight) pairs are returned sorted.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    remapped = [
        (self.old2new[oldid], weight)
        for oldid, weight in bow
        if oldid in self.old2new
    ]
    remapped.sort()
    return remapped
def __getitem__(self, vec, eps=1e-12):
    """Return the mean weight (L1) or mean squared weight of a sparse vector.

    A corpus is delegated to `self._apply`. An empty (falsy) vector scores 0.
    `eps` is accepted but not used.
    """
    is_corpus, vec = utils.is_corpus(vec)
    if is_corpus:
        return self._apply(vec)

    if not self.L1:
        return sum(value * value for _, value in vec) / len(vec) if vec else 0
    return sum(value for _, value in vec) / len(vec) if vec else 0
def __getitem__(self, item):
    """Apply to a corpus, or look `item` up in `self.serialized_data`.

    A corpus is delegated to `self._apply`; anything else is used as an
    index/key into `self.serialized_data`.
    """
    iscorpus, _ = is_corpus(item)
    if not iscorpus:
        # The original author considered raising
        # ValueError('Cannot apply serializer to individual documents.') here
        # and was unsure whether direct indexing works ("Will this work?").
        return self.serialized_data[item]
    return self._apply(item)
def top_topics(self, corpus, num_topics=5, num_words=20):
    """Calculate the Umass topic coherence for each topic and return the top `num_topics`.

    Algorithm from **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing
    Semantic Coherence in Topic Models, CEMNLP 2011.**

    Parameters
    ----------
    corpus : iterable of documents
        Documents whose entries carry the word id at position 0.
    num_topics : int, optional
        How many of the best-scoring topics to return. Values below 0 or above
        `self.num_topics` are replaced by ``min(5, self.num_topics)`` and a
        warning is logged.
    num_words : int, optional
        Number of highest-weighted words per topic used for the score.

    Returns
    -------
    list of (list of (float, str), float) or None
        ``(topic, coherence)`` pairs sorted by decreasing coherence, where
        `topic` lists ``(weight, word)`` for its top words. ``None`` is
        returned (after a warning) when `utils.is_corpus` rejects `corpus`.

    """
    if num_topics < 0 or num_topics > self.num_topics:
        num_topics = min(5, self.num_topics)
        logger.warning("num_topics out of range - setting to %d", num_topics)

    is_corpus, corpus = utils.is_corpus(corpus)
    if not is_corpus:
        logger.warning("LdaModel.top_topics() called with an empty corpus")
        return

    topics = []
    str_topics = []
    for topic in self.state.get_lambda():
        topic = topic / topic.sum()  # normalize to probability dist
        bestn = np.argsort(topic)[::-1][:num_words]
        topics.append(bestn)
        str_topics.append([(topic[word_id], self.id2word[word_id]) for word_id in bestn])

    # For every top word, collect the set of documents it occurs in, using a
    # single pass over the corpus. Words that never occur keep an empty set.
    doc_word_list = {word_id: set() for word_id in chain.from_iterable(topics)}
    for doc_no, document in enumerate(corpus):
        for entry in document:
            docs_with_word = doc_word_list.get(entry[0])
            if docs_with_word is not None:
                docs_with_word.add(doc_no)

    # NOTE(review): the loops below pair every word_m with every word_l and
    # divide by the document frequency of word_m; this looks different from
    # the formula in the cited paper -- confirm before relying on the values.
    coherence_scores = []
    for topic_no, top_words in enumerate(topics):
        topic_coherence_sum = 0.0
        for word_m in top_words[1:]:
            m_docs = doc_word_list[word_m]
            if not m_docs:
                # word absent from this corpus: no document frequency to divide by
                continue
            doc_frequency_m = len(m_docs)
            for word_l in top_words[:-1]:
                co_doc_frequency = len(m_docs & doc_word_list[word_l])
                topic_coherence_sum += np.log((co_doc_frequency + 1.0) / doc_frequency_m)
        coherence_scores.append((str_topics[topic_no], topic_coherence_sum))

    ranked = sorted(coherence_scores, key=lambda tup: tup[1], reverse=True)
    return ranked[:num_topics]
def __getitem__(self, bow, iterations=100):
    """Infer topics for a document or corpus by running MALLET's ``infer-topics``.

    Parameters
    ----------
    bow : document or corpus
        A single document is wrapped into a one-element corpus.
    iterations : int, optional
        Value passed to MALLET as ``--num-iterations``.

    Returns
    -------
    list
        Everything yielded by `read_doctopics` for the inference output file
        (a list even when a single document was passed in).

    Raises
    ------
    RuntimeError
        If the MALLET command exits with a non-zero status.

    """
    is_corpus, corpus = utils.is_corpus(bow)
    # Use the object handed back by is_corpus: presumably it is the rewound
    # input when `bow` was a one-shot iterator -- verify against utils.is_corpus.
    docs = corpus if is_corpus else [bow]

    self.convert_input(docs, infer=True)
    cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
    cmd = cmd % (self.fcorpusmallet() + '.infer', self.finferencer(), self.fdoctopics() + '.infer', iterations)
    logger.info("inferring with Mallet LDA with %s", cmd)
    # NOTE(review): the command is a shell string; paths containing spaces or
    # shell metacharacters are not escaped.
    retval = call(cmd, shell=True)
    if retval != 0:
        # do not go on to parse a stale or missing output file
        raise RuntimeError("MALLET failed with error %s on return" % retval)
    return list(read_doctopics(self.fdoctopics() + '.infer'))
def __getitem__(self, bow):
    """Project a document, or every document of a corpus, with the random projection.

    Parameters
    ----------
    bow : {list of (int, int), iterable of list of (int, int)}
        Input document or corpus.

    Returns
    -------
    list of (int, float)
        if `bow` is document OR
    :class:`~gensim.interfaces.TransformedCorpus`
        if `bow` is corpus.

    Examples
    ----------
    .. sourcecode:: pycon

        >>> from gensim.models import RpModel
        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import common_texts
        >>>
        >>> dictionary = Dictionary(common_texts)  # fit dictionary
        >>> corpus = [dictionary.doc2bow(text) for text in common_texts]  # convert texts to BoW format
        >>>
        >>> model = RpModel(corpus, id2word=dictionary)  # fit model
        >>>
        >>> # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ]
        >>> result = model[corpus[0]]

    """
    # a corpus is transformed lazily through _apply
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    if getattr(self, 'freshly_loaded', False):
        # Workaround for a numpy bug: a FORTRAN-order array unpickled from disk
        # segfaults when used; taking a fresh copy fixes the broken array.
        self.freshly_loaded = False
        self.projection = self.projection.copy('F')

    dense = matutils.sparse2full(bow, self.num_terms)
    column = dense.reshape(self.num_terms, 1) / np.sqrt(self.num_topics)
    column = np.asfortranarray(column, dtype=np.float32)
    projected = np.dot(self.projection, column)  # (k, d) * (d, 1) = (k, 1)

    # keep only finite, non-zero coordinates
    sparse_result = []
    for topicid, topicvalue in enumerate(projected.flat):
        if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0):
            sparse_result.append((topicid, float(topicvalue)))
    return sparse_result
def __getitem__(self, items):
    """Return random vector(s) of length `self.size`.

    :param items: a single document, or a corpus of documents
    :return: one `np.random.random(self.size)` array for a single document,
        or a list with one such array per corpus element
    """
    is_corpus, items = utils.is_corpus(items)
    if is_corpus:
        return [np.random.random(self.size) for _ in items]
    return np.random.random(self.size)
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of
    similarities of all query documents vs. all corpus document. This batch
    query is more efficient than computing the similarities one document
    after another.

    There are 4 distinct code paths, depending on whether `query` is a corpus
    (or numpy/scipy matrix) or a single document, and whether the result
    should be a full array (`self.num_best` is None) or only the `num_best`
    most similar documents, given as (document_index, similarity) pairs with
    indexes offset by the sizes of the preceding shards.

    The pool returned by `self.query_shards`, if any, is always terminated
    before this method returns or raises.
    """
    self.close_shard()  # no-op if no documents added to index since last query

    # reset num_best and normalize parameters, in case they were changed dynamically
    for shard in self.shards:
        shard.num_best = self.num_best
        shard.normalize = self.norm

    pool, shard_results = self.query_shards(query)
    try:
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a
            # single matrix (works for both corpus / single doc query).
            # hstack needs a real sequence, and shard_results may be lazy.
            return numpy.hstack(list(shard_results))

        # the following uses lazy evaluation and (optionally) parallel
        # processing, to improve query latency and minimize memory footprint.
        offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])

        def convert(doc, shard_no):
            # shift shard-local document indexes to global ones
            return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

        is_corpus, query = utils.is_corpus(query)
        is_corpus = is_corpus or (hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1)

        if not is_corpus:
            # user asked for num_best most similar and query is a single doc
            results = (
                convert(shard_result, shard_no)
                for shard_no, shard_result in enumerate(shard_results)
            )
            return heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])

        # the trickiest combination: returning num_best results when query was a corpus
        results = [
            [convert(doc, shard_no) for doc in shard_result]
            for shard_no, shard_result in enumerate(shard_results)
        ]
        return [
            heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
            for parts in zip(*results)
        ]
    finally:
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()
def __getitem__(self, query):
    """Get similarities of document `query` to all documents in the corpus.

    **or**

    If `query` is a corpus (iterable of documents), return a matrix of
    similarities of all query documents vs. all corpus document. This batch
    query is more efficient than computing the similarities one document
    after another.

    There are 4 distinct code paths, depending on whether `query` is a corpus
    (or numpy/scipy matrix) or a single document, and whether the result is a
    full array (`self.num_best` is None) or only the `num_best` most similar
    documents, given as (document_index, similarity) pairs with indexes
    offset by the sizes of the preceding shards.
    """
    self.close_shard()  # no-op if no documents added to index since last query

    # reset num_best and normalize parameters, in case they were changed dynamically
    for shard in self.shards:
        shard.num_best = self.num_best
        shard.normalize = self.normalize

    if self.num_best is None:
        # user asked for all documents => just stack the sub-results into a single matrix
        # (works for both corpus / single doc query). hstack requires a sequence,
        # so build a list rather than passing a generator.
        return numpy.hstack([shard[query] for shard in self.shards])

    offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])

    def convert(doc, shard_no):
        # shift shard-local document indexes to global ones
        return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

    is_corpus, query = utils.is_corpus(query)
    is_corpus = is_corpus or (hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1)

    if not is_corpus:
        # user asked for num_best most similar and query is a single doc
        results = (convert(shard[query], shard_no) for shard_no, shard in enumerate(self.shards))
        return heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])

    # the trickiest combination: returning num_best results when query was a corpus
    shard_results = [
        [convert(doc, shard_no) for doc in shard[query]]
        for shard_no, shard in enumerate(self.shards)
    ]
    # zip (unlike itertools.izip) exists on both Python 2 and 3
    return [
        heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
        for parts in zip(*shard_results)
    ]
def __getitem__(self, bow, iterations=100):
    """Infer topics for a document or corpus by running MALLET's ``infer-topics``.

    Parameters
    ----------
    bow : document or corpus
        A single document is wrapped into a one-element corpus.
    iterations : int, optional
        Value passed to MALLET as ``--num-iterations``.

    Returns
    -------
    object
        For a corpus, the list of everything yielded by
        `self.read_doctopics`; for a single document, the first element of
        that list.

    """
    is_corpus, corpus = utils.is_corpus(bow)
    # query is a single document => make a corpus out of it; otherwise use the
    # object handed back by is_corpus (presumably the rewound input when `bow`
    # was a one-shot iterator -- verify against utils.is_corpus).
    docs = corpus if is_corpus else [bow]

    self.convert_input(docs, infer=True)
    cmd = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
    cmd = cmd % (
        self.fcorpusmallet() + '.infer', self.finferencer(),
        self.fdoctopics() + '.infer', iterations, self.topic_threshold,
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    # NOTE(review): the command is a shell string; paths containing spaces or
    # shell metacharacters are not escaped.
    check_output(args=cmd, shell=True)
    result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
    return result if is_corpus else result[0]
def __getitem__(self, query):
    """Get similarities of a document or corpus `query` to all documents in the index.

    Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

    Notes
    -----
    Passing a corpus as `query` (instead of single documents) can be more
    efficient, because it is processed in batches.

    Parameters
    ----------
    query : {list of (int, int), iterable of list of (int, int)}
        Document or corpus in BoW format.

    Returns
    -------
    {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
        Similarities of the given document or corpus to the indexed corpus;
        the exact type depends on `query`, `self.num_best` and
        `maintain_sparsity`.

    """
    is_corpus, query = utils.is_corpus(query)

    # self.normalize only works if the input is a plain gensim vector/corpus (as
    # advertised in the doc). The input can also be a numpy or scipy.sparse
    # matrix, but in that case nothing is normalized.
    if self.normalize and not matutils.ismatrix(query):
        if is_corpus:
            query = [matutils.unitvec(v) for v in query]
        else:
            query = matutils.unitvec(query)

    result = self.get_similarities(query)
    if self.num_best is None:
        return result

    # if maintain_sparsity is True, result is scipy sparse: sort, clip the
    # topn and return as a scipy sparse matrix.
    if getattr(self, 'maintain_sparsity', False):
        return matutils.scipy2scipy_clipped(result, self.num_best)

    if not matutils.ismatrix(result):
        # single input document: return its top-n
        return matutils.full2sparse_clipped(result, self.num_best)

    # the query was a corpus (=more documents): top-n for each document in turn
    return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
def __getitem__(self, query):
    """Get similarities of the given document or corpus against this index.

    Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

    Notes
    -----
    Passing an entire corpus as `query` can be more efficient than passing its
    documents one after another, because it will issue queries in batches
    internally.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document in the sparse Gensim bag-of-words format, or a streamed
        corpus of such documents.

    Returns
    -------
    {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
        Similarities of the given document or corpus to the indexed corpus;
        the exact type depends on `query`, `self.num_best` and
        `maintain_sparsity`.

    """
    is_corpus, query = utils.is_corpus(query)

    # Normalization applies only to plain gensim vectors/corpora; numpy or
    # scipy.sparse matrix input is passed through untouched.
    needs_unitvec = self.normalize and not matutils.ismatrix(query)
    if needs_unitvec:
        if not is_corpus:
            query = matutils.unitvec(query)
        else:
            normalized_docs = []
            for vector in query:
                normalized_docs.append(matutils.unitvec(vector))
            query = normalized_docs

    result = self.get_similarities(query)
    top_n = self.num_best
    if top_n is None:
        return result

    if getattr(self, 'maintain_sparsity', False):
        # result is scipy sparse: sort, clip the topn, keep it sparse
        return matutils.scipy2scipy_clipped(result, self.num_best)

    if matutils.ismatrix(result):
        # corpus query: clip each document's similarities in turn
        clipped = []
        for row in result:
            clipped.append(matutils.full2sparse_clipped(row, self.num_best))
        return clipped

    # single document query
    return matutils.full2sparse_clipped(result, self.num_best)
def get_similarities(self, query):
    """Get similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
        Document or collection of documents. A :class:`numpy.ndarray` is also
        accepted; a 1-d array is treated as one document. The caller's array
        is not modified.

    Return
    ------
    :class:`numpy.ndarray`
        1-d array for a single-document query, otherwise a 2-d matrix
        (#queries x #index) if `maintain_sparsity` is false **OR**
    scipy sparse matrix
        the transposed sparse product, if `maintain_sparsity` is set.

    """
    is_corpus, query = utils.is_corpus(query)
    if is_corpus:
        query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
    elif scipy.sparse.issparse(query):
        query = query.T  # convert documents=rows to documents=columns
    elif isinstance(query, numpy.ndarray):
        if query.ndim == 1:
            # reshape into a new (1 x len) array object; assigning to
            # `query.shape` would change the caller's array in place
            query = query.reshape(1, len(query))
        query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
    else:
        # default case: query is a single vector, in sparse gensim format
        query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

    # compute cosine similarity against every other document in the collection
    result = self.index * query.tocsc()  # N x T * T x C = N x C

    if result.shape[1] == 1 and not is_corpus:
        # for queries of one document, return a 1d array
        result = result.toarray().flatten()
    elif self.maintain_sparsity:
        # avoid converting to dense array if maintaining sparsity
        result = result.T
    else:
        # otherwise, return a 2d matrix (#queries x #index)
        result = result.toarray().T
    return result
def __getitem__(self, bow):
    """Return RP representation of the input vector and/or corpus.

    A corpus is delegated to `self._apply`. A single document is densified to
    `self.num_terms` entries, divided by ``sqrt(self.num_topics)`` and
    multiplied by `self.projection`; finite, non-zero coordinates are returned
    as (topic_id, value) pairs.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
    vec = numpy.asfortranarray(vec, dtype=numpy.float32)
    # plain matrix-vector product; replaces scipy.linalg.fblas.sgemv(1.0, A, x),
    # which is no longer available in SciPy
    topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
    return [
        (topicid, float(topicvalue))
        for topicid, topicvalue in enumerate(topic_dist.flat)
        if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)
    ]
def test_invalid_formats(self):
    """Inputs that are not made of (int, float) 2-tuples must not count as a corpus."""
    non_corpora = [
        ["human"],
        "human",
        ["human", "star"],
        [1, 2, 3, 4, 5, 5],
        [[(0, 'string')]],
    ]
    for noCorpus in non_corpora:
        # is_corpus must answer False and hand the object back
        self.assertEqual((False, noCorpus), utils.is_corpus(noCorpus))
def __getitem__(self, bow):
    """Return log entropy representation of the input vector and/or corpus.

    A corpus is delegated to `self._apply`. For a single document each known
    term gets the weight ``log(tf + 1) * self.entr[term_id]``; the vector is
    passed through `matutils.unitvec` when `self.normalize` is truthy.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms are skipped, i.e. given zero weight (NOT infinity/huge)
    vector = []
    for term_id, tf in bow:
        if term_id in self.entr:
            vector.append((term_id, math.log(tf + 1) * self.entr.get(term_id)))

    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
def __getitem__(self, bow, iterations=100):
    """Infer topics for a document or corpus by running MALLET's ``infer-topics``.

    Parameters
    ----------
    bow : document or corpus
        A single document is wrapped into a one-element corpus.
    iterations : int, optional
        Value passed to MALLET as ``--num-iterations``.

    Returns
    -------
    object
        For a corpus, the list of everything yielded by ``read_doctopics``;
        for a single document, the first element of that list.

    Raises
    ------
    RuntimeError
        If the MALLET command exits with a non-zero status.

    """
    is_corpus, corpus = utils.is_corpus(bow)
    # query is a single document => make a corpus out of it; otherwise use the
    # object handed back by is_corpus (presumably the rewound input when `bow`
    # was a one-shot iterator -- verify against utils.is_corpus).
    docs = corpus if is_corpus else [bow]

    self.convert_input(docs, infer=True)
    cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %f"
    # 1.0 / n keeps the threshold a true fraction; 1 / n is 0 under Python 2
    # integer division for any n > 1
    cmd = cmd % (
        self.fcorpusmallet(True) + '.infer', self.finferencer(True),
        self.fdoctopics(True) + '.infer', iterations, 1.0 / self.num_topics,
    )
    logger.info("inferring topics with MALLET LDA '%s'", cmd)
    # NOTE(review): the command is a shell string; paths containing spaces or
    # shell metacharacters are not escaped.
    retval = call(cmd, shell=True)
    if retval != 0:
        raise RuntimeError("MALLET failed with error %s on return" % retval)
    result = list(gensim.models.wrappers.ldamallet.read_doctopics(self.fdoctopics(True) + '.infer'))
    return result if is_corpus else result[0]
def __getitem__(self, bow, eps=1e-12):
    """Get tf-idf representation of the input vector and/or corpus.

    Parameters
    ----------
    bow : {list of (int, int), iterable of iterable of (int, int)}
        Input document or corpus in BoW format.
    eps : float
        Threshold value; positions whose idf or final tf-idf weight is not
        greater than `eps` in absolute value are removed.

    Returns
    -------
    vector : list of (int, float)
        TfIdf vector, if `bow` is document **OR**
    :class:`~gensim.interfaces.TransformedCorpus`
        TfIdf corpus, if `bow` is corpus.

    Notes
    -----
    A boolean `self.normalize` is replaced on the instance by the matching
    callable (`matutils.unitvec` for True, `utils.identity` for False) the
    first time a single document is transformed.

    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    term_ids = []
    raw_tfs = []
    for termid, tf in bow:
        term_ids.append(termid)
        raw_tfs.append(tf)

    local_weights = self.wlocal(np.array(raw_tfs))

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = []
    for termid, tf in zip(term_ids, local_weights):
        if abs(self.idfs.get(termid, 0.0)) > eps:
            vector.append((termid, tf * self.idfs.get(termid)))

    if self.normalize is True:
        self.normalize = matutils.unitvec
    elif self.normalize is False:
        self.normalize = utils.identity

    # and finally, normalize the vector either to unit length, or use a
    # user-defined normalization function
    vector = self.normalize(vector)

    # make sure there are no explicit zeroes in the vector (must be sparse)
    return [(termid, weight) for termid, weight in vector if abs(weight) > eps]
def __getitem__(self, bow, eps=0.01):
    """Return the topic distribution of document `bow`.

    The result is a list of (topic_id, topic_probability) 2-tuples; topics
    with probability below `eps` are left out. A corpus is delegated to
    `self._apply`.
    """
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(corpus)

    gamma, _ = self.inference([bow])
    doc_gamma = gamma[0]
    topic_dist = doc_gamma / sum(doc_gamma)  # normalize to proper distribution

    significant = []
    for topicid, topicvalue in enumerate(topic_dist):
        if topicvalue >= eps:  # ignore document's topics that have prob < eps
            significant.append((topicid, topicvalue))
    return significant
def __getitem__(self, bow):
    """Return tf-idf representation of the input vector and/or corpus.

    A corpus is delegated to `self._apply`. For a single document each term
    frequency is multiplied by the term's entry in `self.idfs`; terms whose
    idf is missing or zero are dropped. The vector is passed through
    `matutils.unitvec` when `self.normalize` is truthy.
    """
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(bow)

    # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
    # as strict application of the IDF formula would dictate)
    vector = []
    for termid, tf in bow:
        idf = self.idfs.get(termid, 0.0)
        if idf != 0.0:
            vector.append((termid, tf * idf))

    if self.normalize:
        vector = matutils.unitvec(vector)
    return vector
def __getitem__(self, bow, eps=0.01):
    """Return topics whose predicted value exceeds `eps`, for a document or corpus.

    Parameters
    ----------
    bow : document or corpus
        A single document is wrapped into a one-element corpus.
    eps : float, optional
        Only (topic_id, value) pairs with ``value > eps`` are kept.

    Returns
    -------
    list
        For a corpus, one list of (topic_id, value) pairs per row of
        ``self._predict(...)[0]``; for a single document, just the first such
        list.

    """
    is_corpus, corpus = utils.is_corpus(bow)
    # Use the object handed back by is_corpus: presumably it is the rewound
    # input when `bow` was a one-shot iterator -- verify against utils.is_corpus.
    docs = corpus if is_corpus else [bow]

    predictions = self._predict(docs)[0]
    topics = [
        [(topic_id, val) for topic_id, val in enumerate(row) if val > eps]
        for row in predictions
    ]
    return topics if is_corpus else topics[0]
def get_document_topics(self, bow, minimum_probability=None, normalize=None):
    """Get the topic distribution for the given document.

    Parameters
    ----------
    bow : list of (int, float)
        The document in BOW format.
    minimum_probability : float
        If `normalize` is True, topics with smaller probabilities are filtered out.
        If `normalize` is False, topics with smaller factors are filtered out.
        If set to None, `self.minimum_probability` is used. The effective
        value is never below 1e-8, to prevent 0s.
    normalize: bool or None, optional
        Whether to normalize the result; None falls back to `self.normalize`.
        Allows for estimation of perplexity, coherence, e.t.c.

    Returns
    -------
    list of (int, float)
        Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
        the probability that was assigned to it. When `bow` is a corpus, the
        result of `self._apply` is returned instead.

    """
    if minimum_probability is None:
        minimum_probability = self.minimum_probability
    minimum_probability = max(minimum_probability, 1e-8)

    # if the input vector is a corpus, return a transformed corpus
    is_corpus, corpus = utils.is_corpus(bow)
    if is_corpus:
        return self._apply(corpus, minimum_probability=minimum_probability)

    v = matutils.corpus2csc([bow], self.num_tokens)
    h = self._solveproj(v, self._W, v_max=np.inf)

    if normalize is None:
        normalize = self.normalize
    if normalize:
        the_sum = h.sum()
        if the_sum:  # leave an all-zero result untouched
            h /= the_sum

    doc_topics = []
    for idx, proba in enumerate(h[:, 0]):
        if not minimum_probability or proba > minimum_probability:
            doc_topics.append((idx, proba))
    return doc_topics
def get_similarities(self, query):
    """Get similarity between `query` and this index.

    Warnings
    --------
    Do not use this function directly; use the `self[query]` syntax instead.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document or collection of documents. A :class:`numpy.ndarray` is
        interpreted as indexes into `self.corpus`.

    Return
    ------
    :class:`numpy.ndarray`
        Similarity matrix (one row per query document) for a corpus query;
        a single row of similarities otherwise.

    """
    is_corpus, query = utils.is_corpus(query)
    if not is_corpus:
        if isinstance(query, numpy.ndarray):
            # Convert document indexes to actual documents.
            query = [self.corpus[i] for i in query]
        else:
            query = [query]

    # One array of similarities (against every corpus document) per query document.
    rows = [
        numpy.array([
            matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
            for corpus_document in self.corpus
        ])
        for query_document in query
    ]

    # NOTE(review): for an ndarray of several indexes only the first row is
    # returned here -- confirm whether that is intended.
    return numpy.array(rows) if is_corpus else rows[0]