def __getitem__(self, query):
    """Compute similarities between `query` and the documents in this index.

    The actual computation is delegated to
    :meth:`~gensim.interfaces.SimilarityABC.get_similarities`.

    Notes
    -----
    Querying with a whole corpus at once can be more efficient than querying
    with its documents one by one, because queries are then issued in batches
    internally.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        A single document in the sparse Gensim bag-of-words format, or a
        streamed corpus of such documents.

    Returns
    -------
    {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
        Similarities of the query against the index; the concrete type
        depends on `query`.

    """
    is_corpus, query = utils.is_corpus(query)

    # Normalization is applied only to plain gensim vectors/corpora (as
    # advertised in the docs). A numpy or scipy.sparse matrix is passed through
    # untouched: in that case `self.normalize` has no effect.
    if self.normalize and not matutils.ismatrix(query):
        if is_corpus:
            query = [matutils.unitvec(doc) for doc in query]
        else:
            query = matutils.unitvec(query)

    similarities = self.get_similarities(query)

    topn = self.num_best
    if topn is None:
        # No clipping requested: hand back the raw similarities.
        return similarities

    if getattr(self, 'maintain_sparsity', False):
        # The result is scipy sparse here: sort, clip to the top-n and keep it
        # as a scipy sparse matrix.
        return matutils.scipy2scipy_clipped(similarities, topn)

    if matutils.ismatrix(similarities):
        # The query was a corpus (several documents): clip each row in turn.
        return [matutils.full2sparse_clipped(row, topn) for row in similarities]

    # Single query document: clip its similarity vector.
    return matutils.full2sparse_clipped(similarities, topn)
def __getitem__(self, query):
    """Return the similarities of `query` to all documents in the index.

    Relies on :meth:`~gensim.interfaces.SimilarityABC.get_similarities`.

    Notes
    -----
    Passing a corpus as `query` (instead of one document at a time) can be
    more efficient, because the documents are then processed in batches.

    Parameters
    ----------
    query : {list of (int, int), iterable of list of (int, int)}
        Document or corpus in BoW format.

    Returns
    -------
    {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
        Similarities between the given document or corpus and the indexed
        corpus; the type depends on `query`.

    """
    is_corpus, query = utils.is_corpus(query)

    # self.normalize only works for plain gensim vectors/corpora (as advertised
    # in the docs). Matrix input (numpy or scipy.sparse) is left as it is, so
    # self.normalize has no effect on it.
    should_normalize = self.normalize and not matutils.ismatrix(query)
    if should_normalize:
        query = [matutils.unitvec(v) for v in query] if is_corpus else matutils.unitvec(query)

    result = self.get_similarities(query)

    if self.num_best is not None:
        if getattr(self, 'maintain_sparsity', False):
            # result is scipy sparse: sort, clip the topn and return it as a
            # scipy sparse matrix.
            return matutils.scipy2scipy_clipped(result, self.num_best)

        if not matutils.ismatrix(result):
            # single input document: return its top-n
            return matutils.full2sparse_clipped(result, self.num_best)

        # the query was a corpus (=more documents): compute the top-n most
        # similar for each document in turn
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]

    return result
def __getitem__(self, query):
    """Look up the similarities of a document or corpus against this index.

    Internally calls :meth:`~gensim.interfaces.SimilarityABC.get_similarities`.

    Notes
    -----
    Handing over an entire corpus as `query` can be more efficient than
    handing over its documents one after another, since the queries are then
    issued in batches internally.

    Parameters
    ----------
    query : {list of (int, number), iterable of list of (int, number)}
        Document in the sparse Gensim bag-of-words format, or a streamed
        corpus of such documents.

    Returns
    -------
    {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
        Similarities of the given document or corpus to the indexed corpus;
        which one is returned depends on `query`.

    """
    query_is_corpus, query = utils.is_corpus(query)

    if self.normalize:
        # Only plain gensim vectors/corpora are normalized (as advertised in
        # the docs). numpy / scipy.sparse matrices are accepted too, but are
        # deliberately left alone: self.normalize has no effect on them.
        if not matutils.ismatrix(query):
            if query_is_corpus:
                query = [matutils.unitvec(vec) for vec in query]
            else:
                query = matutils.unitvec(query)

    sims = self.get_similarities(query)

    if self.num_best is None:
        return sims

    # With maintain_sparsity set, sims is scipy sparse: sort, clip to the
    # top-n and return a scipy sparse matrix.
    keep_sparse = getattr(self, 'maintain_sparsity', False)
    if keep_sparse:
        return matutils.scipy2scipy_clipped(sims, self.num_best)

    # A matrix result means the query was a corpus (several documents), so the
    # top-n is computed per document; otherwise there is a single vector.
    if matutils.ismatrix(sims):
        clipped = [matutils.full2sparse_clipped(vec, self.num_best) for vec in sims]
    else:
        clipped = matutils.full2sparse_clipped(sims, self.num_best)
    return clipped
def __getitem__(self, query):
    """Get the similarities of document/corpus `query` to every indexed document.

    Built on top of :meth:`~gensim.interfaces.SimilarityABC.get_similarities`.

    Notes
    -----
    Supplying a corpus as `query` (rather than a single document) can be more
    efficient, because it is processed in batches.

    Parameters
    ----------
    query : {list of (int, int), iterable of list of (int, int)}
        Document or corpus in BoW format.

    Returns
    -------
    {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
        Similarities between the given document or corpus and the indexed
        corpus; the type depends on `query`.

    """
    is_corpus, query = utils.is_corpus(query)

    # Unit-length normalization is done only for plain gensim input (as
    # advertised in the docs); a numpy or scipy.sparse matrix is assumed to be
    # prepared by the caller and is not normalized.
    if self.normalize and not matutils.ismatrix(query):
        if not is_corpus:
            query = matutils.unitvec(query)
        else:
            query = [matutils.unitvec(item) for item in query]

    scores = self.get_similarities(query)

    if self.num_best is None:
        return scores

    if getattr(self, 'maintain_sparsity', False):
        # scores is scipy sparse: sort, clip the topn, return scipy sparse.
        return matutils.scipy2scipy_clipped(scores, self.num_best)

    if not matutils.ismatrix(scores):
        # One input document: return its top-n directly.
        return matutils.full2sparse_clipped(scores, self.num_best)

    # Corpus query (=more documents): top-n most similar for each document.
    per_document = []
    for item in scores:
        per_document.append(matutils.full2sparse_clipped(item, self.num_best))
    return per_document
def __getitem__(self, query):
    """Get similarities of `query` to all documents in the corpus.

    If `query` is a corpus (iterable of documents), the similarities of all
    query documents vs. all corpus documents are computed in one go. Using
    this type of batch query is more efficient than computing the
    similarities one document after another.

    Parameters
    ----------
    query
        A single document or a corpus of documents. A numpy or scipy.sparse
        matrix is accepted as well, but is never normalized here and must
        already come normalized if that is desired.

    Returns
    -------
    object
        The unmodified result of `self.get_similarities(query)` when
        `self.num_best` is None. Otherwise the result clipped to the
        `self.num_best` best entries: a scipy sparse matrix if
        `self.maintain_sparsity` is set, a list of clipped vectors (one per
        query document) if the result is a matrix, or a single clipped vector
        for a single query document.

    """
    is_corpus, query = utils.is_corpus(query)

    # self.normalize only works if the input is a plain gensim vector/corpus
    # (as advertised in the doc). In fact, input can be a numpy or scipy.sparse
    # matrix as well, but in that case assume tricks are happening and don't
    # normalize anything (self.normalize has no effect).
    if self.normalize and not matutils.ismatrix(query):
        if is_corpus:
            query = [matutils.unitvec(v) for v in query]
        else:
            query = matutils.unitvec(query)

    result = self.get_similarities(query)

    if self.num_best is None:
        return result

    # If maintain_sparsity is True, result is scipy sparse. Sort, clip the
    # topn and return as a scipy sparse matrix.
    if getattr(self, 'maintain_sparsity', False):
        return matutils.scipy2scipy_clipped(result, self.num_best)

    # If the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn.
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]

    # Otherwise, return top-n of the single input document.
    return matutils.full2sparse_clipped(result, self.num_best)
def __getitem__(self, query):
    """Get similarities of `query` to all documents in the corpus.

    If `query` is a corpus (iterable of documents), the similarities of all
    query documents vs. all corpus documents are computed in one go. Using
    this type of batch query is more efficient than computing the
    similarities one document after another.

    Parameters
    ----------
    query
        A single document or a corpus of documents. A numpy or scipy.sparse
        matrix is accepted as well, but is never normalized here and must
        already come normalized if that is desired.

    Returns
    -------
    object
        The unmodified result of `self.get_similarities(query)` when
        `self.num_best` is None. Otherwise the result clipped to the
        `self.num_best` best entries: a scipy sparse matrix if
        `self.maintain_sparsity` is set, a list of clipped vectors (one per
        query document) if the result is a matrix, or a single clipped vector
        for a single query document.

    """
    is_corpus, query = utils.is_corpus(query)

    # Only plain gensim vectors/corpora are normalized (as advertised in the
    # doc). Matrix input (numpy or scipy.sparse) is passed through unchanged,
    # i.e. self.normalize has no effect on it. The previous no-op branch with
    # an unused `import warnings` was dropped.
    needs_normalization = self.normalize and not matutils.ismatrix(query)
    if needs_normalization:
        if is_corpus:
            query = [matutils.unitvec(v) for v in query]
        else:
            query = matutils.unitvec(query)

    result = self.get_similarities(query)

    if self.num_best is None:
        return result

    # If maintain_sparsity is True, result is scipy sparse. Sort, clip the
    # topn and return as a scipy sparse matrix.
    if getattr(self, 'maintain_sparsity', False):
        return matutils.scipy2scipy_clipped(result, self.num_best)

    # If the input query was a corpus (=more documents), compute the top-n
    # most similar for each document in turn.
    if matutils.ismatrix(result):
        return [matutils.full2sparse_clipped(v, self.num_best) for v in result]

    # Otherwise, return top-n of the single input document.
    return matutils.full2sparse_clipped(result, self.num_best)
def get_top_terms(vec, dict, n):
    """Return the top `n` entries of greatest magnitude (abs) of a doc vector.

    Parameters
    ----------
    vec
        Document vector, either in Gensim BoW format (sequence of
        ``(term_id, weight)`` pairs) or as a dense numpy array.
    dict
        Mapping from term id to term, indexed as ``dict[term_id]``.
        NOTE: the name shadows the builtin; it is kept so that existing
        keyword callers continue to work.
    n : int
        Maximum number of entries to return.

    Returns
    -------
    list of ((term, term_id), weight)
        At most `n` entries, each pairing the looked-up term and its id with
        the corresponding weight.

    """
    if is_bow(vec):
        # Rank by absolute value, as the docstring promises; sorting by the
        # signed weight would silently drop strongly negative entries.
        top = sorted(vec, key=lambda pair: abs(pair[1]), reverse=True)[:n]
    else:
        # Dense input: top-n selection is delegated to matutils.
        top = matutils.full2sparse_clipped(vec, n)
    return [((dict[term_id], term_id), weight) for term_id, weight in top]
def test_full2sparse_clipped(self):
    """Check `matutils.full2sparse_clipped` against a small dense vector.

    With ``topn=3`` the expected output keeps the three largest-magnitude
    entries of `vec` as ``(index, value)`` pairs.
    """
    vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15]
    expected = [(0, 0.8), (1, 0.2), (5, -0.15)]
    # assertTrue(value, msg) only checks truthiness and would treat `expected`
    # as the failure message, so the comparison must be an explicit equality.
    self.assertEqual(matutils.full2sparse_clipped(vec, topn=3), expected)