예제 #1
파일: test_utils.py 프로젝트: leahic/gensim
    def test_simple_lists_of_tuples(self):
        """Check that `utils.is_corpus` recognises lists of documents built from 2-tuples.

        Every candidate below must come back as `(True, candidate)`.
        """
        candidates = [
            # one document, one word
            [[(0, 4.)]],
            # one document, several words
            [[(0, 4.), (1, 2.)]],
            [[(0, 4.), (1, 2.), (2, 5.), (3, 8.)]],
            # several documents, one word
            [[(0, 4.)], [(1, 2.)]],
            [[(0, 4.)], [(1, 2.)], [(2, 5.)], [(3, 8.)]],
        ]
        for candidate in candidates:
            outcome = utils.is_corpus(candidate)
            self.assertEqual((True, candidate), outcome)
예제 #2
파일: esamodel.py 프로젝트: eric011/nyan
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.

        bow should already be weights, e.g. with TF-IDF.

        A corpus input is delegated to `self._apply`. A single vector is densified,
        multiplied with `self.corpus`, passed through `matutils.unitvec` and returned
        as a list of `(concept_id, weight)` pairs whose absolute weight exceeds `eps`.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # use corpus as interpreter matrix:
        # simply multiply feature vector of input with corpus matrix
        # to get the weight of the concept
        vector = numpy.dot(matutils.sparse2full(bow, self.num_features), self.corpus)

        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        return [
            (concept_id, weight)
            for concept_id, weight in enumerate(vector)
            if abs(weight) > eps
        ]
예제 #3
    def __getitem__(self, bow):
        """Get log entropy representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : list of (int, int)
            Document in BoW format.

        Returns
        -------
        list of (int, float)
            Log-entropy vector for passed `bow`. When `bow` is a corpus, the result of
            `self._apply(bow)` is returned instead.

        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge):
        # they are simply left out of the sparse result
        vector = [
            (term_id, math.log(tf + 1) * self.entr.get(term_id))
            for term_id, tf in bow
            if term_id in self.entr
        ]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
예제 #4
파일: ldamodel.py 프로젝트: krishna11888/ai
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the Umass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**

        `corpus` is iterated as documents whose items carry the word id in position 0;
        `num_words` is the number of top words per topic that enter the score.

        Returns a list of `(topic, coherence)` pairs sorted by descending coherence, where
        `topic` is a list of `(weight, word)` tuples for that topic's top words.
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            # NOTE(review): only a warning is emitted; the computation below still runs.
            logger.warning("LdaModel.top_topics() called with an empty corpus")

        topics = []  # per topic: ids of its top words, best first
        str_topics = []  # per topic: (weight, word) pairs in the same order
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            str_topics.append([(topic[word_id], self.id2word[word_id]) for word_id in bestn])

        # top_ids are limited to every topics top words. should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # for each top word, collect the numbers of the documents it occurs in;
        # the corpus is walked once and each document's id set is built once
        doc_word_list = dict((word_id, set()) for word_id in top_ids)
        for doc_no, document in enumerate(corpus):
            doc_ids = frozenset(x[0] for x in document)
            for word_id in top_ids:
                if word_id in doc_ids:
                    doc_word_list[word_id].add(doc_no)

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m_rank, m in enumerate(top_words[1:], start=1):
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]

                # Sum of top words l=1..m-1, i.e. all words ranked higher than the
                # current word m. The slice bound is m's rank, not its word id.
                for higher in top_words[:m_rank]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[higher]

                    # make sure this word appears in some documents.
                    if l_docs:
                        # co_doc_frequency is D(v_m^(t), v_l^(t))
                        co_doc_frequency = len(m_docs.intersection(l_docs))

                        # add to the coherence sum for these two words m, l
                        coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        return sorted(coherence_scores, key=lambda scored: scored[1], reverse=True)
예제 #5
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix. An empty array is returned when `self.corpus` is empty.

        """
        if not self.corpus:
            # `numpy.array()` without an argument raises TypeError, so build an
            # explicitly empty array.
            return numpy.array([])

        is_corpus, query = utils.is_corpus(query)
        if not is_corpus and isinstance(query, numpy.ndarray):
            query = [self.corpus[i] for i in query]  # convert document indexes to actual documents
        result = self.similarity_matrix.inner_product(query, self.corpus, normalized=True)

        if scipy.sparse.issparse(result):
            return numpy.asarray(result.todense())
        if numpy.isscalar(result):
            return numpy.array(result)
        return numpy.asarray(result)[0]
    def __getitem__(self, bow, eps=1e-12):
        """
        Return esa representation of the input vector and/or corpus.

        bow should already be weights, e.g. with TF-IDF.

        A corpus input is delegated to `self._apply`. For a single vector the result is a
        list of `(concept_id, weight)` pairs whose absolute weight exceeds `eps`.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        # as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # use similarity index to calculate similarity with each vector of corpus
        vector = self.similarity_index[bow]

        # cosine similarity is in [-1, 1]; shift and scale to make it [0, 1]
        # (done in place on the object returned by the index)
        vector += 1
        vector /= 2

        vector = matutils.unitvec(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        return [
            (concept_id, weight)
            for concept_id, weight in enumerate(vector)
            if abs(weight) > eps
        ]
예제 #7
    def _getbow(self, doc):
        """Turn `doc` into bag-of-words form.

        A corpus is transformed through `self._apply` and wrapped in `SimpleCorpus`;
        anything else goes through `self.dict.doc2bow` with `allow_update=True`.
        """
        doc_is_corpus, doc = utils.is_corpus(doc)
        if not doc_is_corpus:
            return self.dict.doc2bow(doc, allow_update=True)

        # doc is an iterable of documents: apply to all of them
        return SimpleCorpus(self._apply(doc))
예제 #8
    def __getitem__(self, bow, iterations=100):
        """Get vector for document(s).

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document (or corpus) in BoW format.
        iterations : int, optional
            Number of iterations that will be used for inferring.

        Returns
        -------
        list of (int, float)
            LDA vector for document as sequence of (topic_id, topic_probability) **OR**
        list of list of (int, float)
            LDA vectors for corpus in same format.

        """
        is_corpus, _ = utils.is_corpus(bow)
        if not is_corpus:
            # query is a single document => make a corpus out of it
            bow = [bow]

        self.convert_input(bow, infer=True)
        cmd = \
            self.mallet_path + ' infer-topics --input %s --inferencer %s ' \
                               '--output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        cmd = cmd % (
            self.fcorpusmallet() + '.infer', self.finferencer(),
            self.fdoctopics() + '.infer', iterations, self.topic_threshold
        )
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        # NOTE(review): the command is a formatted string run with shell=True, so the
        # paths interpolated above must not come from untrusted input.
        check_output(args=cmd, shell=True)
        result = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        # a single-document query gets its one vector back rather than a one-item list
        return result if is_corpus else result[0]
예제 #9
파일: ldamodel.py 프로젝트: JKamlah/gensim
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `minimum_probability`).

        If per_word_topics is True, it also returns a list of topics, sorted in descending order of most likely topics for that word.
        It also returns a list of word_ids and each words corresponding topics' phi_values, multiplied by feature length (i.e, word count)

        If `bow` is a corpus, the call is forwarded to `self._apply` with the same thresholds.
        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        if minimum_phi_value is None:
            # NOTE(review): falls back to `self.minimum_probability`, not a phi-specific attribute
            minimum_phi_value = self.minimum_probability
        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            kwargs = dict(
                per_word_topics=per_word_topics,
                minimum_probability=minimum_probability,
                minimum_phi_value=minimum_phi_value
            )
            return self._apply(corpus, **kwargs)

        gamma, phis = self.inference([bow], collect_sstats=per_word_topics)
        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

        document_topics = [
            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
            if topicvalue >= minimum_probability
        ]

        if not per_word_topics:
            return document_topics

        word_topic = []  # contains word and corresponding topic
        word_phi = []  # contains word and phi values
        for word_type, weight in bow:
            phi_values = []  # contains (phi_value, topic) pairing to later be sorted
            phi_topic = []  # contains topic and corresponding phi value to be returned 'raw' to user
            for topic_id in range(self.num_topics):
                phi = phis[topic_id][word_type]
                if phi >= minimum_phi_value:
                    # appends phi values for each topic for that word
                    # these phi values are scaled by feature length
                    phi_values.append((phi, topic_id))
                    phi_topic.append((topic_id, phi))

            # list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]).
            word_phi.append((word_type, phi_topic))
            # sorts the topics based on most likely topic
            # returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
            sorted_phi_values = sorted(phi_values, reverse=True)
            topics_sorted = [x[1] for x in sorted_phi_values]
            word_topic.append((word_type, topics_sorted))
        return (document_topics, word_topic, word_phi)  # returns 3-tuple
예제 #10
    def __getitem__(self, bow, eps=0.01):
        """Convert document or corpus in BoW format to LDA vectors in BoW format

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.
        eps : float
            Threshold value (only topics with probability greater than `eps` are kept).

        Returns
        -------
        list of (int, float)
            LDA vector for document **OR**
        list of list of (int, float)
            LDA vectors for corpus.

        """
        is_corpus, dummy_corpus = utils.is_corpus(bow)
        if not is_corpus:
            # wrap a single document so that it can be predicted like a corpus
            bow = [bow]

        predictions = self._predict(bow)[0]

        # one sparse (topic_id, value) list per prediction row
        topics = [
            [(topic_id, val) for topic_id, val in enumerate(row) if val > eps]
            for row in predictions
        ]

        return topics if is_corpus else topics[0]
예제 #11
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, the result is additionally multiplied by the inverse of
        `self.projection.s`. A corpus input with a truthy `chunksize` is delegated to
        `self._apply`.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)

        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist.flat)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
예제 #12
파일: docsim.py 프로젝트: Dieterbe/gensim
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**
        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    # NOTE: reshapes the caller's array in place into a single row
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
예제 #13
    def __getitem__(self, bow, chunksize=10000):
        """Compute hidden representations for a document or a corpus.

        A single document is wrapped into a one-item list and its vector is returned
        directly; for a corpus a generator yielding one vector per document is returned.
        Documents are processed `chunksize` at a time (a falsy `chunksize` means 1).
        """
        is_corpus, bow = utils.is_corpus(bow)
        if not is_corpus:
            bow = [bow]

        # NOTE(review): `len(bow)` requires a sized input — confirm streamed corpora are not passed here
        ln.info("Computing hidden representation for %s documents..." % len(bow))

        if not chunksize:  # todo I think could be removed altogether
            chunksize = 1

        def transformed_corpus():
            """Yield the hidden vector of every document, computing them chunk by chunk."""
            for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
                ln.debug("Converting chunk %s to csc format.." % chunk_no)
                chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
                ln.debug("Computing hidden representation for chunk.. ")
                hidden = self._get_hidden_representations(chunk)
                ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                        (chunk_no, chunk_no * chunksize + len(doc_chunk)))
                for column in hidden.T:
                    yield matutils.dense2vec(column.T)
                ln.debug("Done yielding chunk %s" % chunk_no)

            ln.info("Finished computing representations for all chunks.")

        if not is_corpus:
            # exhaust the generator (so that the closing log lines run) and return the only vector
            res = list(transformed_corpus()).pop()
            return res
        else:
            return transformed_corpus()
예제 #14
    def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
        """Run `current_representation` through `layer` chunk by chunk and save each result with `np.save`.

        A gensim-style corpus is grouped into chunks of `chunksize` documents and converted to
        CSC first (this needs `num_terms`); any other input is iterated as-is, each item being
        treated as one chunk. Files are named `<filename_prefix>_<chunk number zero-padded to 15 digits>`.
        """
        is_corpus, current_representation = utils.is_corpus(current_representation)
        if is_corpus:
            for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
                ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
                assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
                chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

                ln.debug("Chunk converted to csc, running through layer..")
                chunk_trans = layer.__getitem__(chunk_as_csc)

                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, str(chunk_no).zfill(15))
                np.save(fname, chunk_trans)
                ln.debug("Finished serializing chunk. Processed %s documents so far." %
                         (chunk_no * chunksize + len(chunk)))
        else:
            ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
            ln.debug("Type of current_representation is %s" % type(current_representation))
            for chunk_no, chunk in enumerate(current_representation):
                ln.debug("converting chunk (%s documents)..." % chunksize)
                chunk_trans = layer.__getitem__(chunk)
                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, str(chunk_no).zfill(15))
                np.save(fname, chunk_trans)
                ln.debug("finished serializing chunk.")

        ln.info("Finished serializing all chunks.")
예제 #15
파일: docsim.py 프로젝트: leahic/gensim
    def get_similarities(self, query):
        """
        Return similarity of sparse vector `query` to all documents in the corpus,
        as a numpy array.

        If `query` is a collection of documents, return a 2D array of similarities
        of each document in `query` to all documents in the corpus (=batch query,
        faster than processing each document in turn).

        **Do not use this function directly; use the self[query] syntax instead.**
        """
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = numpy.asarray(
                [matutils.sparse2full(vec, self.num_features) for vec in query],
                dtype=self.index.dtype
            )
        else:
            if scipy.sparse.issparse(query):
                query = query.toarray()  # convert sparse to dense
            elif not isinstance(query, numpy.ndarray):
                # default case: query is a single vector in sparse gensim format
                # (a dense numpy array is used as it is)
                query = matutils.sparse2full(query, self.num_features)
            query = numpy.asarray(query, dtype=self.index.dtype)

        # do a little transposition dance to stop numpy from making a copy of
        # self.index internally in numpy.dot (very slow).
        result = numpy.dot(self.index, query.T).T  # return #queries x #index
        return result  # XXX: removed casting the result from array to list; does anyone care?
예제 #16
파일: tfidfmodel.py 프로젝트: nAk123/gensim
    def __getitem__(self, bow, eps=1e-12):
        """
        Return tf-idf representation of the input vector and/or corpus.

        A corpus input is delegated to `self._apply`. For a single document the result is a
        list of `(termid, weight)` pairs whose absolute weight exceeds `eps`.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = [
            (termid, self.wlocal(tf) * self.idfs.get(termid))
            for termid, tf in bow
            if self.idfs.get(termid, 0.0) != 0.0
        ]

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.normalize is True:
            vector = matutils.unitvec(vector)
        elif self.normalize:
            vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
        return vector
예제 #17
파일: docsim.py 프로젝트: Dieterbe/gensim
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus document. This batch query is more
        efficient than computing the similarities one document after another.

        When `self.num_best` is set, only the `num_best` highest-scoring items are
        returned (per query document for a corpus query).
        """
        self.close_shard()  # no-op if no documents added to index since last query

        # query every shard with this index's current settings and collect the partial results
        results = []
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.normalize
            results.append(shard[query])

        if self.num_best is None:
            # all similarities requested: stack the per-shard results side by side
            return numpy.hstack(results)

        # only top-n most similars requested; merge the partial results from all shards
        is_corpus, results = utils.is_corpus(results)
        if is_corpus:
            # query = single document? then each shard result is one flat list of
            # 2-tuples, which makes the list of shard results itself look like a corpus
            result = sorted(sum(results, []), key=lambda item: -item[1])[:self.num_best]
        else:
            # corpus query: merge the shards' answers document by document
            # (builtin `zip` is used because `itertools.izip` exists on Python 2 only)
            result = []
            for parts in zip(*results):
                merged = sorted(sum(parts, []), key=lambda item: -item[1])[:self.num_best]
                result.append(merged)
        return result
예제 #18
파일: nlp.py 프로젝트: sangheestyle/stools
    def __getitem__(self, bow, eps=0.01):
        """Return the normalized topic weights for `bow` as a plain list.

        A corpus input is handed to `self._apply`. `eps` is accepted but not used here.
        """
        input_is_corpus, docs = utils.is_corpus(bow)
        if input_is_corpus:
            return self._apply(docs)

        gamma, _ = self.inference([bow])
        first_row = gamma[0]
        # normalize to proper distribution
        topic_dist = first_row / sum(first_row)
        return list(topic_dist)
예제 #19
파일: hdpmodel.py 프로젝트: abs51295/gensim
    def __getitem__(self, bow, eps=0.01):
        """Return `(topicid, topicvalue)` pairs for `bow`, keeping values of at least `eps`.

        A corpus input is handed to `self._apply`. If the inferred weights sum to zero the
        result is an empty list.
        """
        input_is_corpus, docs = utils.is_corpus(bow)
        if input_is_corpus:
            return self._apply(docs)

        gamma = self.inference([bow])[0]
        total = sum(gamma)
        if total != 0:
            topic_dist = gamma / total
        else:
            topic_dist = []

        kept = []
        for topicid, topicvalue in enumerate(topic_dist):
            if topicvalue >= eps:
                kept.append((topicid, topicvalue))
        return kept
예제 #20
파일: corpus2.py 프로젝트: lum4chi/mygensim
    def __getitem__(self, doc):
        """Apply `self.funct` to a single document, or `self._apply` to a whole corpus."""
        doc_is_corpus, doc = utils.is_corpus(doc)
        if not doc_is_corpus:
            # single document: transform it with the stored function and its stored arguments
            return self.funct(doc, *self.fargs, **self.fkwargs)

        # doc is an iterable of documents: apply to all
        return self._apply(doc)
예제 #21
파일: ldamodel.py 프로젝트: polcar/gensim
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the Umass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**

        `corpus` is iterated as documents whose items carry the word id in position 0;
        `num_words` is the number of top words per topic that enter the score.

        Returns a list of `(topic, coherence)` pairs sorted by descending coherence, where
        `topic` is a list of `(weight, word)` tuples for that topic's top words.
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            # NOTE(review): only a warning is emitted; the computation below still runs.
            logger.warning("LdaModel.top_topics() called with an empty corpus")

        topics = []  # per topic: ids of its top words, best first
        str_topics = []  # per topic: (weight, word) pairs in the same order
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            str_topics.append([(topic[word_id], self.id2word[word_id]) for word_id in bestn])

        # top_ids are limited to every topics top words. should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # for each top word, collect the numbers of the documents it occurs in;
        # the corpus is walked once and each document's id set is built once
        doc_word_list = dict((word_id, set()) for word_id in top_ids)
        for doc_no, document in enumerate(corpus):
            doc_ids = frozenset(x[0] for x in document)
            for word_id in top_ids:
                if word_id in doc_ids:
                    doc_word_list[word_id].add(doc_no)

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m_rank, m in enumerate(top_words[1:], start=1):
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]

                # Sum of top words l=1..m-1, i.e. all words ranked higher than the
                # current word m. The slice bound is m's rank, not its word id.
                for higher in top_words[:m_rank]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[higher]

                    # skip words that occur in no document: dividing by
                    # len(l_docs) == 0 would raise ZeroDivisionError
                    if l_docs:
                        # co_doc_frequency is D(v_m^(t), v_l^(t))
                        co_doc_frequency = len(m_docs.intersection(l_docs))

                        # add to the coherence sum for these two words m, l
                        coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        return sorted(coherence_scores, key=lambda scored: scored[1], reverse=True)
예제 #22
    def get_document_topics(self, bow, minimum_probability=None, minimum_phi_value=None, per_word_topics=False):
        """
        Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `minimum_probability`).

        If per_word_topics is True, it also returns a list of topics, sorted in descending order of most likely topics for that word.
        It also returns a list of word_ids and each words corresponding topics' phi_values, multiplied by feature length (i.e, word count)

        If `bow` is a corpus, the call is forwarded to `self._apply` with the same thresholds.
        """
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)  # never allow zero values in sparse output

        if minimum_phi_value is None:
            # NOTE(review): falls back to `self.minimum_probability`, not a phi-specific attribute
            minimum_phi_value = self.minimum_probability
        minimum_phi_value = max(minimum_phi_value, 1e-8)  # never allow zero values in sparse output

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            kwargs = dict(
                per_word_topics=per_word_topics,
                minimum_probability=minimum_probability,
                minimum_phi_value=minimum_phi_value
            )
            return self._apply(corpus, **kwargs)

        # sufficient statistics are always collected here, even when phis end up unused
        gamma, phis = self.inference([bow], collect_sstats=True)
        topic_dist = gamma[0] / sum(gamma[0])  # normalize distribution

        document_topics = [
            (topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
            if topicvalue >= minimum_probability
        ]

        if not per_word_topics:
            return document_topics

        word_topic = []  # contains word and corresponding topic
        word_phi = []  # contains word and phi values
        for word_type, weight in bow:
            phi_values = []  # contains (phi_value, topic) pairing to later be sorted
            phi_topic = []  # contains topic and corresponding phi value to be returned 'raw' to user
            for topic_id in range(self.num_topics):
                phi = phis[topic_id][word_type]
                if phi >= minimum_phi_value:
                    # appends phi values for each topic for that word
                    # these phi values are scaled by feature length
                    phi_values.append((phi, topic_id))
                    phi_topic.append((topic_id, phi))

            # list with ({word_id => [(topic_0, phi_value), (topic_1, phi_value) ...]).
            word_phi.append((word_type, phi_topic))
            # sorts the topics based on most likely topic
            # returns a list like ({word_id => [topic_id_most_probable, topic_id_second_most_probable, ...]).
            sorted_phi_values = sorted(phi_values, reverse=True)
            topics_sorted = [x[1] for x in sorted_phi_values]
            word_topic.append((word_type, topics_sorted))
        return (document_topics, word_topic, word_phi)  # returns 3-tuple
    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the sparse Gensim bag-of-words format,
            or a streamed corpus of such documents.
        eps : float
            Threshold value, will remove all position that have tfidf-value less than `eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document
            TfIdf corpus, if `bow` is a corpus.

        Notes
        -----
        The call stores `eps` on `self.eps` and replaces a boolean `self.normalize` with
        the corresponding callable.

        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        # split the document into parallel id / frequency lists so that the local
        # weighting can be applied to all frequencies in one call
        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.pivot is None:
            norm_vector = self.normalize(vector)
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            # pivoted normalization: divide by a norm interpolated between pivot and the old norm
            _, old_norm = self.normalize(vector, return_norm=True)
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector
예제 #24
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.
        This is done by folding input document into the latent topic space.
        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
        # NOTE(review): the call's remaining arguments were cut off; `num_terms` is
        # restored here, confirm whether an explicit dtype was passed as well.
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms)
        topic_dist = (
            vec.T *
            self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]
                          ) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
예제 #25
    def __getitem__(self, bow, eps=1e-12):
        """Get the tf-idf representation of an input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document in the sparse Gensim bag-of-words format,
            or a streamed corpus of such documents.
        eps : float
            Threshold value, will remove all position that have tfidf-value less than `eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a single document.
            If `bow` is a corpus, the result of `self._apply(bow)` is returned instead.

        Notes
        -----
        The call stores `eps` on the instance (`self.eps`) and replaces a boolean
        `self.normalize` with the corresponding callable.
        """
        self.eps = eps
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        # split the document into parallel id / frequency sequences so that the
        # local weighting function can be applied to all frequencies at once
        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [
            (termid, tf * self.idfs.get(termid))
            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
        ]

        # a boolean `normalize` is swapped for the matching callable on first use
        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        if self.pivot is None:
            norm_vector = self.normalize(vector)
            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
        else:
            # pivoted normalization: rescale by a blend of the pivot and the vector's own norm
            _, old_norm = self.normalize(vector, return_norm=True)
            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
            norm_vector = [
                (termid, weight / float(pivoted_norm))
                for termid, weight in vector
                if abs(weight / float(pivoted_norm)) > self.eps
            ]
        return norm_vector
예제 #26
    def test_getitem_dense2gensim(self):
        """Check that a densely serialized ShardedCorpus returns gensim-style items.

        Single items must be lists of tuples; slices and index lists must be
        iterables of such lists, must agree with each other element by element,
        and must be recognised by `is_corpus`.
        """
        # NOTE(review): the final keyword argument was missing from the excerpt;
        # `gensim=True` is restored from the test's name and assertions -- confirm.
        corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                               dim=self.dim, sparse_serialization=False,
                               gensim=True)

        item = corpus[3]
        self.assertTrue(isinstance(item, list))
        self.assertTrue(isinstance(item[0], tuple))

        dslice = corpus[2:6]
        # consumes the first element of the slice; the rest is materialised below
        self.assertTrue(next(dslice) == corpus[2])
        dslice = list(dslice)
        self.assertTrue(isinstance(dslice, list))
        self.assertTrue(isinstance(dslice[0], list))
        self.assertTrue(isinstance(dslice[0][0], tuple))

        iscorp, _ = is_corpus(dslice)
        self.assertTrue(iscorp, "Is the object returned by slice notation "
                                "a gensim corpus?")

        ilist = corpus[[2, 3, 4, 5]]
        self.assertTrue(next(ilist) == corpus[2])
        ilist = list(ilist)
        self.assertTrue(isinstance(ilist, list))
        self.assertTrue(isinstance(ilist[0], list))
        self.assertTrue(isinstance(ilist[0][0], tuple))

        # From generators to lists

        self.assertEqual(len(ilist), len(dslice))
        for i in xrange(len(ilist)):
            self.assertEqual(len(ilist[i]), len(dslice[i]),
                             "Row %d: dims %d/%d" % (i, len(ilist[i]),
                                                     len(dslice[i])))
            for j in xrange(len(ilist[i])):
                self.assertEqual(ilist[i][j], dslice[i][j],
                                 "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                     i, j, str(ilist[i][j]), i, j,
                                     str(dslice[i][j])))

        iscorp, _ = is_corpus(ilist)
        self.assertTrue(iscorp, "Is the object returned by list notation "
                                "a gensim corpus?")
예제 #27
    def test_getitem_dense2gensim(self):
        """Check that a densely serialized ShardedCorpus returns gensim-style items.

        Single items must be lists of tuples; slices and index lists must be
        iterables of such lists, must agree with each other element by element,
        and must be recognised by `is_corpus`.
        """
        # NOTE(review): the final keyword argument was missing from the excerpt;
        # `gensim=True` is restored from the test's name and assertions -- confirm.
        corpus = ShardedCorpus(self.tmp_fname, self.data, shardsize=100,
                               dim=self.dim, sparse_serialization=False,
                               gensim=True)

        item = corpus[3]
        self.assertTrue(isinstance(item, list))
        self.assertTrue(isinstance(item[0], tuple))

        dslice = corpus[2:6]
        # consumes the first element of the slice; the rest is materialised below
        self.assertTrue(next(dslice) == corpus[2])
        dslice = list(dslice)
        self.assertTrue(isinstance(dslice, list))
        self.assertTrue(isinstance(dslice[0], list))
        self.assertTrue(isinstance(dslice[0][0], tuple))

        iscorp, _ = is_corpus(dslice)
        self.assertTrue(iscorp, "Is the object returned by slice notation "
                                "a gensim corpus?")

        ilist = corpus[[2, 3, 4, 5]]
        self.assertTrue(next(ilist) == corpus[2])
        ilist = list(ilist)
        self.assertTrue(isinstance(ilist, list))
        self.assertTrue(isinstance(ilist[0], list))
        self.assertTrue(isinstance(ilist[0][0], tuple))

        # From generators to lists

        self.assertEqual(len(ilist), len(dslice))
        for i in xrange(len(ilist)):
            self.assertEqual(len(ilist[i]), len(dslice[i]),
                             "Row %d: dims %d/%d" % (i, len(ilist[i]),
                                                     len(dslice[i])))
            for j in xrange(len(ilist[i])):
                self.assertEqual(ilist[i][j], dslice[i][j],
                                 "ilist[%d][%d] = %s ,dslice[%d][%d] = %s" % (
                                     i, j, str(ilist[i][j]), i, j,
                                     str(dslice[i][j])))

        iscorp, _ = is_corpus(ilist)
        self.assertTrue(iscorp, "Is the object returned by list notation "
                                "a gensim corpus?")
예제 #28
    def __getitem__(self, doc):
        """Apply the transformation to a corpus, or hand a single document back.

        A corpus is delegated to `self._apply`; anything else is returned as
        produced by `utils.is_corpus`.
        """
        corpus_given, doc = utils.is_corpus(doc)
        # a corpus is transformed as a whole; a single document passes through
        return self._apply(doc) if corpus_given else doc
예제 #29
파일: lsimodel.py 프로젝트: abs51295/gensim
    def __getitem__(self, bow, scaled=False, chunksize=512):
        """
        Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

        This is done by folding input document into the latent topic space.

        If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).

        If `bow` is a corpus and `chunksize` is truthy, the work is delegated to
        `self._apply(bow, chunksize=chunksize)` and its result is returned.
        Raises AssertionError if `self.projection.u` is None.
        """
        assert self.projection.u is not None, "decomposition not initialized yet"

        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus and chunksize:
            # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
            # this chunking is completely transparent to the user, but it speeds
            # up internal computations (one mat * mat multiplication, instead of
            # `chunksize` smaller mat * vec multiplications).
            return self._apply(bow, chunksize=chunksize)

        if not is_corpus:
            # wrap the single document so the code below always handles a corpus
            bow = [bow]

        # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication.
        # (dense * dense and fancy-indexing variants were noted as roughly the same
        # speed, with the dense variant consuming more memory.)
        vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
        topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

        if not is_corpus:
            # convert back from matrix into a 1d vec
            topic_dist = topic_dist.reshape(-1)

        if scaled:
            topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

        # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
        # with no zero weights.
        if not is_corpus:
            # lsi[single_document]
            result = matutils.full2sparse(topic_dist)
        else:
            # lsi[chunk of documents]
            result = matutils.Dense2Corpus(topic_dist)
        return result
예제 #30
    def __getitem__(self, doc):
        """Transform a document, or every document of a corpus, via `self.bidict.doc2bob`.

        The dictionary is only allowed to grow while it is still empty.
        """
        corpus_given, doc = utils.is_corpus(doc)
        if corpus_given:
            # an iterable of documents: apply the transformation to all of them
            return self._apply(doc)

        # applying the transformation to one document; updates are permitted
        # only when `self.bidict` has no entries yet
        dictionary_is_empty = len(self.bidict) == 0
        return self.bidict.doc2bob(doc, dictionary_is_empty)
예제 #31
    def __getitem__(self, bow, eps=0.01):
        """Return the normalized topic weights of `bow` as a plain list.

        A corpus is delegated to `self._apply`. For a single document the first
        row returned by `self.inference` is divided by its sum. `eps` is
        accepted but not used.
        """
        corpus_given, corpus = utils.is_corpus(bow)
        if corpus_given:
            return self._apply(corpus)

        gamma, _ = self.inference([bow])
        doc_gamma = gamma[0]
        # normalize to proper distribution
        topic_dist = doc_gamma / sum(doc_gamma)
        return list(topic_dist)
예제 #32
    def __getitem__(self, item):
        """Apply the transformation to a corpus or a `DatasetABC` instance.

        Raises ValueError for any other input, since individual documents
        are not supported.
        """
        iscorpus, _ = is_corpus(item)

        if iscorpus or isinstance(item, DatasetABC):
            return self._apply(item)
        else:
            # NOTE(review): the tail of this message was missing from the excerpt;
            # 'documents.' is a reconstruction -- confirm against upstream.
            raise ValueError('Cannot apply flatten_composite to individual '
                             'documents.')
예제 #33
    def __getitem__(self, bow, eps=0.01):
        """Return (topic_id, weight) pairs for `bow`, keeping weights of at least `eps`.

        A corpus is delegated to `self._apply`. For a single document the first
        element returned by `self.inference` is normalized by its sum; when that
        sum is zero the result is an empty list.
        """
        corpus_given, corpus = utils.is_corpus(bow)
        if corpus_given:
            return self._apply(corpus)

        gamma = self.inference([bow])[0]
        total = sum(gamma)
        # guard against dividing by zero: no mass means no topics at all
        if total != 0:
            topic_dist = gamma / total
        else:
            topic_dist = []
        return [
            (topic_no, share)
            for topic_no, share in enumerate(topic_dist)
            if share >= eps
        ]
예제 #34
    def __getitem__(self, vec, eps=1e-12):
        """Return a scalar score for a sparse vector, or transform a whole corpus.

        A corpus is delegated to `self._apply`. For a single vector the score is
        the mean of its weights when `self.L1` is truthy, otherwise the mean of
        its squared weights. An empty vector scores 0. `eps` is accepted but
        not used.
        """
        is_corpus, vec = utils.is_corpus(vec)
        if is_corpus:
            return self._apply(vec)

        if self.L1:
            score = sum(v for _, v in vec) / len(vec) if vec else 0
        else:
            score = sum(v * v for _, v in vec) / len(vec) if vec else 0
        return score
    def __getitem__(self, items):
        # Convert a single document, or each document of a corpus, with
        # `sparse2full(..., self.size)` (presumably a dense vector of length
        # `self.size` -- verify against `sparse2full`).
        is_corpus, items = utils.is_corpus(items)

        if not is_corpus:
            # single document: fetch its vector representation, then convert it
            v = self._get_vector_representation(items)
            return sparse2full(v, self.size)
            # NOTE(review): an `else:` line appears to be missing here, which leaves
            # the corpus branch below unreachable as written -- confirm against upstream.
            return list(
                map(lambda v: sparse2full(v, self.size),
                # NOTE(review): the iterable argument of `map` and the closing
                # parentheses are missing from this excerpt -- restore from upstream.
예제 #36
    def __getitem__(self, bow):
        """
        Return representation with the ids transformed.

        A corpus is delegated to `self._apply`. For a single document every
        (old_id, weight) pair whose id is present in `self.old2new` is mapped to
        (new_id, weight); other pairs are dropped and the result is sorted.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
예제 #37
  def __getitem__(self, vec, eps=1e-12):
    """Return a scalar score for a sparse vector, or transform a whole corpus.

    A corpus is delegated to `self._apply`. For a single vector the score is
    the mean of its weights when `self.L1` is truthy, otherwise the mean of
    its squared weights. An empty vector scores 0. `eps` is accepted but
    not used.
    """
    is_corpus, vec = utils.is_corpus(vec)
    if is_corpus:
      return self._apply(vec)

    if self.L1:
      score = sum(v for _, v in vec) / len(vec) if vec else 0
    else:
      score = sum(v * v for _, v in vec) / len(vec) if vec else 0
    return score
예제 #38
    def __getitem__(self, bow):
        """
        Return representation with the ids transformed.

        A corpus is delegated to `self._apply`. For a single document every
        (old_id, weight) pair whose id is present in `self.old2new` is mapped to
        (new_id, weight); other pairs are dropped and the result is sorted.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)
예제 #39
파일: serializer.py 프로젝트: hajicj/safire
    def __getitem__(self, item):
        """Apply the serializer to a corpus, or look a single item up directly.

        A corpus is delegated to `self._apply`; any other `item` is used as an
        index into `self.serialized_data`.
        """
        iscorpus, _ = is_corpus(item)

        if iscorpus:
            return self._apply(item)
        else:
            # Individual items are served from the serialized data rather than
            # rejected with "Cannot apply serializer to individual documents."
            # (the original author marked this path as experimental: "Will this work?")
            return self.serialized_data[item]
예제 #40
파일: ldamodel.py 프로젝트: zyenge/gensim
    def top_topics(self, corpus, num_topics=5, num_words=20):
        """
        Calculate the Umass topic coherence for each topic and return
        the top num_topics. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, CEMNLP 2011.**

        `corpus` must support `len()` and integer indexing. An out-of-range
        `num_topics` is replaced by 5, or by `self.num_topics` when the model
        has fewer than 5 topics. Returns a list of
        (list of (probability, word), coherence) tuples, sorted by descending
        coherence. Raises KeyError if one of the top words occurs in no
        document of `corpus`.
        """
        if num_topics < 0 or num_topics >= self.num_topics:
            if self.num_topics >= 5:
                num_topics = 5
            else:
                num_topics = self.num_topics
            logger.warning("num_topics out of range - setting to default of 5")
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            # NOTE(review): only a warning is issued here and processing continues;
            # upstream may return early at this point -- confirm.
            logger.warning("LdaModel.top_topics() called with an empty corpus")

        coherence_scores = []
        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability dist
            bestn = np.argsort(topic)[::-1][:num_words]
            topics.append(bestn)
            beststr = [(topic[word_id], self.id2word[word_id]) for word_id in bestn]
            str_topics.append(beststr)

        # every word id that is among the best words of at least one topic
        top_id = list(set(chain.from_iterable(topics)))

        # word id -> indices of the documents that contain the word
        doc_word_list = {}
        for word_id in top_id:
            id_list = []
            for document in range(len(corpus)):
                if any(entry[0] == word_id for entry in corpus[document]):
                    id_list.append(document)
            if len(id_list) > 0:
                doc_word_list[word_id] = id_list

        for best_ids, best_words in zip(topics, str_topics):
            topic_coherence_sum = 0.0
            for word_m in best_ids[1:]:
                doc_frequency_m = len(doc_word_list[word_m])
                m_set = set(doc_word_list[word_m])
                for word_l in best_ids[:-1]:
                    l_set = set(doc_word_list[word_l])
                    co_doc_frequency = len(m_set.intersection(l_set))
                    topic_coherence_sum += numpy.log(
                        (co_doc_frequency + 1.0) / doc_frequency_m)
            coherence_scores.append((best_words, topic_coherence_sum))

        # NOTE(review): this slice yields `num_topics - 1` entries, one fewer than
        # the docstring promises; kept as is for backward compatibility -- confirm.
        top_topics = sorted(coherence_scores,
                            key=lambda tup: tup[1],
                            reverse=True)[0:num_topics - 1]
        return top_topics
예제 #41
    def __getitem__(self, bow, iterations=100):
        """Run Mallet's `infer-topics` on `bow` and return the parsed doc-topics.

        A single document is wrapped into a one-element list first. The result
        is always the list built from `read_doctopics`.
        """
        corpus_given, _ = utils.is_corpus(bow)
        documents = bow if corpus_given else [bow]

        self.convert_input(documents, infer=True)
        template = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
        arguments = (
            self.fcorpusmallet() + '.infer',
            self.finferencer(),
            self.fdoctopics() + '.infer',
            iterations,
        )
        cmd = template % arguments
        logger.info("inferring with Mallet LDA with %s" % cmd)
        call(cmd, shell=True)
        doctopics = read_doctopics(self.fdoctopics() + '.infer')
        return list(doctopics)
    def __getitem__(self, bow):
        """Get random-projection representation of the input vector or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of list of (int, int)}
            Input document or corpus.

        Returns
        -------
        list of (int, float)
            if `bow` is document OR
            the result of `self._apply(bow)` if `bow` is corpus.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.models import RpModel
            >>> from gensim.corpora import Dictionary
            >>> from gensim.test.utils import common_texts
            >>> dictionary = Dictionary(common_texts)  # fit dictionary
            >>> corpus = [dictionary.doc2bow(text) for text in common_texts]  # convert texts to BoW format
            >>> model = RpModel(corpus, id2word=dictionary)  # fit model
            >>> # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ]
            >>> result = model[corpus[0]]

        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        if getattr(self, 'freshly_loaded', False):
            # This is a hack to work around a bug in np, where a FORTRAN-order array
            # unpickled from disk segfaults on using it.
            self.freshly_loaded = False
            self.projection = self.projection.copy('F')  # simply making a fresh copy fixes the broken array

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / np.sqrt(self.num_topics)
        vec = np.asfortranarray(vec, dtype=np.float32)
        topic_dist = np.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        # keep only finite, non-zero weights so the output stays sparse
        return [
            (topicid, float(topicvalue))
            for topicid, topicvalue in enumerate(topic_dist.flat)
            if np.isfinite(topicvalue) and not np.allclose(topicvalue, 0.0)
        ]
예제 #43
    def __getitem__(self, bow, iterations=100):
        """Run Mallet's `infer-topics` on `bow` and return the parsed doc-topics.

        A single document is wrapped into a one-element list first. The result
        is always the list built from `read_doctopics`.
        """
        corpus_given, _ = utils.is_corpus(bow)
        documents = bow if corpus_given else [bow]

        self.convert_input(documents, infer=True)
        template = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s"
        arguments = (
            self.fcorpusmallet() + '.infer',
            self.finferencer(),
            self.fdoctopics() + '.infer',
            iterations,
        )
        cmd = template % arguments
        logger.info("inferring with Mallet LDA with %s" % cmd)
        call(cmd, shell=True)
        doctopics = read_doctopics(self.fdoctopics() + '.infer')
        return list(doctopics)
    def __getitem__(self, items):
        """
        Return random vector(s).

        :param items: a single document or a corpus; only the number of
            documents in a corpus influences the result.
        :return: `np.random.random(self.size)` for a single document, or a
            list with one such vector per document for a corpus.
        """
        is_corpus, items = utils.is_corpus(items)

        if not is_corpus:
            return np.random.random(self.size)
        else:
            # one independent random vector per document in the corpus
            return [np.random.random(self.size) for _ in items]
예제 #45
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus document. This batch query is more
        efficient than computing the similarities one document after another.

        When `self.num_best` is set, each document's result is instead a list of
        at most `num_best` (document_index, similarity) pairs, with indices
        shifted by the sizes of the preceding shards.
        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.norm

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result should be a full array or only num_best most similar
        # documents.
        pool, shard_results = self.query_shards(query)
        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query).
            # hstack needs a real sequence, so materialise the results in case
            # query_shards handed back a lazy iterable.
            result = numpy.hstack(list(shard_results))
        else:
            # the following uses a lot of lazy evaluation and (optionally) parallel
            # processing, to improve query latency and minimize memory footprint.
            offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])

            def convert(doc, shard_no):
                # translate shard-local document indices into global ones
                return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

            is_corpus, query = utils.is_corpus(query)
            is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
            if not is_corpus:
                # user asked for num_best most similar and query is a single doc
                results = (convert(result, shard_no) for shard_no, result in enumerate(shard_results))
                result = heapq.nlargest(self.num_best, itertools.chain(*results), key=lambda item: item[1])
            else:
                # the trickiest combination: returning num_best results when query was a corpus
                results = []
                for shard_no, result in enumerate(shard_results):
                    shard_result = [convert(doc, shard_no) for doc in result]
                    results.append(shard_result)
                result = []
                # each `parts` holds one query document's results from every shard
                for parts in izip(*results):
                    merged = heapq.nlargest(self.num_best, itertools.chain(*parts), key=lambda item: item[1])
                    result.append(merged)
        if pool:
            # gc doesn't seem to collect the Pools, eventually leading to
            # "IOError 24: too many open files". so let's terminate it manually.
            pool.terminate()

        return result
예제 #46
    def __getitem__(self, query):
        """Get similarities of document `query` to all documents in the corpus.

        **or**

        If `query` is a corpus (iterable of documents), return a matrix of similarities
        of all query documents vs. all corpus document. This batch query is more
        efficient than computing the similarities one document after another.

        When `self.num_best` is set, each document's result is instead a list of
        at most `num_best` (document_index, similarity) pairs, with indices
        shifted by the sizes of the preceding shards.
        """
        self.close_shard()  # no-op if no documents added to index since last query

        # reset num_best and normalize parameters, in case they were changed dynamically
        for shard in self.shards:
            shard.num_best = self.num_best
            shard.normalize = self.normalize

        # there are 4 distinct code paths, depending on whether input `query` is
        # a corpus (or numpy/scipy matrix) or a single document, and whether the
        # similarity result is a full array or only num_best most similar documents.

        if self.num_best is None:
            # user asked for all documents => just stack the sub-results into a single matrix
            # (works for both corpus / single doc query).
            # hstack requires a sequence of arrays, not a generator.
            return numpy.hstack([shard[query] for shard in self.shards])

        offsets = numpy.cumsum([0] + [len(shard) for shard in self.shards])

        def convert(doc, shard_no):
            # translate shard-local document indices into global ones
            return [(doc_index + offsets[shard_no], sim) for doc_index, sim in doc]

        is_corpus, query = utils.is_corpus(query)
        is_corpus = is_corpus or hasattr(query, 'ndim') and query.ndim > 1 and query.shape[0] > 1
        if not is_corpus:
            # user asked for num_best most similar and query is a single doc
            results = (convert(shard[query], shard_no)
                       for shard_no, shard in enumerate(self.shards))
            return heapq.nlargest(self.num_best,
                                  itertools.chain(*results),
                                  key=lambda item: item[1])

        # the trickiest combination: returning num_best results when query was a corpus
        shard_results = []
        for shard_no, shard in enumerate(self.shards):
            shard_result = [convert(doc, shard_no) for doc in shard[query]]
            shard_results.append(shard_result)
        result = []
        # each `parts` holds one query document's results from every shard;
        # builtin zip is used because itertools.izip exists on Python 2 only
        for parts in zip(*shard_results):
            merged = heapq.nlargest(self.num_best,
                                    itertools.chain(*parts),
                                    key=lambda item: item[1])
            result.append(merged)
        return result
예제 #47
    def __getitem__(self, bow, iterations=100):
        """Infer topics for `bow` with MALLET and return the parsed doc-topics.

        A corpus yields the full list of per-document results; a single
        document yields only the first entry of that list.
        """
        corpus_given, _ = utils.is_corpus(bow)
        # a single document is turned into a one-document corpus
        documents = bow if corpus_given else [bow]

        self.convert_input(documents, infer=True)
        template = self.mallet_path + ' infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %s'
        arguments = (
            self.fcorpusmallet() + '.infer',
            self.finferencer(),
            self.fdoctopics() + '.infer',
            iterations,
            self.topic_threshold,
        )
        cmd = template % arguments
        logger.info("inferring topics with MALLET LDA '%s'", cmd)
        check_output(args=cmd, shell=True)
        doctopics = list(self.read_doctopics(self.fdoctopics() + '.infer'))
        if corpus_given:
            return doctopics
        return doctopics[0]
예제 #48
    def __getitem__(self, query):
        """Get access to similarities of document/corpus `query` to all documents in the corpus.

        Using :meth:`~gensim.interfaces.SimilarityABC.get_similarities`

        Notes
        -----
        Passing a corpus as `query` (instead of a single document) can be more
        efficient, because it is processed in batches.

        Parameters
        ----------
        query : {list of (int, int), iterable of list of (int, int)}
            Document or corpus in BoW format.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities given document or corpus and objects corpus, depends on `query`.

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
        # topn and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
예제 #49
    def __getitem__(self, query):
        """Get similarities of the given document or corpus against this index.

        Uses :meth:`~gensim.interfaces.SimilarityABC.get_similarities` internally.

        Notes
        -----
        Passing an entire corpus as `query` can be more efficient than passing its documents one after another,
        because it will issue queries in batches internally.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document in the sparse Gensim bag-of-words format, or a streamed corpus of such documents.

        Returns
        -------
        {`scipy.sparse.csr.csr_matrix`, list of (int, float)}
            Similarities given document or corpus and objects corpus, depends on `query`.

        """
        is_corpus, query = utils.is_corpus(query)
        if self.normalize:
            # self.normalize only works if the input is a plain gensim vector/corpus (as
            # advertised in the doc). in fact, input can be a numpy or scipy.sparse matrix
            # as well, but in that case assume tricks are happening and don't normalize
            # anything (self.normalize has no effect).
            if not matutils.ismatrix(query):
                if is_corpus:
                    query = [matutils.unitvec(v) for v in query]
                else:
                    query = matutils.unitvec(query)
        result = self.get_similarities(query)

        if self.num_best is None:
            return result

        # if maintain_sparsity is True, result is scipy sparse. Sort, clip the
        # topn and return as a scipy sparse matrix.
        if getattr(self, 'maintain_sparsity', False):
            return matutils.scipy2scipy_clipped(result, self.num_best)

        # if the input query was a corpus (=more documents), compute the top-n
        # most similar for each document in turn
        if matutils.ismatrix(result):
            return [matutils.full2sparse_clipped(v, self.num_best) for v in result]
        else:
            # otherwise, return top-n of the single input document
            return matutils.full2sparse_clipped(result, self.num_best)
예제 #50
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number), :class:`scipy.sparse.csr_matrix`}
            Document or collection of documents.

        Returns
        -------
        result
            A flattened 1d array when a single (non-corpus) query produced one
            column; otherwise the transposed sparse product when
            `self.maintain_sparsity` is truthy, or a dense 2d array
            (#queries x #index) when it is not.

        """
        # NOTE(review): the trailing arguments of the three conversion calls below
        # were missing from the excerpt and are restored from upstream gensim -- confirm.
        is_corpus, query = utils.is_corpus(query)
        if is_corpus:
            query = matutils.corpus2csc(query, self.index.shape[1], dtype=self.index.dtype)
        else:
            if scipy.sparse.issparse(query):
                query = query.T  # convert documents=rows to documents=columns
            elif isinstance(query, numpy.ndarray):
                if query.ndim == 1:
                    # promote a flat vector to a single-row matrix (in place)
                    query.shape = (1, len(query))
                query = scipy.sparse.csr_matrix(query, dtype=self.index.dtype).T
            else:
                # default case: query is a single vector, in sparse gensim format
                query = matutils.corpus2csc([query], self.index.shape[1], dtype=self.index.dtype)

        # compute cosine similarity against every other document in the collection
        result = self.index * query.tocsc()  # N x T * T x C = N x C
        if result.shape[1] == 1 and not is_corpus:
            # for queries of one document, return a 1d array
            result = result.toarray().flatten()
        elif self.maintain_sparsity:
            # avoid converting to dense array if maintaining sparsity
            result = result.T
        else:
            # otherwise, return a 2d matrix (#queries x #index)
            result = result.toarray().T
        return result
예제 #51
파일: rpmodel.py 프로젝트: ummae/gensim
    def __getitem__(self, bow):
        """
        Return RP representation of the input vector and/or corpus.

        A corpus is delegated to `self._apply`. For a single document the result
        is a list of (topic_id, value) pairs with non-finite and near-zero
        values removed.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        vec = matutils.sparse2full(bow, self.num_terms).reshape(self.num_terms, 1) / numpy.sqrt(self.num_topics)
        vec = numpy.asfortranarray(vec, dtype=numpy.float32)
        # plain matrix-vector product; `scipy.linalg.fblas` is no longer available
        # in current SciPy releases, and numpy.dot computes the same projection
        topic_dist = numpy.dot(self.projection, vec)  # (k, d) * (d, 1) = (k, 1)
        return [(topicid, float(topicvalue)) for topicid, topicvalue in enumerate(topic_dist.flat)
                if numpy.isfinite(topicvalue) and not numpy.allclose(topicvalue, 0.0)]
예제 #52
 def test_invalid_formats(self):
     """`utils.is_corpus` must reject inputs that are not made of (int, float) 2-tuples."""
     # none of these is a corpus: a list of strings, a flat list of ints,
     # and a document whose weight is a string
     non_corpora = [
         ["human", "star"],
         [1, 2, 3, 4, 5, 5],
         [[(0, 'string')]],
     ]
     for candidate in non_corpora:
         outcome = utils.is_corpus(candidate)
         self.assertEqual((False, candidate), outcome)
예제 #53
    def __getitem__(self, bow):
        """Return log entropy representation of the input vector and/or corpus.

        If `bow` is a corpus, it is handed to `self._apply` and that result
        is returned. Otherwise each `(term_id, tf)` pair whose `term_id` is
        present in `self.entr` is mapped to
        `(term_id, log(tf + 1) * self.entr[term_id])`; when `self.normalize`
        is truthy the vector is passed through `matutils.unitvec`.
        """
        # if the input vector is in fact a corpus, return a transformed corpus
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge):
        # they are simply dropped from the sparse result
        vector = [(term_id, math.log(tf + 1) * self.entr.get(term_id))
                  for term_id, tf in bow if term_id in self.entr]
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
예제 #54
	def __getitem__(self, bow, iterations=100):
		"""Infer topics for `bow` by running MALLET's `infer-topics` command.

		`bow` may be a single document or a corpus. The input is written out
		via `self.convert_input(..., infer=True)`, MALLET is invoked through
		the shell, and the resulting doc-topics file is read back.

		Returns the list of per-document results when `bow` is a corpus,
		otherwise the result for the single document.

		Raises RuntimeError when the MALLET command exits with a non-zero
		status.
		"""
		is_corpus, corpus = utils.is_corpus(bow)
		if is_corpus:
			# use the object returned by is_corpus: when the input is a one-shot
			# iterator, peeking advances it and only the returned object still
			# yields the first document
			bow = corpus
		else:
			# query is a single document => make a corpus out of it
			bow = [bow]

		self.convert_input(bow, infer=True)
		cmd = self.mallet_path + " infer-topics --input %s --inferencer %s --output-doc-topics %s --num-iterations %s --doc-topics-threshold %f"
		# 1.0 / ... keeps the threshold a true fraction even under Python 2
		# integer division
		cmd = cmd % (self.fcorpusmallet(True) + '.infer', self.finferencer(True), self.fdoctopics(True) + '.infer', iterations, 1.0 / self.num_topics)
		logger.info("inferring topics with MALLET LDA '%s'", cmd)
		# NOTE(review): the command is a shell string built from instance paths;
		# confirm those paths never come from untrusted input.
		retval = call(cmd, shell=True)
		if retval != 0:
			raise RuntimeError("MALLET failed with error %s on return" % retval)
		result = list(gensim.models.wrappers.ldamallet.read_doctopics(self.fdoctopics(True) + '.infer'))
		return result if is_corpus else result[0]
예제 #55
    def __getitem__(self, bow, eps=1e-12):
        """Get tf-idf representation of the input vector and/or corpus.

        Parameters
        ----------
        bow : {list of (int, int), iterable of iterable of (int, int)}
            Input document or corpus in BoW format.
        eps : float
            Threshold value, will remove all positions that have a tf-idf value less than `eps`.

        Returns
        -------
        vector : list of (int, float)
            TfIdf vector, if `bow` is a document **OR**
            TfIdf corpus, if `bow` is a corpus.

        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)

        # split the document into parallel term-id / term-frequency lists so the
        # local weighting function can be applied to all frequencies at once
        termid_array, tf_array = [], []
        for termid, tf in bow:
            termid_array.append(termid)
            tf_array.append(tf)

        tf_array = self.wlocal(np.array(tf_array))

        vector = [(termid, tf * self.idfs.get(termid))
                  for termid, tf in zip(termid_array, tf_array)
                  if abs(self.idfs.get(termid, 0.0)) > eps]

        # NOTE: a boolean `normalize` setting is replaced on the instance by the
        # corresponding callable, so later calls skip both branches.
        if self.normalize is True:
            self.normalize = matutils.unitvec
        elif self.normalize is False:
            self.normalize = utils.identity

        # and finally, normalize the vector either to unit length, or use a
        # user-defined normalization function
        vector = self.normalize(vector)

        # make sure there are no explicit zeroes in the vector (must be sparse)
        vector = [(termid, weight) for termid, weight in vector
                  if abs(weight) > eps]
        return vector
예제 #56
    def __getitem__(self, bow, eps=0.01):
        """Return topic distribution for the given document `bow`, as a list of
        (topic_id, topic_probability) 2-tuples.

        Ignore topics with very low probability (below `eps`).

        If `bow` is a corpus, it is handed to `self._apply` and that result
        is returned instead.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as result
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(corpus)

        # run inference on a one-document batch and keep its first row
        gamma, _ = self.inference([bow])
        topic_dist = gamma[0] / sum(gamma[0])  # normalize to proper distribution
        return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist)
                if topicvalue >= eps]  # ignore document's topics that have prob < eps
예제 #57
    def __getitem__(self, bow):
        """Return tf-idf representation of the input vector and/or corpus.

        If `bow` is a corpus, it is handed to `self._apply` and that result
        is returned. Otherwise each `(termid, tf)` pair is weighted by
        `self.idfs[termid]`; terms with a missing or zero idf are dropped.
        When `self.normalize` is truthy the vector is passed through
        `matutils.unitvec`.
        """
        # if the input vector is in fact a corpus, return a transformed corpus as a result
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
        # as strict application of the IDF formula would dictate)
        vector = []
        for termid, tf in bow:
            # look the idf up once per term instead of twice
            idf = self.idfs.get(termid, 0.0)
            if idf != 0.0:
                vector.append((termid, tf * idf))
        if self.normalize:
            vector = matutils.unitvec(vector)
        return vector
예제 #58
    def __getitem__(self, bow, eps=0.01):
        """Return the topics predicted for `bow` whose value exceeds `eps`.

        `bow` may be a single document or a corpus. For a corpus the result
        is a list with one entry per row of predictions, each a list of
        `(topic_id, value)` 2-tuples; for a single document only that
        document's list is returned.
        """
        is_corpus, corpus = utils.is_corpus(bow)
        if is_corpus:
            # keep the object returned by is_corpus: for one-shot iterators the
            # peek advances the input and only the returned object is complete
            bow = corpus
        else:
            bow = [bow]

        predictions = self._predict(bow)[0]

        topics = []
        for row in predictions:
            row_topics = [(topic_id, val) for topic_id, val in enumerate(row)
                          if val > eps]
            # collect each row's topics; without this `topics` stays empty and
            # `topics[0]` below raises IndexError
            topics.append(row_topics)

        return topics if is_corpus else topics[0]
예제 #59
    def get_document_topics(self, bow, minimum_probability=None, normalize=None):
        """Get the topic distribution for the given document.

        Parameters
        ----------
        bow : list of (int, float)
            The document in BOW format.
        minimum_probability : float
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, `self.minimum_probability` is used. The effective
            threshold is never lower than 1e-8, to prevent 0s.
        normalize : bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.
            If None, `self.normalize` is used.

        Returns
        -------
        list of (int, float)
            Topic distribution for the whole document. Each element in the list is a pair of a topic's id, and
            the probability that was assigned to it.

        """
        # NOTE(review): the parameter list was truncated in the source; it is
        # rebuilt from the names the docstring and body use, with None defaults
        # implied by the `is None` checks below - confirm against callers.
        if minimum_probability is None:
            minimum_probability = self.minimum_probability
        minimum_probability = max(minimum_probability, 1e-8)

        # if the input vector is a corpus, return a transformed corpus
        is_corpus, corpus = utils.is_corpus(bow)

        if is_corpus:
            kwargs = dict(minimum_probability=minimum_probability)
            return self._apply(corpus, **kwargs)

        # project the single document (as a one-column sparse matrix) onto the topics
        v = matutils.corpus2csc([bow], self.num_tokens)
        h = self._solveproj(v, self._W, v_max=np.inf)

        if normalize is None:
            normalize = self.normalize
        if normalize:
            the_sum = h.sum()
            # skip the division for an all-zero column to avoid dividing by zero
            if the_sum:
                h /= the_sum

        return [(idx, proba) for idx, proba in enumerate(h[:, 0])
                if not minimum_probability or proba > minimum_probability]
예제 #60
    def get_similarities(self, query):
        """Get similarity between `query` and this index.

        Warnings
        --------
        Do not use this function directly; use the `self[query]` syntax instead.

        Parameters
        ----------
        query : {list of (int, number), iterable of list of (int, number)}
            Document or collection of documents.

        Returns
        -------
        :class:`numpy.ndarray`
            Similarity matrix.

        """
        is_corpus, query = utils.is_corpus(query)
        if not is_corpus:
            if isinstance(query, numpy.ndarray):
                # Convert document indexes to actual documents.
                query = [self.corpus[i] for i in query]
            else:
                # A single document: wrap it so the loop below handles one query.
                query = [query]

        result = []
        for query_document in query:
            # Compute similarity for each query.
            # NOTE(review): the third argument was truncated in the source;
            # `self.similarity_matrix` follows gensim's published
            # SoftCosineSimilarity implementation - confirm.
            qresult = [
                matutils.softcossim(query_document, corpus_document, self.similarity_matrix)
                for corpus_document in self.corpus
            ]
            qresult = numpy.array(qresult)

            # Append single query result to list of all results.
            result.append(qresult)

        if is_corpus:
            result = numpy.array(result)
        else:
            # Non-corpus input: return only the first query's similarities.
            result = result[0]

        return result