Example #1
 def _get_topics(self):
     """Internal helper function to return topics from a trained topic model."""
     topics = []  # FIXME : Meant to work for LDAModel, LdaVowpalWabbit right now. Make it work for others.
     if isinstance(self.model, LdaModel):
         for topic in self.model.state.get_lambda():
             bestn = argsort(topic, topn=10, reverse=True)
             topics.append(bestn)
     elif isinstance(self.model, LdaVowpalWabbit):
         for topic in self.model._get_topics():
             bestn = argsort(topic, topn=10, reverse=True)
             topics.append(bestn)
     return topics
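The `argsort` used in these helpers (gensim's `matutils.argsort`, judging by the `topn`/`reverse` keywords) returns the indices of the `topn` largest entries when `reverse=True`, so applying it to a topic's word distribution yields the ids of its most probable words. A minimal sketch with a made-up distribution:

import numpy as np
from gensim import matutils

topic = np.array([0.05, 0.40, 0.10, 0.30, 0.15])  # toy word distribution over 5 term ids
bestn = matutils.argsort(topic, topn=3, reverse=True)
print(bestn)  # -> [1 3 4], the three highest-weight term ids in descending order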
Example #2
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
        """
        Print the `num_words` most probable words for `num_topics` number of topics.
        Set `num_topics=-1` to print all topics.

        Set `formatted=True` to return the topics as a list of strings, or `False` as lists of (weight, word) pairs.

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)
            # add a little random jitter, to randomize results around the same alpha
            sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))
            sorted_topics = list(matutils.argsort(sort_alpha))
            chosen_topics = sorted_topics[: num_topics // 2] + sorted_topics[-num_topics // 2:]
        shown = []
        for i in chosen_topics:
            if formatted:
                topic = self.print_topic(i, topn=num_words)
            else:
                topic = self.show_topic(i, topn=num_words)
            shown.append((i, topic))
            if log:
                logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic)
        return shown
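The topic-selection branch above keeps the `num_topics // 2` lowest-alpha and highest-alpha topics, after adding a tiny jitter to break ties between equal alphas. A standalone sketch of just that step, with made-up alpha values:

import numpy as np
from gensim import matutils

alpha = np.array([0.9, 0.1, 0.5, 0.3, 0.7, 0.2])  # made-up per-topic alpha values
num_topics = 4

sort_alpha = alpha + 0.0001 * np.random.rand(len(alpha))  # tiny jitter to break ties
sorted_topics = list(matutils.argsort(sort_alpha))        # topic ids, ascending alpha
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
print(chosen_topics)  # e.g. [1, 5, 4, 0]: the two lowest- and two highest-alpha topics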
Example #3
    def top_topics_as_word_lists(model, dictionary, topn=20):
        """Get `topn` topics as list of words.

        Parameters
        ----------
        model : :class:`~gensim.models.basemodel.BaseTopicModel`
            Pre-trained topic model.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
            Gensim dictionary mapping word ids to words.
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.

        Returns
        -------
        list of list of str
            Top topics in list-of-list-of-words format.

        """
        if not dictionary.id2token:
            dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

        str_topics = []
        for topic in model.get_topics():
            bestn = matutils.argsort(topic, topn=topn, reverse=True)
            beststr = [dictionary.id2token[_id] for _id in bestn]
            str_topics.append(beststr)
        return str_topics
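A hedged usage sketch for the helper above, assuming it is available as a plain module-level function and that gensim is installed; it trains a tiny LdaModel and prints each topic as a flat word list:

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response"],
    ["graph", "trees", "minors", "survey"],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5)

# each topic becomes a flat list of its 3 strongest words
print(top_topics_as_word_lists(lda, dictionary, topn=3))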
Example #4
    def show_topic(self, topicno, topn=10):
        """Get the words that define a topic along with their contribution.

        This is actually the left singular vector of the specified topic.

        The most important words in defining the topic (greatest absolute value) are included
        in the output, along with their contribution to the topic.

        Parameters
        ----------
        topicno : int
            The topic's id number.
        topn : int
            Number of words to be included in the result.

        Returns
        -------
        list of (str, float)
            Topic representation as `(word, weight)` pairs.

        """
        # size of the projection matrix can actually be smaller than `self.num_topics`,
        # if there were not enough factors (real rank of input matrix smaller than
        # `self.num_topics`). in that case, return an empty string
        if topicno >= len(self.projection.u.T):
            return ''
        c = np.asarray(self.projection.u.T[topicno, :]).flatten()
        norm = np.sqrt(np.sum(np.dot(c, c)))
        most = matutils.argsort(np.abs(c), topn, reverse=True)
        return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]
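The argsort here runs on absolute values because LSI topic weights can be negative; the sign is preserved in the returned weights. A small sketch of just that selection step, with a made-up singular vector:

import numpy as np
from gensim import matutils

c = np.array([0.1, -0.8, 0.3, 0.05, -0.2])           # made-up left singular vector
norm = np.sqrt(np.sum(np.dot(c, c)))                  # same norm the method computes
most = matutils.argsort(np.abs(c), 3, reverse=True)   # strongest terms by magnitude
print([(int(i), c[i] / norm) for i in most])          # signs survive: [(1, -0.90...), (2, 0.33...), (4, -0.22...)]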
Example #5
    def _get_topics_from_model(model, topn):
        """Internal helper function to return topics from a trained topic model.

        Parameters
        ----------
        model : :class:`~gensim.models.basemodel.BaseTopicModel`
            Pre-trained topic model.
        topn : int
            Integer corresponding to the number of top words.

        Returns
        -------
        list of :class:`numpy.ndarray`
            Topics matrix

        """
        try:
            return [
                matutils.argsort(topic, topn=topn, reverse=True) for topic in
                model.get_topics()
            ]
        except AttributeError:
            raise ValueError(
                "This topic model is not currently supported. Supported topic models"
                " should implement the `get_topics` method.")
Example #6
 def optimal_ordering(self):
     """Performs ordering on the topics."""
     idx = matutils.argsort(self.m_lambda_sum, reverse=True)
     self.m_varphi_ss = self.m_varphi_ss[idx]
     self.m_lambda = self.m_lambda[idx, :]
     self.m_lambda_sum = self.m_lambda_sum[idx]
     self.m_Elogbeta = self.m_Elogbeta[idx, :]
Example #7
    def get_topic_terms(self, topicid, topn=10, normalize=None):
        """Get the representation for a single topic. Words the integer IDs, in constrast to
        :meth:`~gensim.models.nmf.Nmf.show_topic` that represents words by the actual strings.

        Parameters
        ----------
        topicid : int
            The ID of the topic to be returned
        topn : int, optional
            Number of the most significant words that are associated with the topic.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of (int, float)
            Word ID - probability pairs for the most relevant words generated by the topic.

        """
        topic = self._W[:, topicid]

        if normalize is None:
            normalize = self.normalize
        if normalize:
            topic /= topic.sum()

        bestn = matutils.argsort(topic, topn, reverse=True)
        return [(idx, topic[idx]) for idx in bestn]
Example #8
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the UMass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            logger.warning("LdaModel.top_topics() called with an empty corpus")
            return

        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            beststr = [(topic[id], self.id2word[id]) for id in bestn]
            str_topics.append(beststr)

        # top_ids are limited to every topic's top words. should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # create a document occurrence sparse matrix for each word
        doc_word_list = {}
        for id in top_ids:
            id_list = set()
            for n, document in enumerate(corpus):
                if id in frozenset(x[0] for x in document):
                    id_list.add(n)

            doc_word_list[id] = id_list

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m_index, m in enumerate(top_words[1:], start=1):
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]

                # Sum of top words l=1..m-1
                # i.e., all words ranked higher than the current word m
                # (slice by rank index, not by word id)
                for l in top_words[:m_index]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[l]

                    # make sure this word appears in some documents.
                    if len(l_docs) > 0:
                        # co_doc_frequency is D(v_m^(t), v_l^(t))
                        co_doc_frequency = len(m_docs.intersection(l_docs))

                        # add to the coherence sum for these two words m, l
                        coherence += numpy.log((co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
        return top_topics
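The quantity accumulated above is the UMass coherence, C(t) = sum over m=2..M and l=1..m-1 of log((D(v_m, v_l) + 1) / D(v_l)), where D counts document and co-document frequencies of the topic's top words. A self-contained sketch with made-up occurrence sets for three top words:

import numpy as np

doc_word_list = {7: {0, 1, 2}, 3: {0, 1}, 5: {1}}  # made-up word id -> set of doc ids
top_words = [7, 3, 5]                              # ranked by probability, best first

coherence = 0.0
for m_index, m in enumerate(top_words[1:], start=1):
    for l in top_words[:m_index]:                  # words ranked above m
        co_doc_frequency = len(doc_word_list[m] & doc_word_list[l])
        coherence += np.log((co_doc_frequency + 1.0) / len(doc_word_list[l]))
print(coherence)  # log(3/3) + log(2/3) + log(2/2) = about -0.405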
Example #9
    def show_topic(self, topicid, topn=10, num_words=None):
        """Get `num_words` most probable words for the given `topicid`.

        Parameters
        ----------
        topicid : int
            Id of topic.
        topn : int, optional
            Number of the most probable words to return.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        list of (str, float)
            Sequence of probable words, as a list of `(word, word_probability)` for `topicid` topic.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        if self.word_topics is None:
            logger.warning("Run train or load_word_topics before showing topics.")
        topic = self.word_topics[topicid]
        topic = topic / topic.sum()  # normalize to probability dist
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
        return beststr
Example #10
    def print_topics(self, ldamodel, topn=10):
        
        Lambda = ldamodel.state.get_lambda()

        Phi = Lambda / Lambda.sum(axis=1)[:, np.newaxis]
        Phi2 =  Lambda / Lambda.sum(axis=0)[np.newaxis, :]
        entropy = np.zeros(Phi2.shape[1])
        topics = ""

        # compute the entropy term E_w = sum_k p(k|w) * log p(k|w)
        for w in range(Phi2.shape[1]):
            for k in range(Phi2.shape[0]):
                entropy[w] += Phi2[k,w]*np.log2(Phi2[k,w]+1e-100)
        print(entropy)

        # compute p(w|k) * e^(-H_w)
        for k in range(Phi.shape[0]):
            for w in range(Phi.shape[1]):
                Phi[k,w] = Phi[k,w]/pow(math.e,(-1)*entropy[w])
        for k in range(Phi.shape[0]):
            bestn = matutils.argsort(Phi[k], topn, reverse=True)
            topic_terms = [(id, Phi[k,id]) for id in bestn]
            lda_words = [(ldamodel.id2word[id], value) for id, value in topic_terms]    
            topics += ' + '.join(['%.3f*%s' % (v, k) for k, v in lda_words])+"\n"
        return topics
Example #11
    def show_topic(self, topicid, time, topn=50, num_words=None):
        """Get `num_words` most probable words for the given `topicid`.

        Parameters
        ----------
        topicid : int
            Id of topic.
        time : int
            Timestamp.
        topn : int, optional
            Number of the most probable words to return.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        list of (float, str)
            Sequence of probable words, as a list of `(word_probability, word)`.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        topics = self.lambda_[:, :, time]
        topic = topics[topicid]
        # likelihood to probability
        topic = np.exp(topic)
        # normalize to probability dist
        topic = topic / topic.sum()
        # sort according to prob
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
        return beststr
Example #12
def most_similar_to_vec(vector, model, topn, list_words):
    dists = np.dot(model.syn0norm, vector)

    best = matutils.argsort(dists, topn=topn + len(list_words), reverse=True)
    # ignore (don't return) words from the input
    result = [(model.index2word[sim], float(dists[sim])) for sim in best if model.index2word[sim] not in list_words]

    return result[:topn]
Example #13
 def show_topic(self, topicid, num_words=10):
     if self.word_topics is None:
         logger.warn("Run train or load_word_topics before showing topics.")
     topic = self.word_topics[topicid]
     topic = topic / topic.sum()  # normalize to probability dist
     bestn = matutils.argsort(topic, num_words, reverse=True)
     beststr = [(topic[id], self.id2word[id]) for id in bestn]
     return beststr
Example #14
    def most_similar(self, words={}, topn=10, restrict_vocab=None):
        """
        Find the top-N most similar words. 

        words : a dict where the words are the keys and the weights are the values. 

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the original
        word2vec implementation.
        If topn is False, most_similar returns the vector of similarity scores.
        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)
        Example::
          >>> trained_model.most_similar(words={'woman': 1.0, 'king': 1.0, 'man': -1.0})
          [('queen', 0.50882536), ...]
        """
        self.init_sims()

        # if isinstance(positive, string_types) and not negative:
        #     # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
        #     positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        # positive = [
        #     (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word
        #     for word in positive
        # ]
        # negative = [
        #     (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word
        #     for word in negative
        # ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in words.items():
            if isinstance(word, ndarray):
                mean.append(weight * word)
            elif word in self.vocab:
                mean.append(weight * self.syn0norm[self.vocab[word].index])
                all_words.add(self.vocab[word].index)
            else:
                # warn about out-of-vocabulary words and skip them
                warnings.warn("word '%s' not in vocabulary" % word)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]
Example #15
 def _get_topics(self):
     """Internal helper function to return topics from a trained topic model."""
     topics = []
     if isinstance(self.model, LdaModel):
         for topic in self.model.state.get_lambda():
             bestn = argsort(topic, topn=self.topn, reverse=True)
             topics.append(bestn)
     elif isinstance(self.model, LdaVowpalWabbit):
         for topic in self.model._get_topics():
             bestn = argsort(topic, topn=self.topn, reverse=True)
             topics.append(bestn)
     elif isinstance(self.model, LdaMallet):
         for topic in self.model.word_topics:
             bestn = argsort(topic, topn=self.topn, reverse=True)
             topics.append(bestn)
     else:
         raise ValueError("This topic model is not currently supported. Supported topic models "
                          " are LdaModel, LdaVowpalWabbit and LdaMallet.")
     return topics
Example #16
    def top_topics_as_word_lists(model, dictionary, topn=20):
        if not dictionary.id2token:
            dictionary.id2token = {v: k for k, v in dictionary.token2id.items()}

        str_topics = []
        for topic in model.get_topics():
            bestn = matutils.argsort(topic, topn=topn, reverse=True)
            beststr = [dictionary.id2token[_id] for _id in bestn]
            str_topics.append(beststr)
        return str_topics
Example #17
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
        """
        For `num_topics` number of topics, return `num_words` most significant words
        (10 words per topic, by default).

        The topics are returned as a list -- a list of strings if `formatted` is
        True, or a list of `(word, probability)` 2-tuples if False.

        If `log` is True, also output this result to log.

        Unlike LSA, there is no natural ordering between the topics in LDA.
        The returned `num_topics <= self.num_topics` subset of all topics is therefore
        arbitrary and may change between two LDA training runs.

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)

            # add a little random jitter, to randomize results around the same alpha
            sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))

            sorted_topics = list(matutils.argsort(sort_alpha))
            chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]

        shown = []

        topic = self.state.get_lambda()
        for i in chosen_topics:
            topic_ = topic[i]
            topic_ = topic_ / topic_.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic_, num_words, reverse=True)
            topic_ = [(self.id2word[id], topic_[id]) for id in bestn]
            if formatted:
                topic_ = ' + '.join(['%.3f*"%s"' % (v, k) for k, v in topic_])

            shown.append((i, topic_))
            if log:
                logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic_)

        return shown
Example #18
    def most_similar(self, positive=[], negative=[], topn=10, clip_start=0, clip_end=None, indexer=None):
        """
        Find the top-N most similar docvecs known from training. Positive docs contribute
        positively towards the similarity, negative docs negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given docs. Docs may be specified as vectors, integer indexes
        of trained docvecs, or if the documents were originally presented with string tags,
        by the corresponding tags.

        The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous
        range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
        there was chosen to be significant, such as more popular tag IDs in lower indexes.)
        """
        self.init_sims()
        clip_end = clip_end or len(self.doctag_syn0norm)

        if isinstance(positive, string_types + integer_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
        positive = [
            (doc, 1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
            else doc for doc in positive
        ]
        negative = [
            (doc, -1.0) if isinstance(doc, string_types + (ndarray,) + integer_types)
            else doc for doc in negative
        ]

        # compute the weighted average of all docs
        all_docs, mean = set(), []
        for doc, weight in positive + negative:
            if isinstance(doc, ndarray):
                mean.append(weight * doc)
            elif doc in self.doctags or doc < self.count:
                mean.append(weight * self.doctag_syn0norm[self._int_index(doc)])
                all_docs.add(self._int_index(doc))
            else:
                raise KeyError("doc '%s' not in trained set" % doc)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if indexer is not None:
            return indexer.most_similar(mean, topn)

        dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
        # ignore (don't return) docs from the input
        result = [(self.index_to_doctag(sim), float(dists[sim])) for sim in best if sim not in all_docs]
        return result[:topn]
Example #19
    def most_similar_cosmul(self, positive=[], negative=[], topn=10):
        """
        Find the top-N most similar words, using the multiplicative combination objective
        proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
        positively towards the similarity, negative words negatively, but with less
        susceptibility to one large distance dominating the calculation.

        In the common analogy-solving case, of two positive and one negative examples,
        this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

        Additional positive or negative examples contribute to the numerator or denominator,
        respectively – a potentially sensible but untested extension of the method. (With
        a single positive example, rankings will be the same as in the default most_similar.)

        Example::

          >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
          [(u'iraq', 0.8488819003105164), ...]

        .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

        """
        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
            positive = [positive]

        all_words = set([self.vocab[word].index for word in positive+negative
            if not isinstance(word, ndarray) and word in self.vocab])

        positive = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in positive
        ]
        negative = [
            self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word
            for word in negative
        ]

        if not positive:
            raise ValueError("cannot compute similarity with no input")

        # equation (4) of Levy & Goldberg "Linguistic Regularities...",
        # with distances shifted to [0,1] per footnote (7)
        pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
        neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
        dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)

        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words]
        return result[:topn]
Example #20
    def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
                   coherence='u_mass', topn=20, processes=-1):
        """Get the topics sorted by coherence.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use a sliding-window-based
            probability estimator (i.e. coherence=`c_something`).
        dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Gensim dictionary mapping word ids to words, used to create the corpus.
            If `model.id2word` is present, this is not needed. If both are provided, the passed `dictionary` is used.
        window_size : int, optional
            Size of the window to be used for coherence measures using a boolean sliding window as their
            probability estimator. For 'u_mass' this doesn't matter.
            If None, the default window sizes are used: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
            Coherence measure to be used.
            The fastest method is 'u_mass'; 'c_uci' is also known as `c_pmi`.
            For 'u_mass', `corpus` should be provided; if `texts` is provided, it will be converted to a corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi', `texts` should be provided (`corpus` isn't needed).
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Returns
        -------
        list of (list of (float, str), float)
            Each element in the list is a pair of a topic representation and its coherence score. Topic representations
            are distributions over words, given as a list of `(word weight, word)` pairs.

        """
        cm = CoherenceModel(
            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
            window_size=window_size, coherence=coherence, topn=topn,
            processes=processes
        )
        coherence_scores = cm.get_coherence_per_topic()

        str_topics = []
        for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
            bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # membership, token
            str_topics.append(beststr)  # list of topn (float membership, token) tuples

        scored_topics = zip(str_topics, coherence_scores)
        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
Example #21
 def _get_topics_from_model(model, topn):
     """Internal helper function to return topics from a trained topic model."""
     try:
         return [
             matutils.argsort(topic, topn=topn, reverse=True) for topic in
             model.get_topics()
         ]
     except AttributeError:
         raise ValueError(
             "This topic model is not currently supported. Supported topic models"
             " should implement the `get_topics` method.")
Example #22
 def testAccumulatorCachingWithModelSetting(self):
     kwargs = dict(corpus=self.corpus, dictionary=self.dictionary, coherence='u_mass')
     cm1 = CoherenceModel(topics=self.topics1, **kwargs)
     cm1.estimate_probabilities()
     self.assertIsNotNone(cm1._accumulator)
     cm1.model = self.ldamodel
     topics = []
     for topic in self.ldamodel.state.get_lambda():
         bestn = argsort(topic, topn=cm1.topn, reverse=True)
         topics.append(bestn)
     self.assertTrue(np.array_equal(topics, cm1.topics))
     self.assertIsNone(cm1._accumulator)
Example #23
    def show_topic(self, topicid, topn=10, num_words=None):
        if num_words is not None:  # deprecated num_words is used
            warnings.warn("The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead.")
            topn = num_words

        if self.word_topics is None:
            logger.warning("Run train or load_word_topics before showing topics.")
        topic = self.word_topics[topicid]
        topic = topic / topic.sum()  # normalize to probability dist
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
        return beststr
Example #24
    def get_topic_terms(self, topicid, topn=10):
        """
        Return a list of `(word_id, probability)` 2-tuples for the most
        probable words in topic `topicid`.

        Only return 2-tuples for the topn most probable words (ignore the rest).

        """
        topic = self.state.get_lambda()[topicid]
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn, reverse=True)
        return [(id, topic[id]) for id in bestn]
Example #25
  def most_similar(self, sWord, iTopN=10, fMinDist=-1.0):
    npaWord_unit = self.getUnitVector(sWord)

    if npaWord_unit is None:
      return None

    npaCosineSimilarities = np.dot(self.npaWordEmbeddings_units, npaWord_unit)

    npaBestIndices = \
        matutils.argsort(npaCosineSimilarities, topn=iTopN +1, reverse=True)

    # npaBestIndices[1:] - Ignore the first one (which is sWord itself)
    return [(self.oVocab.index2word(x), npaCosineSimilarities[x]) for x in npaBestIndices[1:] if npaCosineSimilarities[x] > fMinDist]
Example #26
  def most_similar_simple(self, sWord, iTopN=10):
    npaWordEmbedding = self[sWord]

    if npaWordEmbedding is None:
      return None

    npaSimilarities = np.dot(self.npaWordEmbeddings, npaWordEmbedding)

    npaBestIndices = \
        matutils.argsort(npaSimilarities, topn=iTopN +1, reverse=True)

    # npaBestIndices[1:] - Ignore the first one (which is sWord itself)
    return [(self.oVocab.index2word(x), npaSimilarities[x]) for x in npaBestIndices[1:]]
Example #27
 def print_topic(self, topic, time=0, top_terms=20):
     """
     Topic is the topic number
     Time is for a particular time_slice
     top_terms is the number of terms to display
     """
     topic = self.topic_chains[topic].e_log_prob
     topic = numpy.transpose(topic)
     topic = numpy.exp(topic[time])
     topic = topic / topic.sum()
     bestn = matutils.argsort(topic, top_terms, reverse=True)
     beststr = [(self.id2word[id_], round(topic[id_], 3)) for id_ in bestn]
     return beststr
Example #28
    def show_topic(self, topicid, topn=10, num_words=None):
        if num_words is not None:  # deprecated num_words is used
            logger.warning("The parameter num_words for show_topic() would be deprecated in the updated version.")
            logger.warning("Please use topn instead.")
            topn = num_words

        if self.word_topics is None:
            logger.warning("Run train or load_word_topics before showing topics.")
        topic = self.word_topics[topicid]
        topic = topic / topic.sum()  # normalize to probability dist
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(self.id2word[idx], topic[idx]) for idx in bestn]
        return beststr
Example #29
    def show_topic(self, topicid, topn=10):
        """
        Return a list of `(words_probability, word)` 2-tuples for the most probable
        words in topic `topicid`.

        Only return 2-tuples for the topn most probable words (ignore the rest).

        """
        topic = self.state.get_lambda()[topicid]
        topic = topic / topic.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[id], self.id2word[id]) for id in bestn]
        return beststr
Example #30
def reject_words_1(A, B, model = model):
  '''Takes two **LIST OF WORDS** and
  returns most_similar for word A, while rejecting words with meanings closer to B.
  Seems to work better than just giving in negative words.
  ''' 
  in_words = A+B
  basic_word = [model[each] for each in A]
  reject_word = [model[each] for each in B]
  basic_mean = matutils.unitvec(array(basic_word).mean(axis=0)).astype(REAL)
  reject_mean = matutils.unitvec(array(reject_word).mean(axis=0)).astype(REAL)
  r = reject(basic_mean, reject_mean)
  dists = np.dot(model.syn0norm, r)
  best = matutils.argsort(dists, topn=500, reverse=True)
  result = [(model.index2word[sim], float(dists[sim])) for sim in best if model.index2word[sim] not in in_words]
  return result
Example #31
    def mostSimilarSent(self, sent, query, allDoc, topn):

        words2 = query.split()

        try:
            words2.remove(u'\ufeff')
        except ValueError:
            words2 = words2

        v2 = numpy.array([self[word] for word in words2], dtype=object)
        mean = matutils.unitvec(array(v2).mean(axis=0))

        print "starting search dist"
        dists = dot(allDoc[0:None], mean)
        best = matutils.argsort(dists, topn, reverse=True)
        print "done!"
        result = []
        for index in best:
            result.append(sent[index])

        return result
Example #32
def temporal_change(ldaseq, whichtopic=0,wordtime=0,npick=5):
    # for topic 0, pick the top 5 words at time 0 and see their frequency evolution
    # TODO: pick random 5 words (top words at different times)

    topicP = ldaseq.topic_chains[whichtopic].e_log_prob
    topicP = np.transpose(topicP)
    wordids = matutils.argsort(topicP[wordtime], npick, reverse=True)

    wfreqs = np.empty((len(time_slice),npick))
    for kt in range(len(time_slice)):
        topic = np.exp(topicP[kt])
        topic = topic/sum(topic)

        wfreqs[kt] = np.array([topic[id_] for id_ in wordids])


    plt.plot(wfreqs,'-+')
    plt.yticks(wfreqs[0],[ldaseq.id2word[id_] for id_ in wordids])
    plt.xticks(np.arange(len(time_slice)))
    plt.title('topic %d'%(whichtopic+1))
    plt.show()
Example #33
def aysn_file_flush(dists_all_temp, prei_temp):
    sim_num = dists_all_temp.shape[1]
    line_msgs = ''
    for j in range(sim_num):
        real_index = prei_temp + j
        dists = dists_all_temp[:, j]
        uquery = index2word[real_index]
        best = matutils.argsort(dists, 100, reverse=True)
        bestwords = [
            index2word[simindex] + '(' + str(dists[simindex]) + ')'
            for simindex in best
            if simindex != real_index and dists[simindex] >= 0.5
        ]
        if len(bestwords) == 0:
            continue
        line_msg = uquery + '\t' + ' '.join(bestwords)
        line_msgs += line_msg + '\n'
    mutex.acquire()
    simoutpathfile.write(line_msgs.encode('utf-8'))
    simoutpathfile.flush()
    mutex.release()
Example #34
def paper2vec_recommend(context):
    """ Make recommendations based on the Paper2vec vectors."""
    #if not hasattr(papervecs, 'syn0'):
    #    raise RuntimeError("Parameters required for predicting the output words not found.")
    topn = 500
    context_words_list = context.split()
    sleep(0.3)
    # REMEMBER: Here, papervecs.wv.vocab contains not words, but docids
    # Use the doc2vec wv 
    word_vocabs = [model.wv.vocab[w] for w in context_words_list if w in model.wv.vocab]
    word2_indices = [word.index for word in word_vocabs]
    l1 = np.sum(model.wv.syn0[word2_indices], axis=0)
    if word2_indices:
        l1 /= len(word2_indices)
    prob_values = np.exp(np.dot(l1, papervecs.syn0.T))
    #prob_values = np.exp(np.dot(l1, model.docvecs.doctag_syn0.T))
    prob_values = np.nan_to_num(prob_values)
    prob_values /= sum(prob_values)
    # some of the vectors in papervecs stand for docs, some just for words (where are these ids coming from?)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    return [papervecs.index2entity[index1] for index1 in top_indices]
Example #35
    def show_topic(self, topicno, topn=10):
        """
        Return a specified topic (=left singular vector), 0 <= `topicno` < `self.num_topics`,
        as a string.

        Return only the `topn` words which contribute the most to the direction
        of the topic (both negative and positive).

        >>> lsimodel.show_topic(10, topn=5)
        [("category", -0.340), ("$M$", 0.298), ("algebra", 0.183), ("functor", -0.174), ("operator", -0.168)]

        """
        # size of the projection matrix can actually be smaller than `self.num_topics`,
        # if there were not enough factors (real rank of input matrix smaller than
        # `self.num_topics`). in that case, return an empty string
        if topicno >= len(self.projection.u.T):
            return ''
        c = np.asarray(self.projection.u.T[topicno, :]).flatten()
        norm = np.sqrt(np.sum(np.dot(c, c)))
        most = matutils.argsort(np.abs(c), topn, reverse=True)
        return [(self.id2word[val], 1.0 * c[val] / norm) for val in most]
Example #36
    def get_topic_terms(self, topic_id, topn=10, readable=True):
        # TODO move this and similar methods to parent class
        """

        Args:
            topic_id:
            topn:
            readable: If False returns term_id, if True returns the actual word.

        Returns:
             A list of tuples (term, prob) of the topn terms in topic_id, formatted according to `readable`.

        """

        topic_term_probs = self.phi[topic_id]
        bestn = matutils.argsort(topic_term_probs, topn, reverse=True)
        if readable:
            return [(self.id2word[idx], topic_term_probs[idx])
                    for idx in bestn]
        else:
            return [(idx, topic_term_probs[idx]) for idx in bestn]
Example #37
    def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None):
        if dictionary is None:
            raise ValueError('no dictionary!')

        if topic_data is not None:
            topics = topic_data
        elif topic_file is not None:
            topics = np.loadtxt('%s' % topic_file)
        else:
            raise ValueError('no topic data!')

        # sort topics
        topics_sums = np.sum(topics, axis=1)
        idx = matutils.argsort(topics_sums, reverse=True)
        self.data = topics[idx]

        self.dictionary = dictionary

        if style is None:
            style = self.STYLE_GENSIM

        self.style = style
Example #38
    def show_topics(self,
                    num_topics: int = 10,
                    num_words: int = 10,
                    log: bool = False) -> list[tuple[int, list[tuple[str, float]]]]:
        """Get the `num_words` most probable words for `num_topics` number of topics.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to return, set `-1` to get all topics.
        num_words : int, optional
            Number of words.
        log : bool, optional
            If True, also write the topics to the log; used for debugging purposes.

        Returns
        -------
        list of (int, list of (str, float))
            Topics as a list of `(topic_id, [(word, weight), ...])` pairs.

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)
            # add a little random jitter, to randomize results around the same alpha
            sort_alpha = self.alpha + 0.0001 * numpy.random.rand(
                len(self.alpha))
            sorted_topics = list(matutils.argsort(sort_alpha))
            chosen_topics = sorted_topics[:num_topics //
                                          2] + sorted_topics[-num_topics // 2:]
        shown = []
        for i in chosen_topics:
            topic = self.show_topic(i, topn=num_words)
            shown.append((i, topic))
            if log:
                logger.info("topic #%i (%.3f): %s", i, self.alpha[i], topic)
        return shown
Example #39
def compute_dt_dist(docs, labels, tags, model, max_len, batch_size, pad_id, idxvocab, output_file):
    #generate batches
    num_batches = int(math.ceil(float(len(docs)) / batch_size))
    dt_dist = []
    t = []
    combined = []
    docid = 0
    for i in xrange(num_batches):
        x, _, _, t, s = get_batch_doc(docs, labels, tags, i, max_len, cf.tag_len, batch_size, pad_id)
        attention, mean_topic = sess.run([model.attention, model.mean_topic], {model.doc: x, model.tag: t})
        dt_dist.extend(attention[:s])

        if debug:
            for si in xrange(s):
                d = x[si]
                print "\n\nDoc", docid, "=", " ".join([idxvocab[item] for item in d if (item != pad_id)])
                sorted_dist = matutils.argsort(attention[si], reverse=True)
                for ti in sorted_dist:
                    print "Topic", ti, "=", attention[si][ti]
                docid += 1

    np.save(open(output_file, "w"), dt_dist)
Example #40
    def show_topic(self, topicid, time, topn=50, num_words=None):
        """
        Return `num_words` most probable words for the given `topicid`, as a list of
        `(word_probability, word)` 2-tuples.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn(
                "The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead."
            )
            topn = num_words

        topics = self.lambda_[:, :, time]
        topic = topics[topicid]
        # likelihood to probability
        topic = np.exp(topic)
        # normalize to probability dist
        topic = topic / topic.sum()
        # sort according to prob
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
        return beststr
Example #41
    def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
        """
        For `num_topics` number of topics, return `num_words` most significant words
        (10 words per topic, by default).

        The topics are returned as a list -- a list of strings if `formatted` is
        True, or a list of (probability, word) 2-tuples if False.

        If `log` is True, also output this result to log.

        Unlike LSA, there is no natural ordering between the topics in LDA.
        The returned `num_topics <= self.num_topics` subset of all topics is therefore
        arbitrary and may change between two LDA training runs.

        """
        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)

            # add a little random jitter, to randomize results around the same alpha
            sort_alpha = self.alpha + 0.0001 * numpy.random.rand(len(self.alpha))

            sorted_topics = list(matutils.argsort(sort_alpha))
            chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]

        shown = []
        for i in chosen_topics:
            if formatted:
                topic = self.print_topic(i, topn=num_words)
            else:
                topic = self.show_topic(i, topn=num_words)

            shown.append(topic)
            if log:
                logger.info("topic #%i (%.3f): %s" % (i, self.alpha[i], topic))

        return shown
Example #42
def get_nodes_link_to(model, node, topn):
    """
    re-use implementation of gensim

    :param model: gensim.models.Word2Vec
    :param node:
    :param topn:
    :return:
    """

    word_vocabs = [model.wv.vocab[node]]
    word2_indices = [word.index for word in word_vocabs]

    l1 = np.sum(model.trainables.syn1neg[word2_indices], axis=0)
    if word2_indices and model.cbow_mean:
        l1 /= len(word2_indices)

    prob_values = np.exp(np.dot(l1, model.wv.vectors.T))
    prob_values /= sum(prob_values)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    return [(model.wv.index2word[index1], prob_values[index1])
            for index1 in top_indices]
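The core of this helper is a softmax of the summed context (IN) vector against the output (OUT) embedding matrix, followed by a top-n argsort. A standalone numpy sketch of that step with made-up random matrices (not the gensim API):

import numpy as np

rng = np.random.default_rng(0)
l1 = rng.normal(size=8)                 # summed context (IN) vector
out_vectors = rng.normal(size=(5, 8))   # one OUT vector per vocabulary entry

scores = np.exp(out_vectors @ l1)
prob_values = scores / scores.sum()          # softmax over the 5-word vocabulary
top_indices = np.argsort(-prob_values)[:3]   # same effect as matutils.argsort(..., topn=3, reverse=True)
print(top_indices, prob_values[top_indices])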
Example #43
def predict_output_word(model,
                        context_words_list,
                        topn=10,
                        do_sorting=True,
                        possible_mutations=None,
                        vocab_indices=None):
    """Modified function from method of Word2Vec class from gensim library"""
    word_vocabs = [
        model.wv.vocab[w] for w in context_words_list if w in model.wv.vocab
    ]

    word2_indices = [word.index for word in word_vocabs]

    l1 = np.sum(model.wv.vectors[word2_indices], axis=0)
    if word2_indices and model.cbow_mean:
        l1 /= len(word2_indices)

    # propagate hidden -> output and take softmax to get probabilities
    prob_values = np.exp(np.dot(l1, model.trainables.syn1neg.T))
    prob_values /= sum(prob_values)

    if do_sorting:
        top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)

    else:
        top_indices = list(range(len(prob_values)))[:topn]

    if not possible_mutations and not vocab_indices:
        # returning the most probable output words with their probabilities
        return [(model.wv.index2word[index1], prob_values[index1])
                for index1 in top_indices]
    elif vocab_indices:
        return [prob_values[index1] for index1 in vocab_indices]
    else:
        return [
            prob_values[index1] for index1 in top_indices
            if model.wv.index2word[index1] in possible_mutations
        ]
Example #44
    def show_topic(self, topicid, time, topn=50, num_words=None):
        """
        Return `num_words` most probable words for the given `topicid`, as a list of
        `(word_probability, word)` 2-tuples.

        """
        if num_words is not None:  # deprecated num_words is used
            logger.warning(
                "The parameter num_words for show_topic() would be deprecated in the updated version."
            )
            logger.warning("Please use topn instead.")
            topn = num_words

        topics = self.lambda_[:, :, time]
        topic = topics[topicid]
        # likelihood to probability
        topic = np.exp(topic)
        # normalize to probability dist
        topic = topic / topic.sum()
        # sort according to prob
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[id], self.id2word[id]) for id in bestn]
        return beststr
Example #45
def hd2v_recommend(context):
    """ Recommend based on the hyperdoc2vec model using IN and OUT vectors"""
    topn = 500
    context_words_list = context.split()
    word_vocabs = [
        hd2vmodel.wv.vocab[w] for w in context_words_list
        if w in hd2vmodel.wv.vocab
    ]
    word2_indices = [word.index for word in word_vocabs]
    sleep(0.2)
    # Get the sum of the IN word vectors
    l1 = np.sum(hd2vmodel.wv.syn0[word2_indices], axis=0)
    # And the sum of the OUT word vectors
    l2 = np.sum(hd2vmodel.syn1neg[word2_indices], axis=0)
    if word2_indices:
        l2 /= len(word2_indices)
        l1 /= len(word2_indices)
    # Following hd2v code, e^(sumwvIN.docvecIN + sumwvOUT.docvecOUT)
    prob_values = exp(
        dot(l1, hd2vmodel.docvecs.doctag_syn1neg.T) +
        dot(l2, hd2vmodel.docvecs.doctag_syn0.T))
    prob_values = nan_to_num(prob_values)
    top_indices = matutils.argsort(prob_values, topn=topn, reverse=True)
    return [hd2vmodel.docvecs.offset2doctag[index1] for index1 in top_indices]
Example #46
def calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index,
                           top_vec):
    """
      最相似的句子,句向量与矩阵点乘
    :param vec: 
    :param matrix: 
    :param keys: 
    :param topn: 
    :return: 
    """
    # Normalize the question vector: scale it to unit length (the zero vector is returned unchanged).
    vec_ques_mean = matutils.unitvec(np.array(
        [vec_ques]).mean(axis=0)).astype(numpy_type)
    # Dot the matrix with the query, i.e. the question against every question in the standard-question bank
    matrix_vec_dot = np.dot(matrix_org_norm, vec_ques_mean)
    # Sort by similarity
    most_similar_sentence_vec_sort = matutils.argsort(matrix_vec_dot,
                                                      topn=top_vec,
                                                      reverse=True)
    # Get the index and score of the most similar standard questions
    index_score = []
    for t in most_similar_sentence_vec_sort[:top_vec]:
        index_score.append([matrix_org_index[t], float(matrix_vec_dot[t])])
    return index_score
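A hedged usage sketch for the function above, with made-up sentence vectors; it assumes the snippet's `np`, `matutils` and `numpy_type` globals are in scope (here `numpy_type` is set to float32):

import numpy as np

numpy_type = np.float32  # the dtype global the snippet above relies on

# three candidate sentence vectors, normalized row-wise (made-up data)
matrix_org = np.array([[1.0, 0.0], [0.6, 0.8], [0.0, 1.0]], dtype=numpy_type)
matrix_org_norm = matrix_org / np.linalg.norm(matrix_org, axis=1, keepdims=True)
matrix_org_index = ["q1", "q2", "q3"]

vec_ques = np.array([0.9, 0.1], dtype=numpy_type)
print(calculate_text_similar(vec_ques, matrix_org_norm, matrix_org_index, top_vec=2))
# -> [['q1', 0.99...], ['q2', 0.68...]]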
Example #47
    def show_topic(self, topicid, time, topn=50, num_words=None):
        """Get `num_words` most probable words for the given `topicid`.

        Parameters
        ----------
        topicid : int
            Id of topic.
        time : int
            Timestamp.
        topn : int, optional
            Number of the most probable words to return.
        num_words : int, optional
            DEPRECATED PARAMETER, use `topn` instead.

        Returns
        -------
        list of (float, str)
            Sequence of probable words, as a list of `(word_probability, word)`.

        """
        if num_words is not None:  # deprecated num_words is used
            warnings.warn(
                "The parameter `num_words` is deprecated, will be removed in 4.0.0, use `topn` instead."
            )
            topn = num_words

        topics = self.lambda_[:, :, time]
        topic = topics[topicid]
        # likelihood to probability
        topic = np.exp(topic)
        # normalize to probability dist
        topic = topic / topic.sum()
        # sort according to prob
        bestn = matutils.argsort(topic, topn, reverse=True)
        beststr = [(topic[idx], self.id2word[idx]) for idx in bestn]
        return beststr
Example #48
    def top_topics(self, corpus, num_words=20):
        """
        Calculate the UMass topic coherence for each topic. Algorithm from
        **Mimno, Wallach, Talley, Leenders, McCallum: Optimizing Semantic Coherence in Topic Models, EMNLP 2011.**
        """
        is_corpus, corpus = utils.is_corpus(corpus)
        if not is_corpus:
            logger.warning("LdaModel.top_topics() called with an empty corpus")
            return

        topics = []
        str_topics = []
        for topic in self.state.get_lambda():
            topic = topic / topic.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic, topn=num_words, reverse=True)
            topics.append(bestn)
            beststr = [(topic[id], self.id2word[id]) for id in bestn]
            str_topics.append(beststr)

        # top_ids are limited to every topic's top words. should not exceed the
        # vocabulary size.
        top_ids = set(chain.from_iterable(topics))

        # create a document occurrence sparse matrix for each word
        doc_word_list = {}
        for id in top_ids:
            id_list = set()
            for n, document in enumerate(corpus):
                if id in frozenset(x[0] for x in document):
                    id_list.add(n)

            doc_word_list[id] = id_list

        coherence_scores = []
        for t, top_words in enumerate(topics):
            # Calculate each coherence score C(t, top_words)
            coherence = 0.0
            # Sum of top words m=2..M
            for m in top_words[1:]:
                # m_docs is v_m^(t)
                m_docs = doc_word_list[m]
                m_index = numpy.where(top_words == m)[0][0]

                # Sum of top words l=1..m-1
                # i.e., all words ranked higher than the current word m
                for l in top_words[:m_index]:
                    # l_docs is v_l^(t)
                    l_docs = doc_word_list[l]

                    # make sure this word appears in some documents.
                    if len(l_docs) > 0:
                        # co_doc_frequency is D(v_m^(t), v_l^(t))
                        co_doc_frequency = len(m_docs.intersection(l_docs))

                        # add to the coherence sum for these two words m, l
                        coherence += numpy.log(
                            (co_doc_frequency + 1.0) / len(l_docs))

            coherence_scores.append((str_topics[t], coherence))

        top_topics = sorted(coherence_scores, key=lambda t: t[1], reverse=True)
        return top_topics
Example #49
 def label_rank(self, X):
     scores = self.scores(X)
     return scores, argsort(scores, reverse=True)
Example #50
    def most_similar(
        self,
        positive: [int, ndarray] = None,
        negative: [int, ndarray] = None,
        indexable: [IndexedList, IndexedLineDocument] = None,
        topn: int = 10,
        restrict_size: [int, Tuple[int, int]] = None,
    ) -> List[Tuple[int, float]]:
        """Find the top-N most similar sentences.
        Positive sentences contribute positively towards the similarity, negative sentences negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given sentences and the vectors for each sentence in the model.

        Parameters
        ----------
        positive : list of int, optional
            List of indices that contribute positively.
        negative : list of int, optional
            List of indices that contribute negatively.
        indexable: list, IndexedList, IndexedLineDocument
            Provides an indexable object from where the most similar sentences are read
        topn : int or None, optional
            Number of top-N similar sentences to return, when `topn` is int. When `topn` is None,
            then similarities for all sentences are returned.
        restrict_size : int or Tuple(int,int), optional
            Optional integer which limits the range of vectors which
            are searched for most-similar values. For example, restrict_size=10000 would
            only check the first 10000 sentence vectors.
            restrict_size=(500, 1000) would search the sentence vectors with indices between
            500 and 1000.

        Returns
        -------
        list of (int, float) or list of (str, int, float)
            A sequence of (index, similarity) is returned.
            When an indexable is provided, returns (str, index, similarity)
            When `topn` is None, then similarities for all sentences are returned as a
            one-dimensional numpy array sized by the number of stored sentences.

        """
        if indexable is not None and not hasattr(indexable, "__getitem__"):
            raise RuntimeError("Indexable must provide __getitem__")
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, (int, integer)) and not negative:
            positive = [positive]
        if isinstance(positive, (ndarray)) and not negative:
            if len(positive.shape) == 1:
                positive = [positive]

        positive = [
            (sent, 1.0) if isinstance(sent, (int, integer, ndarray)) else sent
            for sent in positive
        ]
        negative = [
            (sent, -1.0) if isinstance(sent, (int, integer, ndarray)) else sent
            for sent in negative
        ]

        all_sents, mean = set(), []
        for sent, weight in positive + negative:
            if isinstance(sent, ndarray):
                mean.append(weight * sent)
            else:
                mean.append(weight *
                            self.get_vector(index=sent, use_norm=True))
                if sent in self:
                    all_sents.add(sent)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if isinstance(restrict_size, (int, integer)):
            lo, hi = 0, restrict_size
        elif isinstance(restrict_size, Tuple):
            lo, hi = restrict_size
        else:
            lo, hi = 0, None

        limited = (self.vectors_norm
                   if restrict_size is None else self.vectors_norm[lo:hi])
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists,
                                topn=topn + len(all_sents),
                                reverse=True)
        best_off = best + lo

        if indexable is not None:
            result = [(indexable[off_idx], off_idx, float(dists[idx]))
                      for off_idx, idx in zip(best_off, best)
                      if off_idx not in all_sents]
        else:
            result = [(off_idx, float(dists[idx]))
                      for off_idx, idx in zip(best_off, best)
                      if off_idx not in all_sents]
        return result[:topn]
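A minimal numpy sketch of the core pattern in most_similar above (unit-normalise, average the query vectors, dot against all stored vectors, argsort); the arrays below are random stand-ins, not part of the class:

import numpy as np
from gensim import matutils

vectors_norm = np.random.rand(100, 50).astype(np.float32)
vectors_norm /= np.linalg.norm(vectors_norm, axis=1, keepdims=True)

query = [3, 17]  # sentence indices that contribute positively
mean = matutils.unitvec(vectors_norm[query].mean(axis=0))
dists = vectors_norm.dot(mean)
best = matutils.argsort(dists, topn=5 + len(query), reverse=True)
print([(int(i), float(dists[i])) for i in best if i not in query][:5])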
Example #51
0
    def most_similar(self,
                     positive=[],
                     negative=[],
                     topn=10,
                     clip_start=0,
                     clip_end=None):
        """
        Find the top-N most similar docvecs known from training. Positive docs contribute
        positively towards the similarity, negative docs negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given docs. Docs may be specified as vectors, integer indexes
        of trained docvecs, or if the documents were originally presented with string tags,
        by the corresponding tags.

        The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous
        range of the underlying doctag_syn0norm vectors. (This may be useful if the ordering
        there was chosen to be significant, such as more popular tag IDs in lower indexes.)
        """
        self.init_sims()
        clip_end = clip_end or len(self.doctag_syn0norm)

        if isinstance(positive, string_types + integer_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs
        positive = [
            (doc,
             1.0) if isinstance(doc, string_types + (ndarray, ) +
                                integer_types) else doc for doc in positive
        ]
        negative = [
            (doc,
             -1.0) if isinstance(doc, string_types + (ndarray, ) +
                                 integer_types) else doc for doc in negative
        ]

        # compute the weighted average of all docs
        all_docs, mean = set(), []
        for doc, weight in positive + negative:
            if isinstance(doc, ndarray):
                mean.append(weight * doc)
            elif doc in self.doctags or doc < self.count:
                mean.append(weight *
                            self.doctag_syn0norm[self._int_index(doc)])
                all_docs.add(self._int_index(doc))
            else:
                raise KeyError("doc '%s' not in trained set" % doc)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean)
        if not topn:
            return dists
        best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True)
        # ignore (don't return) docs from the input
        result = [(self.index_to_doctag(sim), float(dists[sim]))
                  for sim in best if sim not in all_docs]
        return result[:topn]
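A hedged usage sketch for the doc2vec variant above: train a tiny Doc2Vec model and query by tag. The corpus and parameter values are illustrative only; in gensim 4.x the `docvecs` attribute is also exposed as `dv`.

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [
    TaggedDocument(words=["human", "machine", "interface"], tags=["doc0"]),
    TaggedDocument(words=["graph", "minors", "survey"], tags=["doc1"]),
    TaggedDocument(words=["user", "interface", "system"], tags=["doc2"]),
]
model = Doc2Vec(docs, vector_size=16, min_count=1, epochs=10)

# top documents most similar to 'doc0'; the query doc itself is excluded from the result
print(model.docvecs.most_similar(positive=["doc0"], topn=2))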
Example #52
0
    def show_topics(self,
                    num_topics=10,
                    num_words=10,
                    log=False,
                    formatted=True,
                    normalize=None):
        """Get the topics sorted by sparsity.

        Parameters
        ----------
        num_topics : int, optional
            Number of topics to be returned. Unlike LSA, there is no natural ordering between the topics in NMF.
            The returned subset of all topics is therefore arbitrary and may change between two NMF
            training runs.
        num_words : int, optional
            Number of words to be presented for each topic. These will be the most relevant words (assigned the highest
            probability for each topic).
        log : bool, optional
            Whether the result is also logged, besides being returned.
        formatted : bool, optional
            Whether the topic representations should be formatted as strings. If False, they are returned as
            lists of (word, probability) pairs.
        normalize : bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.

        Returns
        -------
        list of {str, tuple of (str, float)}
            a list of topics, each represented either as a string (when `formatted` == True) or word-probability
            pairs.

        """
        if normalize is None:
            normalize = self.normalize

        # Compute fraction of zero elements in each column

        sparsity = np.zeros(self._W.shape[1])

        for row in self._W:
            sparsity += (row == 0)

        sparsity /= self._W.shape[0]

        if num_topics < 0 or num_topics >= self.num_topics:
            num_topics = self.num_topics
            chosen_topics = range(num_topics)
        else:
            num_topics = min(num_topics, self.num_topics)

            sorted_topics = list(matutils.argsort(sparsity))
            chosen_topics = (sorted_topics[:num_topics // 2] +
                             sorted_topics[-num_topics // 2:])

        shown = []

        topics = self.get_topics(normalize=normalize)

        for i in chosen_topics:
            topic = topics[i]
            bestn = matutils.argsort(topic, num_words, reverse=True).ravel()
            topic = [(self.id2word[id], topic[id]) for id in bestn]
            if formatted:
                topic = " + ".join(['%.3f*"%s"' % (v, k) for k, v in topic])

            shown.append((i, topic))
            if log:
                logger.info("topic #%i (%.3f): %s", i, sparsity[i], topic)

        return shown
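A short sketch of the per-topic sparsity score used above: the fraction of zero entries in each column of the term-topic matrix W (the toy matrix is for illustration only):

import numpy as np

W = np.array([[0.0, 0.2],
              [0.5, 0.0],
              [0.0, 0.3]])
sparsity = (W == 0).sum(axis=0) / W.shape[0]  # fraction of zeros per topic column
print(sparsity)  # [0.66666667 0.33333333]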
Example #53
0
def get_labels(topic_num):
    valdoc2vec = 0.0
    valword2vec = 0.0
    cnt = 0
    store_indices = []

    print "Processing Topic number " + str(topic_num)
    for item in topic_list[topic_num]:
        try:
            # The word vector of the topic word from the trained doc2vec model
            tempdoc2vec = model1.syn0norm[model1.vocab[item].index]
        except KeyError:
            pass
        else:
            # Unit vector of the topic word
            meandoc2vec = matutils.unitvec(tempdoc2vec).astype(REAL)
            # Dot product of all doc2vec label vectors with the topic word's unit vector
            distsdoc2vec = dot(model1.docvecs.doctag_syn0norm, meandoc2vec)
            valdoc2vec = valdoc2vec + distsdoc2vec

        try:
            # The word vector of the topic word from the trained word2vec model
            tempword2vec = model2.syn0norm[model2.vocab[item].index]
        except KeyError:
            pass
        else:
            # Unit vector of the topic word
            meanword2vec = matutils.unitvec(tempword2vec).astype(REAL)
            # Dot product of all candidate labels in the word2vec vocab with the topic word's unit vector
            distsword2vec = dot(model3, meanword2vec)

            # If the topic word is itself a candidate label in the trained word2vec model, the
            # dot product of that label with the topic word must not be taken into account.
            # We zero it here and, further down, also exclude it when averaging that label
            # over all topic words.
            if model2.vocab[item].index in w_indices:
                i_val = w_indices.index(model2.vocab[item].index)
                store_indices.append(i_val)
                distsword2vec[i_val] = 0.0
            valword2vec = valword2vec + distsword2vec

    # Average the doc2vec and word2vec similarity vectors over all topic words
    avgdoc2vec = valdoc2vec / float(len(topic_list[topic_num]))
    avgword2vec = valword2vec / float(len(topic_list[topic_num]))

    # argsort and get the top 100 doc2vec label indices
    bestdoc2vec = matutils.argsort(avgdoc2vec, topn=100, reverse=True)
    resultdoc2vec = []
    # Get the doc2vec labels from indices
    for elem in bestdoc2vec:
        ind = d_indices[elem]
        temp = model1.docvecs.index_to_doctag(ind)
        resultdoc2vec.append((temp, float(avgdoc2vec[elem])))

    # Correct the average word2vec vector for cases in which the word2vec label was the same as a topic word.
    for element in store_indices:
        avgword2vec[element] = (avgword2vec[element] * len(
            topic_list[topic_num])) / (float(len(topic_list[topic_num]) - 1))

    # argsort and get the top 100 word2vec label indices
    bestword2vec = matutils.argsort(avgword2vec, topn=100, reverse=True)
    # Get the word2vec labels from indices
    resultword2vec = []
    for element in bestword2vec:
        ind = w_indices[element]
        temp = model2.index2word[ind]
        resultword2vec.append((temp, float(avgword2vec[element])))

    # Get the combined set of both doc2vec labels and word2vec labels
    comb_labels = list(
        set([i[0] for i in resultdoc2vec] + [i[0] for i in resultword2vec]))
    newlist_doc2vec = []
    newlist_word2vec = []

    # Get indices from combined labels
    for elem in comb_labels:
        try:
            newlist_doc2vec.append(
                d_indices.index(model1.docvecs.doctags[elem].offset))
            temp = get_word(elem)
            newlist_word2vec.append(w_indices.index(model2.vocab[temp].index))
        except (KeyError, ValueError):
            pass
    newlist_doc2vec = list(set(newlist_doc2vec))
    newlist_word2vec = list(set(newlist_word2vec))

    # Finally, get the labels back from the indices, with their scores from both the doc2vec and word2vec models
    resultlist_doc2vecnew = [(model1.docvecs.index_to_doctag(d_indices[elem]),
                              float(avgdoc2vec[elem]))
                             for elem in newlist_doc2vec]
    resultlist_word2vecnew = [(model2.index2word[w_indices[elem]],
                               float(avgword2vec[elem]))
                              for elem in newlist_word2vec]

    # Finally, compute the combined score for each label. The label reported is the doc2vec one, not the word2vec one.
    new_score = []
    for item in resultlist_word2vecnew:
        k, v = item
        for elem in resultlist_doc2vecnew:
            k2, v2 = elem
            k3 = get_word(k2)
            if k == k3:
                v3 = v + v2
                new_score.append((k2, v3))
    new_score = sorted(new_score, key=lambda x: x[1], reverse=True)
    return new_score[:(int(args.num_cand_labels))]
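A toy numpy sketch of the labelling idea in get_labels above: average each candidate label's similarity to all words of a topic, then rank labels with matutils.argsort (all vectors are random stand-ins; model1, model2 and topic_list are not available here):

import numpy as np
from gensim import matutils

label_vecs = np.random.rand(50, 20)              # candidate-label vectors
label_vecs /= np.linalg.norm(label_vecs, axis=1, keepdims=True)
topic_word_vecs = np.random.rand(10, 20)         # vectors of the topic's top words

sims = np.zeros(len(label_vecs))
for w in topic_word_vecs:
    sims += label_vecs.dot(matutils.unitvec(w))  # similarity of every label to this topic word
sims /= len(topic_word_vecs)                     # average similarity per label

best = matutils.argsort(sims, topn=5, reverse=True)
print([(int(i), float(sims[i])) for i in best])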
Example #54
0
    def most_similar_cosmul(self, positive=None, negative=None, topn=10):
        """
        Find the top-N most similar words, using the multiplicative combination objective
        proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute
        positively towards the similarity, negative words negatively, but with less
        susceptibility to one large distance dominating the calculation.

        In the common analogy-solving case, of two positive and one negative examples,
        this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg.

        Additional positive or negative examples contribute to the numerator or denominator,
        respectively – a potentially sensible but untested extension of the method. (With
        a single positive example, rankings will be the same as in the default most_similar.)

        Example::

          >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london'])
          [(u'iraq', 0.8488819003105164), ...]

        .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014.

        """
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog'])
            positive = [positive]

        all_words = {
            self.vocab[word].index
            for word in positive + negative
            if not isinstance(word, ndarray) and word in self.vocab
        }

        positive = [
            self.word_vec(word, use_norm=True) if isinstance(
                word, string_types) else word for word in positive
        ]
        negative = [
            self.word_vec(word, use_norm=True) if isinstance(
                word, string_types) else word for word in negative
        ]

        if not positive:
            raise ValueError("cannot compute similarity with no input")

        # equation (4) of Levy & Goldberg "Linguistic Regularities...",
        # with distances shifted to [0,1] per footnote (7)
        pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive]
        neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative]
        dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001)

        if not topn:
            return dists
        best = matutils.argsort(dists,
                                topn=topn + len(all_words),
                                reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best
                  if sim not in all_words]
        return result[:topn]
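A minimal numpy sketch of the 3CosMul score computed above, with cosine similarities shifted into [0, 1] as in footnote (7) of Levy & Goldberg; the vectors are random stand-ins:

import numpy as np
from gensim import matutils

syn0norm = np.random.rand(1000, 50)
syn0norm /= np.linalg.norm(syn0norm, axis=1, keepdims=True)
b, c = syn0norm[1], syn0norm[2]   # positive examples
a = syn0norm[3]                   # negative example

pos_dists = [(1 + syn0norm.dot(t)) / 2 for t in (b, c)]
neg_dists = [(1 + syn0norm.dot(t)) / 2 for t in (a,)]
dists = np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + 0.000001)
print(matutils.argsort(dists, topn=5, reverse=True))  # indices of the top 3CosMul candidates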
Example #55
0
    def most_similar(self,
                     positive=None,
                     negative=None,
                     topn=10,
                     restrict_vocab=None,
                     indexer=None):
        """
        Find the top-N most similar words. Positive words contribute positively towards the
        similarity, negative words negatively.

        This method computes cosine similarity between a simple mean of the projection
        weight vectors of the given words and the vectors for each word in the model.
        The method corresponds to the `word-analogy` and `distance` scripts in the original
        word2vec implementation.

        If topn is False, most_similar returns the vector of similarity scores.

        `restrict_vocab` is an optional integer which limits the range of vectors which
        are searched for most-similar values. For example, restrict_vocab=10000 would
        only check the first 10000 word vectors in the vocabulary order. (This may be
        meaningful if you've sorted the vocabulary by descending frequency.)

        Example::

          >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man'])
          [('queen', 0.50882536), ...]

        """
        if positive is None:
            positive = []
        if negative is None:
            negative = []

        self.init_sims()

        if isinstance(positive, string_types) and not negative:
            # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog'])
            positive = [positive]

        # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words
        positive = [
            (word,
             1.0) if isinstance(word, string_types + (ndarray, )) else word
            for word in positive
        ]
        negative = [
            (word,
             -1.0) if isinstance(word, string_types + (ndarray, )) else word
            for word in negative
        ]

        # compute the weighted average of all words
        all_words, mean = set(), []
        for word, weight in positive + negative:
            if isinstance(word, ndarray):
                mean.append(weight * word)
            else:
                mean.append(weight * self.word_vec(word, use_norm=True))
                if word in self.vocab:
                    all_words.add(self.vocab[word].index)
        if not mean:
            raise ValueError("cannot compute similarity with no input")
        mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL)

        if indexer is not None:
            return indexer.most_similar(mean, topn)

        limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab]
        dists = dot(limited, mean)
        if not topn:
            return dists
        best = matutils.argsort(dists,
                                topn=topn + len(all_words),
                                reverse=True)
        # ignore (don't return) words from the input
        result = [(self.index2word[sim], float(dists[sim])) for sim in best
                  if sim not in all_words]
        return result[:topn]
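A hedged usage sketch for most_similar above on a toy Word2Vec model; the corpus is far too small for meaningful analogies and only demonstrates the call shape (parameter names follow gensim 4.x):

from gensim.models import Word2Vec

sentences = [["king", "queen", "man", "woman"],
             ["paris", "france", "london", "england"]] * 50
model = Word2Vec(sentences, vector_size=20, min_count=1, epochs=10)

print(model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=3))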
Example #56
0
def best_items(topic, dictionary, f, n):
    data = [(i, score) for i, score in enumerate(topic)
            if score > 0 and f(dictionary, i)]
    indices, scores = zip(*data)
    return [indices[i] for i in matutils.argsort(scores, n, reverse=True)]
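An illustrative call of best_items above: `topic` is a dense array of term scores and `f(dictionary, i)` is an arbitrary predicate over term ids (here, a hypothetical filter keeping short tokens); the dictionary is a toy one.

from gensim.corpora import Dictionary

dictionary = Dictionary([["cat", "dog", "elephant", "ant"]])
topic = [0.1, 0.0, 0.5, 0.3]  # one score per term id

def keep_short(d, i):
    return len(d[i]) <= 3     # hypothetical predicate over term ids

print(best_items(topic, dictionary, keep_short, n=2))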
Example #57
0
    def new_accuracy(self,
                     questions,
                     restrict_vocab=30000,
                     most_similar=most_similar,
                     case_insensitive=True):
        """
        Compute accuracy of the model. `questions` is a filename where lines are
        4-tuples of words, split into sections by ": SECTION NAME" lines.
        See questions-words.txt in
        https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip
        for an example.

        The accuracy is reported (=printed to log and returned as a list) for each
        section separately, plus there's one aggregate summary at the end.

        Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab`
        words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency.
        In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then
        case normalization is performed.

        Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before
        evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens
        and question words. In case of multiple case variants of a single word, the vector for the first
        occurrence (also the most frequent if vocabulary is sorted) is taken.

        This method corresponds to the `compute-accuracy` script of the original C word2vec.

        """
        print("INFO: Using new accuracy")
        ok_vocab = [(w, self.vocab[w])
                    for w in self.index2word[:restrict_vocab]]
        ok_vocab = {w.upper(): v
                    for w, v in reversed(ok_vocab)
                    } if case_insensitive else dict(ok_vocab)

        oov_counter, idx_cnt, is_vn_counter = 0, 0, 0
        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line).strip()  # strip the trailing newline so the last analogy term matches the vocab

            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self.log_accuracy(section)
                section = {
                    'section': line.lstrip(': ').strip(),
                    'correct': [],
                    'incorrect': []
                }
            else:
                # Count the number of analogies to check
                idx_cnt += 1
                if not section:
                    raise ValueError(
                        "missing section header before line #%i in %s" %
                        (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [
                            word.upper() for word in line.split(" | ")
                        ]
                    else:
                        a, b, c, expected = [
                            word for word in line.split(" | ")
                        ]
                except ValueError:
                    logger.info("SVX: ERROR skipping invalid line #%i in %s",
                                line_no, questions)
                    continue

                # In case of Vietnamese, word analogy can be a phrase
                if " " in a or " " in b or " " in c or " " in expected:
                    is_vn_counter += 1
                else:
                    if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                        logger.debug(
                            "SVX: skipping line #%i with OOV words: %s",
                            line_no, line.strip())
                        oov_counter += 1
                        continue

                original_vocab = self.vocab
                self.vocab = ok_vocab
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                # find the most likely prediction, ignoring OOV words and input words
                sims = most_similar(self,
                                    positive=[b, c],
                                    negative=[a],
                                    topn=False,
                                    restrict_vocab=restrict_vocab)
                self.vocab = original_vocab
                for index in matutils.argsort(sims, reverse=True):
                    predicted = (self.index2word[index].upper()
                                 if case_insensitive else self.index2word[index])
                    if predicted in ok_vocab and predicted not in ignore:
                        if predicted != expected:
                            logger.debug("%s: expected %s, predicted %s",
                                         line.strip(), expected, predicted)
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))

        if section:
            # store the last section, too
            sections.append(section)
            self.log_accuracy(section)

        total = {
            'OOV/Total/VNCompound_Words':
            [oov_counter, (idx_cnt), is_vn_counter],
            'section': 'total',
            'correct': sum((s['correct'] for s in sections), []),
            'incorrect': sum((s['incorrect'] for s in sections), []),
        }
        self.log_accuracy(total)
        sections.append(total)
        return sections
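A hedged usage sketch for new_accuracy above. Note that, unlike the space-separated questions-words.txt format, the parsing above splits each analogy line on " | ", so four pipe-separated items per line are expected; `model` stands for any trained model object exposing this method (both names below are illustrative):

questions_path = "questions-vn.txt"  # hypothetical questions file
with open(questions_path, "w", encoding="utf-8") as fout:
    fout.write(": capital-common-countries\n")
    fout.write("hanoi | vietnam | paris | france\n")

sections = model.new_accuracy(questions_path, restrict_vocab=30000, case_insensitive=True)
for section in sections:
    print(section['section'], len(section['correct']), len(section['incorrect']))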
Example #58
0
 def show_topic(self, topicid, topn=10):
     topic = self.wordtopics[topicid]
     topic = topic / topic.sum()  # normalize to probability dist
     bestn = matutils.argsort(topic, topn, reverse=True)
     beststr = [(topic[id], self.id2word[id]) for id in bestn]
     return beststr
Example #59
0
    def top_topics(self,
                   corpus,
                   texts=None,
                   dictionary=None,
                   window_size=None,
                   coherence='u_mass',
                   topn=20,
                   processes=-1):
        """Get the topics sorted by coherence.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use sliding window based (i.e. coherence=`c_something`)
            probability estimator.
        dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Dictionary mapping of id word to create corpus.
            If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
        window_size : int, optional
            Is the size of the window to be used for coherence measures using boolean sliding window as their
            probability estimator. For 'u_mass' this doesn't matter.
            If None - the default window sizes are used which are: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
            Coherence measure to be used.
            The fastest method is 'u_mass'; 'c_uci' is also known as `c_pmi`.
            For 'u_mass', `corpus` should be provided; if `texts` is provided, it will be converted to a corpus
            using the dictionary. For 'c_v', 'c_uci' and 'c_npmi', `texts` should be provided (`corpus` isn't needed).
        topn : int, optional
            Integer corresponding to the number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Returns
        -------
        list of (list of (float, str), float)
            Each element in the list is a pair of a topic representation and its coherence score. Topic representations
            are distributions of words, represented as a list of (word probability, word) pairs.

        """
        cm = CoherenceModel(model=self,
                            corpus=corpus,
                            texts=texts,
                            dictionary=dictionary,
                            window_size=window_size,
                            coherence=coherence,
                            topn=topn,
                            processes=processes)
        coherence_scores = cm.get_coherence_per_topic()

        str_topics = []
        for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
            bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # (membership, token)
            str_topics.append(beststr)  # list of topn (float membership, token) tuples

        scored_topics = zip(str_topics, coherence_scores)
        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
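A hedged usage sketch of top_topics above, ranking the topics of a small LdaModel by 'u_mass' coherence (toy corpus and illustrative parameters only):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=2)
for topic, coherence in lda.top_topics(corpus=corpus, coherence='u_mass', topn=3):
    print(round(coherence, 3), [word for _, word in topic])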
Example #60
0
    def get_topic_extraction_detail(self, message, id):
        #         self.load_lda_topic_model()

        tf_vectorizer = CountVectorizer(max_df=1,
                                        min_df=1,
                                        vocabulary=self.glda_tf_feature_names)

        docs = []

        message = re.sub('\n', ' ', message)
        docs = self.message_corpus(message)

        print('Building BiGrams from the message...')
        bigram = Phrases(docs, min_count=2, threshold=2, delimiter=b' ')
        logger.propagate = False

        bigram_phraser = Phraser(bigram)

        texts = [bigram_phraser[line] for line in docs]

        bg_message = ' '.join(texts[0])

        tf = tf_vectorizer.fit_transform([bg_message])

        print('Extracting topics...')
        doc_topic = self.glda.transform(tf)

        document_topics = [(topicid, topicvalue)
                           for topicid, topicvalue in enumerate(doc_topic[0])
                           if topicvalue >= 0.01]

        document_topics = sorted(document_topics,
                                 key=lambda score: score[1],
                                 reverse=True)

        doc_distribution = np.array([tup[0] for tup in document_topics])

        #         print(doc_distribution)

        print('Extracting term per topic...')
        count_vec = np.asarray(tf.sum(axis=0)).ravel()
        zipped = list(zip(self.glda_tf_feature_names, count_vec))
        x, y = (list(x) for x in zip(
            *sorted(zipped, key=lambda x: x[1], reverse=True)))
        Y = np.concatenate([y[0:tf.indices.shape[0]], y[-1:-1]])
        X = np.concatenate([x[0:tf.indices.shape[0]], x[-1:-1]])
        #         for i in range(len(X)):
        #             print("Top to Bottom Frequent Words : {} , count: {}".format(X.tolist()[i],Y.tolist()[i]))

        self.config_dict = dict(self.config.items('TOPIC_LABEL'))

        list_topic_names = eval(self.config_dict['list_topic_names'])

        doc_term_topic = {"topics": []}
        # print topics with words and score rank
        for i in doc_distribution[-40:][::-1]:
            #             topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            #             print('Topic {}: {}'.format(i, ' '.join(topic_words)))
            print('Extracting term for topic: ' + repr(i))
            topic_ = self.glda.topic_word_[i]
            topic_ = topic_ / topic_.sum()  # normalize to probability distribution
            bestn = matutils.argsort(topic_, topic_.shape[0], reverse=True)
            topic_ = sorted([(self.glda_tf_feature_names[id], topic_[id])
                             for id in bestn],
                            reverse=True)

            topic_terms = {"terms": []}

            ss = dict((k, 1.0) for k in X)
            test = dict(topic_)
            d = {x: test[x] for x in test if x in ss}

            for term in X:
                topic_id = ''.join(
                    ['{"term":"%s" ,"score": %.6f}' % (term, d[term] * 100)])
                topic_terms["terms"].append(json.loads(topic_id))


            topic_terms["terms"] = sorted(topic_terms["terms"],
                                          key=lambda k: k['score'],
                                          reverse=True)
            topic = {
                "score": [score for k, score in document_topics if i == k][0],
                "terms": topic_terms["terms"],
                "topic": list_topic_names[i]
            }
            doc_term_topic["topics"].append(topic)
#             doc_term_topic_score.append(topic_id)
#             topic_ = ' + '.join(['%.4f*"%s"' % (v, k) for k, v in topic_])
#             print(topic)

        print('Sorting topics by score...')
        doc_term_topic["topics"] = sorted(doc_term_topic["topics"],
                                          key=lambda k: k['score'],
                                          reverse=True)

        terms_not_in_topics = json.dumps(list(set(texts[0]).difference(X)))
        doc_term_topic["terms_not_in_topics"] = terms_not_in_topics
        #         print(document_topics)

        print('Done!')
        return doc_term_topic
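A small standalone sketch of the per-topic term ranking step used above: normalise one row of a topic-word matrix into a probability distribution, then order every term by weight with matutils.argsort (toy matrix and vocabulary):

import numpy as np
from gensim import matutils

topic_word = np.array([[3.0, 1.0, 6.0],
                       [2.0, 2.0, 1.0]])
feature_names = ["apple", "banana", "cherry"]

topic_ = topic_word[0]
topic_ = topic_ / topic_.sum()  # normalize to a probability distribution
bestn = matutils.argsort(topic_, topic_.shape[0], reverse=True)
print([(feature_names[i], float(topic_[i])) for i in bestn])
# [('cherry', 0.6), ('apple', 0.3), ('banana', 0.1)]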