Example #1
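Both examples below are class methods taken out of their module; they rely on gensim's CoherenceModel plus NumPy, pandas, and the standard-library time module. The module-level imports would look roughly like this (a sketch; the exact import layout of the original module is an assumption):

from time import time

import numpy as np
import pandas as pd
from gensim.models import CoherenceModel
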
    def evaluate(self, topic_candidates=None, nbtopterms=None):
        """
        evaluate topic coherence. This method is for convenience and actually redundant.
        The coherence scores should optimally be calculated in evaluate_topics.py which provides more
        features and metrics.
        """

        self.logg('evaluating topic candidates')

        # reference scores per topic for top topic terms
        if nbtopterms is None:
            nbtopterms = self.nb_top_terms

        if topic_candidates is None:
            topic_candidates = self.topic_candidates

        topic_candidates = topic_candidates.loc[:, 'term0':
                                                f'term{nbtopterms - 1}']
        topics_list = topic_candidates.values.tolist()

        self.logg('> u_mass')
        t0 = time()
        cm_umass = CoherenceModel(topics=topics_list,
                                  corpus=self.corpus,
                                  dictionary=self.dict_from_corpus,
                                  coherence='u_mass',
                                  topn=nbtopterms,
                                  processes=self.processes)
        umass_scores = cm_umass.get_coherence_per_topic(with_std=False,
                                                        with_support=False)
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        self.logg('> c_v')
        t0 = time()
        cm_cv = CoherenceModel(topics=topics_list,
                               texts=self.texts,
                               dictionary=self.dict_from_corpus,
                               coherence='c_v',
                               topn=nbtopterms,
                               processes=self.processes)
        cv_scores = cm_cv.get_coherence_per_topic()
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        # changed segmentation for c_uci and c_npmi from s_one_set to s_one_one (default)
        self.logg('> c_uci')
        t0 = time()
        cm_cuci = CoherenceModel(topics=topics_list,
                                 texts=self.texts,
                                 dictionary=self.dict_from_corpus,
                                 coherence='c_uci',
                                 topn=nbtopterms,
                                 processes=self.processes)
        cuci_scores = cm_cuci.get_coherence_per_topic()
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        self.logg('> c_npmi')
        t0 = time()
        cm_cuci.coherence = 'c_npmi'  # reusing precalculated probability estimates
        cnpmi_scores1 = cm_cuci.get_coherence_per_topic()
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        scores = {
            'u_mass_eval': umass_scores,
            'c_v_eval': cv_scores,
            'c_uci_eval': cuci_scores,
            'c_npmi_eval': cnpmi_scores1,
        }
        scores = pd.DataFrame(scores)
        scores.index = topic_candidates.index.copy()
        self.eval_scores = scores
        return scores
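
For reference, the CoherenceModel calls composed above can also be run standalone. A minimal sketch with made-up toy texts and topics (gensim is assumed to be installed; only the API already used in the method is exercised):

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['graph', 'minors', 'survey', 'trees']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
topics = [['human', 'computer', 'interface'],
          ['graph', 'trees', 'minors']]

# u_mass is estimated from the bag-of-words corpus, the other measures from the raw texts
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary,
                    coherence='u_mass', topn=3, processes=1)
print(cm.get_coherence_per_topic())   # one score per topic, as used above

cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary,
                    coherence='c_uci', topn=3, processes=1)
print(cm.get_coherence_per_topic())
# switching the measure on an existing model reuses its accumulated statistics,
# which is what the c_npmi step above exploits
cm.coherence = 'c_npmi'
print(cm.get_coherence_per_topic())
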
Example #2
    def _rerank_coherence_per_metric(self, metric, coherence_model=None):
        """
        Object method to trigger the reranking for a given metric.
        It uses the fast heuristic for the reranking in O(n) with n being the number
        of candidate terms. A coherence metric is applied on each set of topic terms,
        when we leave exactly one term out. The resulting coherence score indicates, if
        a term strengthens or weakens the coherence of a topic. We remove those terms
        from the set whose absence resulted in higher scores.

        :param metric:
        :param coherence_model:
        :return:
        """
        if self.shifted_topics is None:
            self.shifted_topics = self._shift_topics()

        t0 = time()
        self.logg(
            f'Calculating topic candidates using {metric} coherence measure '
            f'on {self.nb_candidate_terms} candidate terms '
            f'for {self.nb_topics} topics')

        # calculate the scores for all shifted topics
        kwargs = dict(topics=self.shifted_topics,
                      dictionary=self.dict_from_corpus,
                      coherence=metric,
                      topn=self.nb_candidate_terms - 1,
                      processes=self.processes)
        if metric == 'u_mass':
            kwargs['corpus'] = self.corpus
        else:
            kwargs['texts'] = self.texts

        if coherence_model is None:
            cm = CoherenceModel(**kwargs)
        else:
            cm = coherence_model
            cm.coherence = metric

        scores1d = cm.get_coherence_per_topic()
        scores2d = np.reshape(scores1d, (self.nb_candidate_terms, -1)).T
        # the highest values indicate the terms whose absence improves the topic coherence most
        sorted_scores = np.argsort(scores2d, axis=1)
        # thus we keep the first nb_top_terms (default 10) indices, i.e. the terms
        # the topic suffers most without
        top_scores = sorted_scores[:, :self.nb_top_terms]
        # and sort them back for convenience
        top_scores = np.sort(top_scores, axis=1)
        # replacing indices with token-ids
        tpx_ids = [
            self.topic_ids.values[i, top_scores[i]]
            for i in range(self.nb_topics)
        ]
        tpx_ids = (pd.DataFrame.from_records(
            tpx_ids,
            columns=self.topic_terms.columns[:self.nb_top_terms],
            index=self.topic_ids.index).assign(metric=metric).set_index(
                'metric', append=True))

        t1 = int(time() - t0)
        self._statistics_[metric] = dict()
        self._statistics_[metric]['runtime'] = t1
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))
        return tpx_ids
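
A minimal NumPy sketch of the selection step above, on made-up leave-one-out scores (2 topics, 5 candidate terms, keeping 3 terms per topic; the numbers are purely illustrative):

import numpy as np

# toy leave-one-out scores: entry [t, i] is the coherence of topic t with
# candidate term i removed, so a HIGH value marks a term the topic is better off without
scores2d = np.array([[0.2, 0.9, 0.1, 0.4, 0.3],
                     [0.5, 0.2, 0.8, 0.1, 0.6]])
nb_top_terms = 3

# ascending argsort puts the terms to keep (low leave-one-out score) at the front of each row
sorted_scores = np.argsort(scores2d, axis=1)
top_scores = np.sort(sorted_scores[:, :nb_top_terms], axis=1)
print(top_scores)
# [[0 2 4]     -> terms 1 and 3 are dropped from topic 0
#  [0 1 3]]    -> terms 2 and 4 are dropped from topic 1
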