Example #1
from gensim.models import CoherenceModel


def get_coherence_score_for_each_topic(topics, documents, dictionary, coherence="c_npmi", no_of_words=20):
    """Calculates topic coherence using gensim's coherence pipeline.

    Parameters:

    topics (list of str list): topic words for each topic
    
    documents (list of str): set of documents

    dictionary (gensim.corpora.Dictionary): gensim dicionary of words from dataset

    coherence (str): coherence type. Can be 'c_v', 'u_mass', 'c_uci' or 'c_npmi'

    Returns:

    float: coherence score
    """
    coherence_model = CoherenceModel(
                topics=topics, 
                texts=documents, 
                dictionary=dictionary, 
                coherence=coherence,
                processes=0,
                topn=no_of_words
    )

    return coherence_model.get_coherence_per_topic()
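
A minimal usage sketch for the helper above; the toy documents, dictionary and topics below are hypothetical:

from gensim.corpora import Dictionary

docs = [["human", "machine", "interface"], ["graph", "trees", "minors", "graph"]]
toy_dict = Dictionary(docs)
toy_topics = [["human", "machine"], ["graph", "trees"]]
print(get_coherence_score_for_each_topic(toy_topics, docs, toy_dict, no_of_words=2))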
Example #2
    def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
                   coherence='u_mass', topn=20, processes=-1):
        """Get the topics sorted by coherence.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            Used for estimating topic coherence; required for the 'u_mass' measure.
        texts : list of list of str, optional
            Tokenized texts, needed for coherence models that use a sliding-window
            probability estimator (i.e. the `c_*` measures).
        dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Dictionary mapping word IDs to words, used to create the corpus.
            If `model.id2word` is present, this is not needed. If both are provided, passed `dictionary` will be used.
        window_size : int, optional
            Size of the window to be used for coherence measures that use a boolean sliding
            window as their probability estimator. Ignored for 'u_mass'.
            If None, the default window sizes are used: 'c_v' - 110, 'c_uci' - 10, 'c_npmi' - 10.
        coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
            Coherence measure to be used.
            'u_mass' is the fastest method; 'c_uci' is also known as 'c_pmi'.
            For 'u_mass', `corpus` should be provided; if `texts` is provided instead, it will be
            converted to a corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi',
            `texts` should be provided (`corpus` isn't needed).
        topn : int, optional
            Number of top words to be extracted from each topic.
        processes : int, optional
            Number of processes to use for probability estimation phase, any value less than 1 will be interpreted as
            num_cpus - 1.

        Returns
        -------
        list of (list of (float, str), float)
            Each element in the list is a pair of a topic representation and its coherence score.
            Topic representations are distributions of words, represented as lists of
            (word probability, word) pairs for the `topn` most probable words.

        """
        cm = CoherenceModel(
            model=self, corpus=corpus, texts=texts, dictionary=dictionary,
            window_size=window_size, coherence=coherence, topn=topn,
            processes=processes
        )
        coherence_scores = cm.get_coherence_per_topic()

        str_topics = []
        for topic in self.get_topics():  # topic = array of vocab_size floats, one per term
            bestn = matutils.argsort(topic, topn=topn, reverse=True)  # top terms for topic
            beststr = [(topic[_id], self.id2word[_id]) for _id in bestn]  # membership, token
            str_topics.append(beststr)  # list of topn (float membership, token) tuples

        scored_topics = zip(str_topics, coherence_scores)
        return sorted(scored_topics, key=lambda tup: tup[1], reverse=True)
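
A brief, hypothetical call for the method above, assuming `lda` is a trained LdaModel and `corpus` its bag-of-words training corpus ('u_mass' needs only the corpus; the `c_*` measures would need tokenized texts instead):

ranked = lda.top_topics(corpus=corpus, coherence='u_mass', topn=10)
top_words, score = ranked[0]  # best topic: (probability, word) pairs plus its coherence
print(score, [word for _, word in top_words])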
Example #4
def compute_coherence_values(dictionary,
                             corpus,
                             texts,
                             limit,
                             start=2,
                             step=3,
                             typ='original'):
    """
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics
    """
    coherence_values = []
    model_list = []
    topics_lis = []
    for num_topics in tqdm(range(start, limit, step)):

        if typ == 'mallet':
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            # Note: gensim.models.wrappers was removed in gensim 4.0, so this branch needs gensim < 4.0.
            mallet_path = 'mallet-2.0.8/bin/mallet'
            model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                     corpus=corpus,
                                                     num_topics=num_topics,
                                                     iterations=500,
                                                     id2word=dictionary)
        elif typ == 'original':
            model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                    id2word=dictionary,
                                                    num_topics=num_topics,
                                                    update_every=1,
                                                    chunksize=100,
                                                    passes=10,
                                                    alpha='auto',
                                                    iterations=500,
                                                    per_word_topics=True)
        else:
            raise ValueError("typ must be 'mallet' or 'original'")
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        topics_lis.append(coherencemodel.get_coherence_per_topic())

    return model_list, coherence_values, topics_lis
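
A minimal usage sketch for the function above; the toy corpus is hypothetical, and gensim, tqdm and CoherenceModel are assumed to be imported in the module:

from gensim.corpora import Dictionary

texts = [["human", "machine", "interface"], ["graph", "trees", "minors", "graph"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train models for 2, 5 and 8 topics; keep the count with the best mean c_v.
model_list, coherence_values, topics_lis = compute_coherence_values(
    dictionary, corpus, texts, limit=10, start=2, step=3)
best = coherence_values.index(max(coherence_values))
print("best num_topics:", 2 + best * 3, "mean c_v:", coherence_values[best])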
Example #5
def get_coherence_per_topic(topiclist, token_version, coherence_model):
    idx = 0
    slice_len = 2000  # score topics in slices to bound memory use
    coherences = []
    wiki_raw_filename = 'enwiki-20180920-pages-articles1.xml-p10p30302.bz2'
    dictionary = get_dict(wiki_raw_filename, token_version)
    wiki_corpus = get_corpus(wiki_raw_filename, token_version, dictionary)
    nbprint('Computing Coherence')
    while idx < len(topiclist):
        nbprint('Slice {}-{} of {}'.format(idx, idx + slice_len - 1,
                                           len(topiclist)))
        topiclist_slice = topiclist[idx:idx + slice_len]
        topiclist_reduced = filter_tokens(topiclist_slice, dictionary)
        cm = CoherenceModel(topics=topiclist_reduced,
                            texts=wiki_corpus.get_texts(),
                            dictionary=dictionary,
                            coherence=coherence_model)
        coherences += cm.get_coherence_per_topic()
        idx += slice_len
    return coherences
Example #6
class TopicCoherence:
    def __init__(self, config):
        if isinstance(config, LDAConfig):
            self.model = LDAModel(config)
        elif isinstance(config, LSIConfig):
            self.model = LSIModel(config)
        else:
            raise TypeError("config must be an LDAConfig or an LSIConfig")

        # config_file = "../configs/alexa_lsi_config.json"
        # config = LSIConfig.from_json_file(config_file)
        # model = LSIModel(config=config, build=False)

        self.dictionary = self.model.get_dictionary()
        temp = self.dictionary[0]  # This is only to "load" the dictionary.
        self.cm = CoherenceModel(model=self.model.get_model(),
                                 texts=self.model.get_docs(),
                                 dictionary=self.dictionary,
                                 coherence="c_w2v")

    def get_coherence(self, doc):
        tokens = self.model.tokenizer.tokenize(doc)
        # todo try truncated version of doc_ids, those that are in truncated dictionary
        doc_ids = []
        for token in tokens:
            try:
                tid = self.dictionary.token2id[token]
            except KeyError:
                logging.debug("Unknown token: " + str(token))
                continue
            doc_ids.append(tid)

        doc_ids = [np.array(doc_ids)]
        #doc_ids = [np.array([self.dictionary.token2id[token] for token in tokens])]

        segmented_doc = segmentation.s_one_set(doc_ids)

        doc_coherence = self.cm.get_coherence_per_topic(segmented_doc)[0]
        return doc_coherence
Example #7
def calculate_coherence(args, save=True):
    topic_dir = Path(args.input_dir)
    parent_dir = topic_dir.parent
    config = load_yaml(parent_dir / "config.yml")
    try:
        data_dir = config["input_dir"]  # dvae, mallet
    except KeyError:
        data_dir = str(Path(config["data_path"]).parent)  # etm

    #### quick HACK to handle scratch directories, needs cleanup ###
    processed_name = Path(data_dir).name
    data_dir_map = {
        f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}":
        f"/scratch/{processed_name}/nytimes",
        f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}":
        f"/scratch/{processed_name}/wikitext",
        f"/workspace/topic-preprocessing/data/bbc/processed/{processed_name}":
        f"/scratch/{processed_name}/bbc",
    }

    # for out-of-sample coherence
    ref_corpus = args.reference_corpus
    if ref_corpus == "wikitext_full" or ref_corpus == "nytimes_full":
        try:
            data_dict = Dictionary.load(str(Path(data_dir, "train-dict.npy")))
        except FileNotFoundError:
            data_dict = make_dictionary(data_dir)

        if ref_corpus == "wikitext_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}"
        if ref_corpus == "nytimes_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}"
        ref_corpus_fname = "full.txt"
    # standard coherence
    else:
        ref_corpus_fname = f"{ref_corpus}.txt"  # can later update to external if needed
        mapped_dir = Path(data_dir_map[data_dir])

        if Path(mapped_dir, "train-dict.npy").exists() and Path(
                mapped_dir, ref_corpus_fname).exists():
            print("reading files from scratch", flush=True)
            data_dict = Dictionary.load(str(Path(mapped_dir,
                                                 "train-dict.npy")))
        else:
            print("loading files", flush=True)
            try:
                data_dict = Dictionary.load(
                    str(Path(data_dir, "train-dict.npy")))
            except FileNotFoundError:
                data_dict = make_dictionary(data_dir)

            # copy to scratch directory
            print("copying files to scratch", flush=True)
            mapped_dir.mkdir(exist_ok=True, parents=True)
            shutil.copy(Path(data_dir, ref_corpus_fname),
                        Path(mapped_dir, ref_corpus_fname))
            shutil.copy(Path(data_dir, "train-dict.npy"),
                        Path(mapped_dir, "train-dict.npy"))

    ### end hack ###

    topic_sets = collect_topics(
        topic_dir=topic_dir,
        start_at=args.start_at,
        eval_every_n=args.eval_every_n,
        eval_last_only=args.eval_last_only,
    )

    measure_name = gen_measure_name(args.coherence_measure, args.window_size,
                                    args.reference_corpus, args.top_n)
    coherence_results = {measure_name: {}}

    print("calculating coherence...", flush=True)
    # load the reference corpus once, rather than on every iteration
    reference_text = load_tokens(Path(mapped_dir, ref_corpus_fname))
    for idx, path, topics in topic_sets:
        topics = [t[:args.top_n] for t in topics]

        cm = CoherenceModel(
            topics=topics,
            texts=reference_text,
            dictionary=data_dict,
            coherence=args.coherence_measure,
            window_size=args.window_size,
        )
        confirmed_measures = cm.get_coherence_per_topic()
        mean = cm.aggregate_measures(confirmed_measures)
        coherence_results[measure_name][idx] = {
            "aggregate": float(mean),
            "by_topic": [float(i) for i in confirmed_measures
                         ],  # needs to be python float to json-serialize
            "path": str(path),
        }
    if not save:
        return coherence_results

    output_path = parent_dir / "coherences.json"
    if output_path.exists():  # TODO: currently broken, will overwrite different epochs
        prev_coherence = load_json(output_path)
        prev_coherence.update(**coherence_results)
        coherence_results = prev_coherence

    save_json(coherence_results, output_path)
    print("done!")
    return coherence_results
Example #8
    def perform_topic_modeling(self, input_local_root, files, titles, converted_local_root,
                               output_dir, pyLDAvis_output_file, th_output_dir, th_pyLDAvis_output_file,
                               max_no_topic = 10, is_short_words_removed = True):

        print("========== PART 1 : Input Files ==========")
        data = Util.filter_file_to_read(input_local_root, files, converted_local_root)
        num_doc = len(titles)

        print("========== PART 2 : Data Preparation and Creating Word Tokenization ==========")
        # Set data into dataframe type
        data_df = self.to_dataframe(data, titles)
        data_df.head()

        inp_list = []
        for num in range(num_doc):
            content = data_df['content'][num]
            inp_list.append(TextPreProcessing.split_word(content))

        # Count the total number of tokens across all documents
        total_tokens = sum(len(tokens) for tokens in inp_list)
        print("Total tokens in this processing corpus: {0}".format(total_tokens))

        # Create dictionary, corpus and corpus TFIDF
        # Turn tokenized documents into a id <-> term dictionary
        dictionary = corpora.Dictionary(inp_list)
        dict2 = {dictionary[ID]:ID for ID in dictionary.keys()}

        # Convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in inp_list]
        tfidf = models.TfidfModel(corpus, smartirs='ntc')
        corpus_tfidf = tfidf[corpus]

        if is_short_words_removed:
            # Remove words with fewer characters than the cutoff (self.num_cut)
            new_lists = TextPreProcessing.cut_character(inp_list, self.num_cut)
        else:
            new_lists = inp_list

        # Keep only nouns and proper nouns, using the pos_tag function
        for num in range(num_doc):
            new_lists[num] = TextPreProcessing.postag(new_lists[num])

        # Create new dict and corpus
        dictionary2 = corpora.Dictionary(new_lists)
        dict_2 = {dictionary2[ID]:ID for ID in dictionary2.keys()}
        corpus2 = [dictionary2.doc2bow(text) for text in new_lists]

        # Add header/title term frequencies to the corpus
        corpus2 = TextPreProcessing.add_frequency(dict_2, corpus2, data_df, 10, num_doc)

        print("========== PART 3 : Generate LDA Model ==========")
        # Generate LDA Model

        # The default max number of topics is 10. If there are fewer documents than the
        # maximum number of topics, the number of documents is used instead, with a floor of 2.
        max_no_topic = max(2, min(max_no_topic, num_doc))

        ldamodel = self.LDAmodel(dictionary2, corpus2, max_no_topic)
        term_dist_topic = ldamodel.show_topics(max_no_topic, 1000, log=True, formatted=False)
        # print(term_dist_topic)
        with open('term_dist_topic', 'a+') as handle1:
            handle1.write(str(term_dist_topic))
            handle1.write("\n")

        print("========== PART 4 : Topic-term distribution ==========")
        ### Topic-Term Dist
        topic_term_dist = []
        # print(dictionary2['ทุจริต'])
        # print(dictionary2)
        topic_term_dist = TextDistribution.topicTerm_dist(dict_2,corpus2,topic_term_dist, term_dist_topic)
        print(topic_term_dist)
        with open('topic_term_dist', 'a+') as handle1:
            handle1.write(str(topic_term_dist))
            handle1.write("\n")
        print("========== PART 4-1 : Document-topic (all) distribution ==========")
        ### Doc_topic_all_dist
        doc_topic_dist = []
        doc_topic_dist = TextDistribution.docTopic_dist(doc_topic_dist, data_df, num_doc, inp_list,dictionary2,ldamodel)
        print(doc_topic_dist)

        print("========== PART 4-2 : Document-topic (min) distribution ==========")
        ### Doc_topic_min_dist
        n_doc_intopic = []
        n_doc_intopic = TextDistribution.Ndoc_topic(n_doc_intopic,num_doc, data_df, inp_list, dictionary2, ldamodel)
        print(n_doc_intopic)

        print("========== PART 5 : Evaluate Model ==========")
        # Evaluate
        lda_coherence = CoherenceModel(ldamodel, corpus=corpus2, dictionary=dictionary2, coherence='u_mass')
        print(lda_coherence.get_coherence_per_topic())
        print("LDA umass score = %.4f" % (lda_coherence.get_coherence()))

        lda_coherence = CoherenceModel(ldamodel, texts=new_lists, dictionary=dictionary2, coherence='c_uci')
        print("LDA uci score = %.4f" % (lda_coherence.get_coherence()))

        print("========== PART 6 : Export pyLDAvis HTML ==========")
        # pyLDAvis.enable_notebook()
        # Note: in pyLDAvis >= 3.0 this module is pyLDAvis.gensim_models instead of pyLDAvis.gensim.
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus2, dictionary=ldamodel.id2word)
        pyLDAvis.save_html(vis, output_dir + pyLDAvis_output_file)

        print("========== PART 7 : Convert pyLDAvis HTML to Thai==========")
        self.localize_pyLDAvis_to_thai(output_dir, pyLDAvis_output_file, th_output_dir, th_pyLDAvis_output_file)
Example #9
print(coherence_model_1train_get)

# In[17]:

# calculate coherence metric for test_set
coherence_model_1test = CoherenceModel(model=model_1,
                                       texts=processed_docs_1000test,
                                       dictionary=id_to_word_1000test,
                                       coherence='c_v')
coherence_model_1test_get = coherence_model_1test.get_coherence()
print(coherence_model_1test_get)

# In[18]:

# calculate coherence metric for each of n topics in test set
coherence_model_1_per_topic = coherence_model_1test.get_coherence_per_topic()

# uncomment to print coherence_model_1_per_topic
# print(coherence_model_1_per_topic)

# #### Model #1 - Evaluate - Perplexity
# Calculate the perplexity metric. `log_perplexity` calculates and returns the per-word
# likelihood bound, using a chunk of documents as the evaluation corpus. It also outputs the
# calculated statistics, including perplexity = 2^(-bound), to the log at INFO level.

# In[19]:

# calculate perplexity metric for model_1 train set (1000 pats dataset)
perplexity_model_1train = model_1.log_perplexity(corpus_1000train)
print(perplexity_model_1train)
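
# log_perplexity returns the per-word variational bound, not the perplexity itself.
# A minimal sketch of the conversion, using the perplexity = 2^(-bound) relationship noted above:
import numpy as np

print(np.exp2(-perplexity_model_1train))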

# In[20]:
Example #10
# In[92]:


# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to pass to texts param
# calculate coherence metric
coherence = CoherenceModel(model=model_lda, texts=filtered_data, dictionary=id_to_word, coherence='c_v')
coherence_1 = coherence.get_coherence()
coherence_1


# In[94]:


# calculate coherence metric for each of the n topics
coherence_1 = coherence.get_coherence_per_topic()
coherence_1


# In[97]:


# explore topics
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus_train, id_to_word)
viz_topics_1
# TODO (Lee) - salient vs relevant terms in pyLDA ?


# ### Model 2-  Mallet model
Example #11
def compute_coherence_score(lda_model, reviews):
    # `dictionary` is assumed to be a module-level gensim Dictionary matching `reviews`.
    coherence = CoherenceModel(model=lda_model, texts=reviews, dictionary=dictionary, coherence="c_v")
    return coherence.get_coherence(), coherence.get_coherence_per_topic()
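
A hypothetical call, assuming `lda_model` was trained on the tokenized `reviews` that the module-level `dictionary` was built from:

mean_cv, per_topic_cv = compute_coherence_score(lda_model, reviews)
print("mean c_v:", mean_cv)
print("per-topic c_v:", per_topic_cv)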
Example #12
    def evaluate(self, topic_candidates=None, nbtopterms=None):
        """
        evaluate topic coherence. This method is for convenience and actually redundant.
        The coherence scores should optimally be calculated in evaluate_topics.py which provides more
        features and metrics.
        """

        self.logg('evaluating topic candidates')

        # reference scores per topic for top topic terms
        if nbtopterms is None:
            nbtopterms = self.nb_top_terms

        if topic_candidates is None:
            topic_candidates = self.topic_candidates

        topic_candidates = topic_candidates.loc[:, 'term0':f'term{nbtopterms - 1}']
        topics_list = topic_candidates.values.tolist()

        self.logg('> u_mass')
        t0 = time()
        cm_umass = CoherenceModel(topics=topics_list,
                                  corpus=self.corpus,
                                  dictionary=self.dict_from_corpus,
                                  coherence='u_mass',
                                  topn=nbtopterms,
                                  processes=self.processes)
        umass_scores = cm_umass.get_coherence_per_topic(with_std=False,
                                                        with_support=False)
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        self.logg('> c_v')
        t0 = time()
        cm_cv = CoherenceModel(topics=topics_list,
                               texts=self.texts,
                               dictionary=self.dict_from_corpus,
                               coherence='c_v',
                               topn=nbtopterms,
                               processes=self.processes)
        cv_scores = cm_cv.get_coherence_per_topic()
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        # changed segmentation for c_uci and c_npmi from s_one_set to s_one_one (default)
        self.logg('> c_uci')
        t0 = time()
        cm_cuci = CoherenceModel(topics=topics_list,
                                 texts=self.texts,
                                 dictionary=self.dict_from_corpus,
                                 coherence='c_uci',
                                 topn=nbtopterms,
                                 processes=self.processes)
        cuci_scores = cm_cuci.get_coherence_per_topic()
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        self.logg('> c_npmi')
        t0 = time()
        cm_cuci.coherence = 'c_npmi'  # reusing precalculated probability estimates
        cnpmi_scores1 = cm_cuci.get_coherence_per_topic()
        t1 = int(time() - t0)
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))

        scores = {
            'u_mass_eval': umass_scores,
            'c_v_eval': cv_scores,
            'c_uci_eval': cuci_scores,
            'c_npmi_eval': cnpmi_scores1,
        }
        scores = pd.DataFrame(scores)
        scores.index = topic_candidates.index.copy()
        self.eval_scores = scores
        return scores
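
The c_npmi step above reuses the sliding-window probability estimates already accumulated for c_uci by reassigning the model's coherence measure rather than building a new CoherenceModel. A minimal sketch of the same pattern, assuming `topics_list`, `texts` and `dictionary` as in the method above:

cm = CoherenceModel(topics=topics_list, texts=texts,
                    dictionary=dictionary, coherence='c_uci')
cuci_scores = cm.get_coherence_per_topic()
cm.coherence = 'c_npmi'  # both measures share the sliding-window estimates
cnpmi_scores = cm.get_coherence_per_topic()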
Example #13
    def _rerank_coherence_per_metric(self, metric, coherence_model=None):
        """
        Object method to trigger the reranking for a given metric.
        It uses the fast heuristic for the reranking in O(n) with n being the number
        of candidate terms. A coherence metric is applied on each set of topic terms,
        when we leave exactly one term out. The resulting coherence score indicates, if
        a term strengthens or weakens the coherence of a topic. We remove those terms
        from the set whose absence resulted in higher scores.

        :param metric:
        :param coherence_model:
        :return:
        """
        if self.shifted_topics is None:
            self.shifted_topics = self._shift_topics()

        t0 = time()
        self.logg(
            f'Calculating topic candidates using {metric} coherence measure '
            f'on {self.nb_candidate_terms} candidate terms '
            f'for {self.nb_topics} topics')

        # calculate the scores for all shifted topics
        kwargs = dict(topics=self.shifted_topics,
                      dictionary=self.dict_from_corpus,
                      coherence=metric,
                      topn=self.nb_candidate_terms - 1,
                      processes=self.processes)
        if metric == 'u_mass':
            kwargs['corpus'] = self.corpus
        else:
            kwargs['texts'] = self.texts

        if coherence_model is None:
            cm = CoherenceModel(**kwargs)
        else:
            cm = coherence_model
            cm.coherence = metric

        scores1d = cm.get_coherence_per_topic()
        scores2d = np.reshape(scores1d, (self.nb_candidate_terms, -1)).T
        # the highest values indicate the terms whose absence improves the topic coherence most
        sorted_scores = np.argsort(scores2d, axis=1)
        # thus we will keep the first nbtopterms (default 10) indices
        top_scores = sorted_scores[:, :self.nb_top_terms]
        # and sort them back for convenience
        top_scores = np.sort(top_scores, axis=1)
        # replacing indices with token-ids
        tpx_ids = [
            self.topic_ids.values[i, top_scores[i]]
            for i in range(self.nb_topics)
        ]
        tpx_ids = (pd.DataFrame.from_records(
            tpx_ids,
            columns=self.topic_terms.columns[:self.nb_top_terms],
            index=self.topic_ids.index).assign(metric=metric).set_index(
                'metric', append=True))

        t1 = int(time() - t0)
        self._statistics_[metric] = dict()
        self._statistics_[metric]['runtime'] = t1
        self.logg("    done in {:02d}:{:02d}:{:02d}".format(
            t1 // 3600, (t1 // 60) % 60, t1 % 60))
        return tpx_ids
Example #14
def eval_coherence(topics,
                   dictionary,
                   corpus=None,
                   texts=None,
                   keyed_vectors=None,
                   metrics=None,
                   window_size=None,
                   suffix='',
                   cores=1,
                   logg=print,
                   topn=10):
    if not (corpus or texts or keyed_vectors):
        logg('provide corpus, texts and/or keyed_vectors')
        return
    if metrics is None:
        metrics = []
        if corpus is not None:
            metrics += ['u_mass']
        if texts is not None:
            metrics += ['c_v', 'c_npmi', 'c_uci']
        if keyed_vectors is not None:
            metrics += ['c_w2v']

    # add out-of-vocabulary terms to the dictionary and the documents
    in_dict = topics.applymap(lambda x: x in dictionary.token2id)
    oov = topics[~in_dict]
    oov = oov.apply(set)
    oov = set().union(*oov)
    oov = sorted([token] for token in oov if isinstance(token, str))
    logg(f'OOV: {oov}')
    if oov:
        dictionary.add_documents(oov, prune_at=None)
        _ = dictionary[0]

    scores = dict()
    topics_values = topics.values
    for metric in metrics:
        t0 = time()
        gc.collect()
        logg(metric)
        txt = texts + oov if texts else None
        cm = CoherenceModel(topics=topics_values,
                            dictionary=dictionary,
                            corpus=corpus,
                            texts=txt,
                            coherence=metric,
                            topn=topn,
                            window_size=window_size,
                            processes=cores,
                            keyed_vectors=keyed_vectors)
        coherence_scores = cm.get_coherence_per_topic(with_std=True,
                                                      with_support=True)
        scores[metric + suffix] = coherence_scores
        gc.collect()
        t1 = int(time() - t0)
        logg("    done in {:02d}:{:02d}:{:02d}".format(t1 // 3600,
                                                       (t1 // 60) % 60,
                                                       t1 % 60))

    df = pd.DataFrame(scores)
    df.index = topics.index
    gc.collect()
    return df
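
A minimal, hypothetical invocation on toy data, assuming `topics` is a pandas DataFrame with one row per topic and one column per ranked term (coherence scores on texts this small are only illustrative):

import pandas as pd
from gensim.corpora import Dictionary

texts = [["human", "machine", "interface"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
topics = pd.DataFrame([["human", "machine"], ["graph", "trees"]])
scores = eval_coherence(topics, dictionary, texts=texts, metrics=['c_npmi'], topn=2)
print(scores)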