def get_coherence_score_for_each_topic(topics, documents, dictionary, coherence="c_npmi", no_of_words=20):
    """Score every topic separately with gensim's coherence pipeline.

    Parameters:
        topics (list of str list): topic words for each topic
        documents: reference documents; handed to ``CoherenceModel`` as ``texts``
        dictionary (gensim.corpora.Dictionary): gensim dictionary of words from dataset
        coherence (str): coherence type. Can be 'c_v', 'u_mass', 'c_uci' or 'c_npmi'
        no_of_words (int): forwarded to ``CoherenceModel`` as ``topn``

    Returns:
        The result of ``CoherenceModel.get_coherence_per_topic()``, i.e. one
        coherence value per entry of ``topics`` (not a single aggregate).
    """
    settings = {
        "topics": topics,
        "texts": documents,
        "dictionary": dictionary,
        "coherence": coherence,
        "processes": 0,
        "topn": no_of_words,
    }
    return CoherenceModel(**settings).get_coherence_per_topic()
def top_topics(self, corpus, texts=None, dictionary=None, window_size=None,
               coherence='u_mass', topn=20, processes=-1):
    """Return this model's topics ordered from most to least coherent.

    All arguments are forwarded unchanged to :class:`CoherenceModel`
    (together with ``model=self``); ``topn`` is additionally used to decide
    how many terms describe each topic in the result.

    Parameters
    ----------
    corpus : iterable of list of (int, float)
        Bag-of-words corpus, passed to the coherence model as ``corpus``.
    texts : list of list of str, optional
        Tokenized texts, needed for coherence models that use a sliding
        window based (i.e. coherence=`c_something`) probability estimator.
    dictionary : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word ids and words, passed to the coherence model.
    window_size : int, optional
        Window size for coherence measures that use a boolean sliding
        window as their probability estimator.
    coherence : {'u_mass', 'c_v', 'c_uci', 'c_npmi'}, optional
        Coherence measure to be used.
    topn : int, optional
        Number of top words to be extracted from each topic.
    processes : int, optional
        Number of processes to use for the probability estimation phase.

    Returns
    -------
    list of (list of (float, str), float)
        One pair per topic: the topic's ``topn`` strongest terms as
        ``(weight, token)`` tuples, and the topic's coherence score.
        Pairs are sorted by score, highest first.

    """
    scorer = CoherenceModel(
        model=self,
        corpus=corpus,
        texts=texts,
        dictionary=dictionary,
        window_size=window_size,
        coherence=coherence,
        topn=topn,
        processes=processes,
    )
    per_topic_scores = scorer.get_coherence_per_topic()

    def describe(weights):
        # `weights` holds one float per vocabulary term for a single topic.
        strongest_ids = matutils.argsort(weights, topn=topn, reverse=True)
        return [(weights[term_id], self.id2word[term_id]) for term_id in strongest_ids]

    described_topics = [describe(weights) for weights in self.get_topics()]

    return sorted(
        zip(described_topics, per_topic_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, typ='original'):
    """Train one topic model per topic count and score each with c_v coherence.

    A model is trained for every ``num_topics`` in ``range(start, limit, step)``.

    Parameters
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : first number of topics to try
    step : increment between successive numbers of topics
    typ : 'original' trains ``gensim.models.ldamodel.LdaModel``,
        'mallet' trains ``gensim.models.wrappers.LdaMallet``

    Returns
    -------
    (model_list, coherence_values, topics_lis)
        Three parallel lists: the trained models, the aggregate coherence of
        each model, and the per-topic coherence values of each model.

    Raises
    ------
    ValueError
        If ``typ`` is neither 'mallet' nor 'original'.
    """
    # Reject unknown model types before any training happens. Previously an
    # unknown `typ` hit neither branch and failed later on an unbound `model`.
    if typ not in ('mallet', 'original'):
        raise ValueError("typ must be 'mallet' or 'original', got {!r}".format(typ))

    coherence_values = []
    model_list = []
    topics_lis = []
    for num_topics in tqdm(range(start, limit, step)):
        if typ == 'mallet':
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            mallet_path = 'mallet-2.0.8/bin/mallet'
            model = gensim.models.wrappers.LdaMallet(
                mallet_path, corpus=corpus, num_topics=num_topics,
                iterations=500, id2word=dictionary)
        else:
            model = gensim.models.ldamodel.LdaModel(
                corpus=corpus, id2word=dictionary, num_topics=num_topics,
                update_every=1, chunksize=100, passes=10, alpha='auto',
                iterations=500, per_word_topics=True)
        model_list.append(model)

        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        # Compute the per-topic values once and aggregate them, instead of
        # running the whole pipeline twice via get_coherence().
        per_topic = coherencemodel.get_coherence_per_topic()
        coherence_values.append(coherencemodel.aggregate_measures(per_topic))
        topics_lis.append(per_topic)

    return model_list, coherence_values, topics_lis
def get_coherence_per_topic(topiclist, token_version, coherence_model):
    """Score every topic in ``topiclist`` against a fixed Wikipedia dump.

    The topics are processed in slices of 2000. For each slice the topics are
    passed through ``filter_tokens`` with the reference dictionary and then
    scored by a fresh ``CoherenceModel`` using the coherence measure named by
    ``coherence_model``.

    :param topiclist: sliceable collection of topics
    :param token_version: forwarded to ``get_dict`` and ``get_corpus``
    :param coherence_model: name of the coherence measure (passed as ``coherence``)
    :return: flat list with the per-topic coherence values of all slices, in order
    """
    wiki_raw_filename = 'enwiki-20180920-pages-articles1.xml-p10p30302.bz2'
    slice_len = 2000

    dictionary = get_dict(wiki_raw_filename, token_version)
    wiki_corpus = get_corpus(wiki_raw_filename, token_version, dictionary)

    nbprint('Computing Coherence')
    coherences = []
    for start in range(0, len(topiclist), slice_len):
        stop = start + slice_len
        nbprint('Slice {}-{} of {}'.format(start, stop - 1, len(topiclist)))
        reduced_slice = filter_tokens(topiclist[start:stop], dictionary)
        scorer = CoherenceModel(
            topics=reduced_slice,
            texts=wiki_corpus.get_texts(),
            dictionary=dictionary,
            coherence=coherence_model,
        )
        coherences.extend(scorer.get_coherence_per_topic())
    return coherences
class TopicCoherence:
    """Scores a single document with a ``c_w2v`` coherence model.

    The coherence model is built once from a project LDA or LSI model (chosen
    by the type of ``config``) and then reused for every document.
    """

    def __init__(self, config):
        """Build the underlying topic model and its coherence model.

        :param config: an ``LDAConfig`` or ``LSIConfig`` instance
        :raises TypeError: if ``config`` is of neither type
        """
        # Exact-type match is kept from the original code; whether the config
        # classes are related by inheritance is not visible from here.
        if type(config) is LDAConfig:
            self.model = LDAModel(config)
        elif type(config) is LSIConfig:
            self.model = LSIModel(config)
        else:
            # Previously this fell through and failed below with an
            # AttributeError on the missing `self.model`.
            raise TypeError(
                "config must be an LDAConfig or LSIConfig, got " + type(config).__name__
            )

        self.dictionary = self.model.get_dictionary()
        _ = self.dictionary[0]  # This is only to "load" the dictionary.
        self.cm = CoherenceModel(
            model=self.model.get_model(),
            texts=self.model.get_docs(),
            dictionary=self.dictionary,
            coherence="c_w2v",
        )

    def get_coherence(self, doc):
        """Return the coherence of ``doc``, treated as one set of words.

        The document is tokenized with the model's tokenizer; tokens missing
        from the dictionary are skipped (and logged at DEBUG level). The
        remaining token ids are segmented with ``segmentation.s_one_set`` and
        scored by the coherence model.

        :param doc: raw document, as accepted by ``self.model.tokenizer.tokenize``
        :return: first element of ``get_coherence_per_topic`` for the document
        """
        tokens = self.model.tokenizer.tokenize(doc)
        token2id = self.dictionary.token2id

        # todo try truncated version of doc_ids, those that are in truncated dictionary
        doc_ids = []
        for token in tokens:
            try:
                tid = token2id[token]
            except KeyError:
                # Only a missing token is expected here; anything else
                # (including KeyboardInterrupt) must not be swallowed.
                logging.debug("Unknown token: " + str(token))
                continue
            doc_ids.append(tid)

        segmented_doc = segmentation.s_one_set([np.array(doc_ids)])
        return self.cm.get_coherence_per_topic(segmented_doc)[0]
def calculate_coherence(args, save=True):
    """Compute per-topic and aggregate coherence for saved topic sets.

    The run's ``config.yml`` (in the parent of ``args.input_dir``) tells where
    the training data lives. A gensim dictionary and a reference corpus file
    are located from that, every topic set returned by ``collect_topics`` is
    cut to ``args.top_n`` words and scored with ``CoherenceModel``.

    :param args: namespace providing ``input_dir``, ``reference_corpus``,
        ``start_at``, ``eval_every_n``, ``eval_last_only``,
        ``coherence_measure``, ``window_size`` and ``top_n``
    :param save: when true, merge the results into ``coherences.json`` next to
        ``config.yml`` and write the file
    :return: ``{measure_name: {idx: {"aggregate", "by_topic", "path"}}}``; when
        ``save`` is true and the file already existed, the merged content
    """
    topic_dir = Path(args.input_dir)
    parent_dir = topic_dir.parent
    config = load_yaml(parent_dir / "config.yml")

    try:
        data_dir = config["input_dir"]  # dvae, mallet
    except KeyError:
        data_dir = str(Path(config["data_path"]).parent)  # etm

    def load_training_dictionary():
        # Prefer the stored dictionary; build one if the file is missing.
        try:
            return Dictionary.load(str(Path(data_dir, "train-dict.npy")))
        except FileNotFoundError:
            return make_dictionary(data_dir)

    #### quick HACK to handle scratch directories, needs cleanup ###
    processed_name = Path(data_dir).name
    data_dir_map = {
        f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}": f"/scratch/{processed_name}/nytimes",
        f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}": f"/scratch/{processed_name}/wikitext",
        f"/workspace/topic-preprocessing/data/bbc/processed/{processed_name}": f"/scratch/{processed_name}/bbc",
    }

    ref_corpus = args.reference_corpus
    if ref_corpus in ("wikitext_full", "nytimes_full"):
        # for out-of-sample coherence
        data_dict = load_training_dictionary()
        if ref_corpus == "wikitext_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}"
        else:
            mapped_dir = f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}"
        ref_corpus_fname = "full.txt"
    else:
        # standard coherence
        ref_corpus_fname = f"{ref_corpus}.txt"  # can later update to external if needed
        mapped_dir = Path(data_dir_map[data_dir])
        scratch_dict = Path(mapped_dir, "train-dict.npy")
        scratch_corpus = Path(mapped_dir, ref_corpus_fname)

        if scratch_dict.exists() and scratch_corpus.exists():
            print("reading files from scratch", flush=True)
            data_dict = Dictionary.load(str(scratch_dict))
        else:
            print("loading files", flush=True)
            data_dict = load_training_dictionary()

            # copy to scratch directory
            print("copying files to scratch", flush=True)
            mapped_dir.mkdir(exist_ok=True, parents=True)
            shutil.copy(Path(data_dir, ref_corpus_fname), scratch_corpus)
            shutil.copy(Path(data_dir, "train-dict.npy"), scratch_dict)
    ### end hack ###

    topic_sets = collect_topics(
        topic_dir=topic_dir,
        start_at=args.start_at,
        eval_every_n=args.eval_every_n,
        eval_last_only=args.eval_last_only,
    )

    measure_name = gen_measure_name(
        args.coherence_measure, args.window_size, args.reference_corpus, args.top_n
    )
    per_checkpoint = {}
    coherence_results = {measure_name: per_checkpoint}
    reference_path = Path(mapped_dir, ref_corpus_fname)

    print("calculating coherence...", flush=True)
    for idx, path, topics in topic_sets:
        truncated = [words[:args.top_n] for words in topics]
        # The reference corpus is read again for every topic set.
        reference_text = load_tokens(reference_path)

        cm = CoherenceModel(
            topics=truncated,
            texts=reference_text,
            dictionary=data_dict,
            coherence=args.coherence_measure,
            window_size=args.window_size,
        )
        confirmed_measures = cm.get_coherence_per_topic()
        mean = cm.aggregate_measures(confirmed_measures)

        per_checkpoint[idx] = {
            "aggregate": float(mean),
            # needs to be python float to json-serialize
            "by_topic": [float(value) for value in confirmed_measures],
            "path": str(path),
        }

    if not save:
        return coherence_results

    output_path = parent_dir / "coherences.json"
    if output_path.exists():
        # TODO: currently broken, will overwrite different epochs
        merged = load_json(output_path)
        merged.update(**coherence_results)
        coherence_results = merged

    save_json(coherence_results, output_path)
    print("done!")
    return coherence_results
def perform_topic_modeling(self, input_local_root, files, titles, converted_local_root,
                           output_dir, pyLDAvis_output_file, th_output_dir, th_pyLDAvis_output_file,
                           max_no_topic=10, is_short_words_removed=True):
    """Run the full LDA pipeline: read, tokenize, model, evaluate, export.

    Steps, in order: read the input files, tokenize each document, optionally
    drop short words, POS-filter the tokens, build a dictionary and corpus,
    fit the LDA model, print several distributions and coherence scores, and
    export a pyLDAvis HTML page plus its Thai-localized version.

    Side effects: prints progress to stdout, appends to the files
    ``term_dist_topic`` and ``topic_term_dist`` in the current working
    directory, and writes the pyLDAvis HTML outputs.

    :param input_local_root: forwarded to ``Util.filter_file_to_read``
    :param files: forwarded to ``Util.filter_file_to_read``
    :param titles: document titles; ``len(titles)`` is used as the document count
    :param converted_local_root: forwarded to ``Util.filter_file_to_read``
    :param output_dir: prefix concatenated with ``pyLDAvis_output_file``
    :param pyLDAvis_output_file: file name of the pyLDAvis HTML export
    :param th_output_dir: forwarded to ``localize_pyLDAvis_to_thai``
    :param th_pyLDAvis_output_file: forwarded to ``localize_pyLDAvis_to_thai``
    :param max_no_topic: upper bound on the number of topics (default 10)
    :param is_short_words_removed: when true, tokens are first passed through
        ``TextPreProcessing.cut_character`` with ``self.num_cut``
    :return: None
    """
    print("========== PART 1 : Input Files ==========")
    data = Util.filter_file_to_read(input_local_root, files, converted_local_root)
    num_doc = len(titles)

    print("========== PART 2 : Data Preparation and Creating Word Tokenization ==========")
    # Set data into dataframe type
    data_df = self.to_dataframe(data, titles)

    inp_list = [
        TextPreProcessing.split_word(data_df['content'][num])
        for num in range(num_doc)
    ]

    # NOTE: this is the total token count over all documents, even though the
    # message (kept unchanged) says "Unique words".
    counter = sum(len(words) for words in inp_list)
    print("Unique words in this processing corpus: {0}".format(counter))

    if is_short_words_removed:
        # Remove character number is less than 2 words off
        new_lists = TextPreProcessing.cut_character(inp_list, self.num_cut)
    else:
        # new_lists is inp_list itself here, so the POS filtering below also
        # rewrites inp_list in place.
        new_lists = inp_list

    # Remove word is not noun and prop noun by pos_tag function
    for num in range(num_doc):
        new_lists[num] = TextPreProcessing.postag(new_lists[num])

    # Create dictionary and corpus from the filtered tokens
    dictionary2 = corpora.Dictionary(new_lists)
    dict_2 = {dictionary2[ID]: ID for ID in dictionary2.keys()}
    corpus2 = [dictionary2.doc2bow(text) for text in new_lists]

    # Header Title plus frequency in corpus
    corpus2 = TextPreProcessing.add_frequency(dict_2, corpus2, data_df, 10, num_doc)

    print("========== PART 3 : Generate LDA Model ==========")
    # Default number of topic is 10. If the number of documents is fewer than
    # the maximum number of topics, the number of documents is used as the
    # maximum number of topics. At least 2 topics are always used.
    max_no_topic = max(2, min(max_no_topic, num_doc))

    ldamodel = self.LDAmodel(dictionary2, corpus2, max_no_topic)
    term_dist_topic = ldamodel.show_topics(max_no_topic, 1000, log=True, formatted=False)

    # `with` guarantees the handle is closed even if the write fails.
    with open('term_dist_topic', 'a+') as handle:
        handle.write(str(term_dist_topic))
        handle.write("\n")

    print("========== PART 4 : Topic-term distribution ==========")
    topic_term_dist = TextDistribution.topicTerm_dist(dict_2, corpus2, [], term_dist_topic)
    print(topic_term_dist)

    with open('topic_term_dist', 'a+') as handle:
        handle.write(str(topic_term_dist))
        handle.write("\n")

    print("========== PART 4-1 : Document-topic (all) distribution ==========")
    doc_topic_dist = TextDistribution.docTopic_dist([], data_df, num_doc, inp_list, dictionary2, ldamodel)
    print(doc_topic_dist)

    print("========== PART 4-2 : Document-topic (min) distribution ==========")
    n_doc_intopic = TextDistribution.Ndoc_topic([], num_doc, data_df, inp_list, dictionary2, ldamodel)
    print(n_doc_intopic)

    print("========== PART 5 : Evaluate Model ==========")
    lda_coherence = CoherenceModel(ldamodel, corpus=corpus2, dictionary=dictionary2, coherence='u_mass')
    print(lda_coherence.get_coherence_per_topic())
    print("LDA umass score = %.4f" % (lda_coherence.get_coherence()))

    lda_coherence = CoherenceModel(ldamodel, texts=new_lists, dictionary=dictionary2, coherence='c_uci')
    print("LDA uci score = %.4f" % (lda_coherence.get_coherence()))

    print("========== PART 6 : Export pyLDAvis HTML ==========")
    vis = pyLDAvis.gensim.prepare(ldamodel, corpus2, dictionary=ldamodel.id2word)
    pyLDAvis.save_html(vis, output_dir + pyLDAvis_output_file)

    print("========== PART 7 : Convert pyLDAvis HTML to Thai==========")
    self.localize_pyLDAvis_to_thai(output_dir, pyLDAvis_output_file, th_output_dir, th_pyLDAvis_output_file)
print(coherence_model_1train_get)


# In[17]:

# Coherence (c_v) of model_1 measured on the test set.
coherence_model_1test = CoherenceModel(
    model=model_1,
    texts=processed_docs_1000test,
    dictionary=id_to_word_1000test,
    coherence='c_v',
)
coherence_model_1test_get = coherence_model_1test.get_coherence()
print(coherence_model_1test_get)


# In[18]:

# Coherence of each of the n topics on the test set.
coherence_model_1_per_topic = coherence_model_1test.get_coherence_per_topic()

# Uncomment to print the per-topic values:
# print(coherence_model_1_per_topic)


# #### Model #1 - Evaluate - Perplexity

# Calculate the perplexity metric. `log_perplexity` computes the per-word
# likelihood bound using a chunk of documents as the evaluation corpus, logs
# the statistics (including perplexity = 2^(-bound)) at INFO level, and
# returns the variational bound score calculated for each word.

# In[19]:

# Perplexity bound of model_1 on the train set (1000 pats dataset).
perplexity_model_1train = model_1.log_perplexity(corpus_1000train)
print(perplexity_model_1train)


# In[20]:
# In[92]:

# TODO (Lee) - confirm that filtered_data is indeed the correct dataset to
# pass to the texts param.
# Aggregate coherence (c_v) of model_lda.
coherence = CoherenceModel(
    model=model_lda,
    texts=filtered_data,
    dictionary=id_to_word,
    coherence='c_v',
)
coherence_1 = coherence.get_coherence()
coherence_1


# In[94]:

# Coherence for each of the n topics.
# Note: this rebinds coherence_1 from the aggregate score to the per-topic values.
coherence_1 = coherence.get_coherence_per_topic()
coherence_1


# In[97]:

# Explore the topics interactively.
pyLDAvis.enable_notebook()
viz_topics_1 = pyLDAvis.gensim.prepare(model_lda, corpus_train, id_to_word)
viz_topics_1

# TODO (Lee) - salient vs relevant terms in pyLDA ?


# ### Model 2 - Mallet model
def compute_coherence_score(lda_model,reviews):
    """Return the aggregate and the per-topic c_v coherence of ``lda_model``.

    The module-level ``dictionary`` is used as the gensim dictionary.

    :param lda_model: trained topic model, passed to ``CoherenceModel``
    :param reviews: reference texts, passed to ``CoherenceModel`` as ``texts``
    :return: ``(aggregate_coherence, per_topic_coherences)``
    """
    coherence = CoherenceModel(lda_model, texts=reviews, dictionary=dictionary, coherence="c_v")
    # Run the pipeline once: the aggregate score is derived from the per-topic
    # values instead of being recomputed from scratch by get_coherence().
    per_topic = coherence.get_coherence_per_topic()
    return coherence.aggregate_measures(per_topic), per_topic
def evaluate(self, topic_candidates=None, nbtopterms=None):
    """Score the topic candidates with u_mass, c_v, c_uci and c_npmi.

    This is a convenience method; the original author notes it is redundant
    and that evaluate_topics.py is the preferred place to compute coherence
    because it offers more features and metrics.

    :param topic_candidates: frame with columns ``term0`` ... ``term<k>``;
        defaults to ``self.topic_candidates``
    :param nbtopterms: number of leading terms per topic to score; defaults to
        ``self.nb_top_terms``
    :return: DataFrame with one row per topic candidate and the columns
        ``u_mass_eval``, ``c_v_eval``, ``c_uci_eval`` and ``c_npmi_eval``;
        also stored on ``self.eval_scores``
    """
    self.logg('evaluating topic candidates')

    # reference scores per topic for top topic terms
    nbtopterms = self.nb_top_terms if nbtopterms is None else nbtopterms
    if topic_candidates is None:
        topic_candidates = self.topic_candidates
    topic_candidates = topic_candidates.loc[:, 'term0': f'term{nbtopterms - 1}']
    topics_list = topic_candidates.values.tolist()

    def log_elapsed(started):
        # Report the wall-clock time since `started` as HH:MM:SS.
        secs = int(time() - started)
        self.logg(" done in {:02d}:{:02d}:{:02d}".format(
            secs // 3600, (secs // 60) % 60, secs % 60))

    self.logg('> u_mass')
    started = time()
    umass_model = CoherenceModel(
        topics=topics_list, corpus=self.corpus, dictionary=self.dict_from_corpus,
        coherence='u_mass', topn=nbtopterms, processes=self.processes)
    umass_scores = umass_model.get_coherence_per_topic(with_std=False, with_support=False)
    log_elapsed(started)

    self.logg('> c_v')
    started = time()
    cv_model = CoherenceModel(
        topics=topics_list, texts=self.texts, dictionary=self.dict_from_corpus,
        coherence='c_v', topn=nbtopterms, processes=self.processes)
    cv_scores = cv_model.get_coherence_per_topic()
    log_elapsed(started)

    # changed segmentation for c_uci and c_npmi from s_one_set to s_one_one (default)
    self.logg('> c_uci')
    started = time()
    uci_model = CoherenceModel(
        topics=topics_list, texts=self.texts, dictionary=self.dict_from_corpus,
        coherence='c_uci', topn=nbtopterms, processes=self.processes)
    cuci_scores = uci_model.get_coherence_per_topic()
    log_elapsed(started)

    self.logg('> c_npmi')
    started = time()
    uci_model.coherence = 'c_npmi'  # reusing precalculated probability estimates
    cnpmi_scores = uci_model.get_coherence_per_topic()
    log_elapsed(started)

    scores = pd.DataFrame({
        'u_mass_eval': umass_scores,
        'c_v_eval': cv_scores,
        'c_uci_eval': cuci_scores,
        'c_npmi_eval': cnpmi_scores,
    })
    scores.index = topic_candidates.index.copy()
    self.eval_scores = scores
    return scores
def _rerank_coherence_per_metric(self, metric, coherence_model=None):
    """Rerank the candidate terms of every topic using one coherence metric.

    Leave-one-out heuristic: coherence is computed for each "shifted" topic,
    i.e. the candidate set with exactly one term left out. A high score means
    the topic is more coherent without that term, so the terms whose removal
    yields the lowest scores are the ones that are kept.

    :param metric: name of the coherence measure; 'u_mass' is scored against
        ``self.corpus``, every other measure against ``self.texts``
    :param coherence_model: optional existing ``CoherenceModel`` to reuse; its
        ``coherence`` attribute is switched to ``metric``
    :return: DataFrame of token ids, ``self.nb_top_terms`` per topic, indexed
        by the topic index plus an appended ``metric`` level
    """
    if self.shifted_topics is None:
        self.shifted_topics = self._shift_topics()

    started = time()
    self.logg(
        f'Calculating topic candidates using {metric} coherence measure '
        f'on {self.nb_candidate_terms} candidate terms '
        f'for {self.nb_topics} topics')

    # calculate the scores for all shifted topics
    cm_kwargs = {
        'topics': self.shifted_topics,
        'dictionary': self.dict_from_corpus,
        'coherence': metric,
        'topn': self.nb_candidate_terms - 1,
        'processes': self.processes,
    }
    if metric == 'u_mass':
        cm_kwargs['corpus'] = self.corpus
    else:
        cm_kwargs['texts'] = self.texts

    if coherence_model is not None:
        cm = coherence_model
        cm.coherence = metric
    else:
        cm = CoherenceModel(**cm_kwargs)

    flat_scores = cm.get_coherence_per_topic()
    # one row per topic, one column per left-out candidate term
    score_matrix = np.reshape(flat_scores, (self.nb_candidate_terms, -1)).T

    # the highest values indicate the terms whose absence improves the topic
    # coherence most, thus we keep the first nb_top_terms indices of the
    # ascending order ...
    kept = np.argsort(score_matrix, axis=1)[:, :self.nb_top_terms]
    # ... and sort them back for convenience
    kept = np.sort(kept, axis=1)

    # replacing indices with token-ids
    id_rows = [
        self.topic_ids.values[row, kept[row]]
        for row in range(self.nb_topics)
    ]
    id_frame = pd.DataFrame.from_records(
        id_rows,
        columns=self.topic_terms.columns[:self.nb_top_terms],
        index=self.topic_ids.index,
    )
    tpx_ids = id_frame.assign(metric=metric).set_index('metric', append=True)

    elapsed = int(time() - started)
    self._statistics_[metric] = {'runtime': elapsed}
    self.logg(" done in {:02d}:{:02d}:{:02d}".format(
        elapsed // 3600, (elapsed // 60) % 60, elapsed % 60))
    return tpx_ids
def eval_coherence(topics, dictionary, corpus=None, texts=None, keyed_vectors=None, metrics=None,
                   window_size=None, suffix='', cores=1, logg=print, topn=10):
    """Compute several coherence metrics for a table of topics.

    :param topics: DataFrame with one topic per row and one term per cell
    :param dictionary: gensim dictionary. NOTE: it is modified in place when
        topic terms are missing from it (they are added as one-word documents)
    :param corpus: bag-of-words corpus; enables 'u_mass' by default
    :param texts: list of tokenized texts; enables 'c_v', 'c_npmi', 'c_uci' by default
    :param keyed_vectors: word vectors; enables 'c_w2v' by default
    :param metrics: explicit list of metric names; when None the metrics are
        chosen from whichever of ``corpus``/``texts``/``keyed_vectors`` is given
    :param window_size: forwarded to ``CoherenceModel``
    :param suffix: appended to each metric name to form the result column name
    :param cores: forwarded to ``CoherenceModel`` as ``processes``
    :param logg: callable used for progress messages
    :param topn: forwarded to ``CoherenceModel``
    :return: DataFrame indexed like ``topics`` with one column per metric, each
        cell holding that topic's entry from
        ``get_coherence_per_topic(with_std=True, with_support=True)``;
        ``None`` when none of ``corpus``, ``texts``, ``keyed_vectors`` is given
    """
    if not (corpus or texts or keyed_vectors):
        logg('provide corpus, texts and/or keyed_vectors')
        return

    if metrics is None:
        metrics = []
        if corpus is not None:
            metrics += ['u_mass']
        if texts is not None:
            metrics += ['c_v', 'c_npmi', 'c_uci']
        if keyed_vectors is not None:
            metrics += ['c_w2v']

    # add out of vocabulary terms to dictionary and documents
    in_dict = topics.applymap(lambda x: x in dictionary.token2id)
    oov_terms = set().union(*topics[~in_dict].apply(set))
    # Masked cells show up as non-string placeholders and are dropped; every
    # remaining term becomes its own one-word document.
    oov = sorted([term] for term in oov_terms if isinstance(term, str))
    logg(f'OOV: {oov}')
    if oov:
        dictionary.add_documents(oov, prune_at=None)
        # Item access is kept from the original; presumably it refreshes the
        # dictionary's id -> token mapping after the additions.
        _ = dictionary[0]

    # The reference texts are the same for every metric, so the (potentially
    # large) concatenated list is built once instead of once per metric.
    txt = texts + oov if texts else None

    scores = dict()
    topics_values = topics.values
    for metric in metrics:
        t0 = time()
        gc.collect()
        logg(metric)
        cm = CoherenceModel(
            topics=topics_values, dictionary=dictionary, corpus=corpus, texts=txt,
            coherence=metric, topn=topn, window_size=window_size, processes=cores,
            keyed_vectors=keyed_vectors)
        coherence_scores = cm.get_coherence_per_topic(with_std=True, with_support=True)
        scores[metric + suffix] = coherence_scores
        gc.collect()
        t1 = int(time() - t0)
        logg(" done in {:02d}:{:02d}:{:02d}".format(t1 // 3600, (t1 // 60) % 60, t1 % 60))

    df = pd.DataFrame(scores)
    df.index = topics.index
    gc.collect()
    return df