def lda_move_topics(dictionary, corpus, texts, limit, start=2, step=1):
    """
    Train one LDA model per candidate topic count and score each of them.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound (exclusive) of the topic counts to try
    start : First topic count to try
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models, one per topic count
    coherence_values : c_v coherence of each model, in the same order
    perplexity_values : Value returned by ``log_perplexity(corpus)`` for each
                        model, in the same order
    """
    trained, coherences, bounds = [], [], []

    for n_topics in range(start, limit, step):
        lda = LdaMulticore(corpus, num_topics=n_topics, id2word=dictionary,
                           passes=2, workers=2)
        scorer = CoherenceModel(model=lda, texts=texts,
                                dictionary=dictionary, coherence='c_v')

        trained.append(lda)
        coherences.append(scorer.get_coherence())
        bounds.append(lda.log_perplexity(corpus))

    return trained, coherences, bounds
def models_codherence_perplexity(texts, bows, dic, topic_start=100, topic_end=201, step=10,
                                 chunk=10, passes=3, cores=2):
    """
    Train a topic model for every topic count in a range and score each one.

    --------------------
    Parameter:
        texts: list of list of tokens (used for the coherence score)
        bows: list of list of BoWs (used for training and for log_perplexity)
        dic: dictionary of id <-> word
        topic_start, topic_end, step: arguments of the ``range`` of topic counts
        chunk: value passed as ``chunksize`` to the model
        passes: value passed as ``passes`` to the model
        cores: value passed as ``workers`` to the model

    Return:
        models, coherence_scores, perplexity_scores
        Three parallel lists: the trained models, their c_v coherence on
        ``texts``, and the result of ``log_perplexity(bows)``.
    """
    built = []
    coherences = []
    bounds = []

    for n_topics in range(topic_start, topic_end, step):
        print('Building model of %d topics' % n_topics)

        # Topic model for this topic count.
        lda = LdaMulticore(
            corpus=bows,
            id2word=dic,
            eta='auto',
            num_topics=n_topics,
            chunksize=chunk,
            passes=passes,
            workers=cores,
        )

        # Coherence evaluator bound to the freshly trained model.
        evaluator = CoherenceModel(model=lda, texts=texts, dictionary=dic, coherence='c_v')

        built.append(lda)
        coherences.append(evaluator.get_coherence())
        bounds.append(lda.log_perplexity(bows))

    return built, coherences, bounds
def check_perplexity(self, model: LdaMulticore, valid_corpus: list) -> bool:
    """
    Compute the perplexity from the per-word bound and append it to the log.

    - perplexity: in short, an entropy. For details, the book "The Elements of
      Statistical Learning" (10 cm thick!) explains it clearly without
      cutting corners.
    - per-word bound:
        - the return value of gensim's standard perplexity method
          ``log_perplexity``
        - the variational bound of documents from the corpus as
          E_q[log p(corpus)] - E_q[log q(corpus)]
        - the gap between the estimated and the true distribution... it is
          fine to think of it as the amount of uncertainty multiplied by -1

    :param model: LdaMulticore, gensim LDA model
    :param valid_corpus: list {iterable of (int, int or float)}, corpus
    :return: always True
    """
    per_word_bound = model.log_perplexity(valid_corpus)
    # Perplexity is 2 raised to the negated per-word bound.
    self.log.append(np.exp2(-per_word_bound))
    return True
def main():
    """
    Cross-validate LDA models over a range of topic counts and plot the scores.

    Reads a tab-separated count table from ``args.input``, converts it to
    bag-of-words samples and, for every K-fold split, trains one
    ``LdaMulticore`` model per topic count in ``args.rank_range``. The value
    returned by ``log_perplexity`` on the held-out fold is collected for each
    model, and the scores are drawn as a box plot (one box per topic count)
    saved to ``args.output_file``.
    """
    args = parse_args()

    print('* Loading data from ', args.input)
    data_df = pd.read_csv(args.input, header=0, index_col=0, sep='\t')
    samples_bow = counts_to_bow(data_df.values)

    print('* Training LDA models')
    print('\t - # folds', args.n_folds)
    print('\t - # Ranks', args.rank_range)
    print('* Perplexities: ')

    kf = KFold(n_splits=args.n_folds)
    perps = []
    Ks = []
    # The folds below are the only train/test partition that is used; a
    # separate hold-out split beforehand would be overwritten immediately.
    for train_index, test_index in kf.split(samples_bow):
        samples_train = [samples_bow[i] for i in train_index]
        samples_test = [samples_bow[i] for i in test_index]
        for K in args.rank_range:
            lda = LdaMulticore(samples_train, num_topics=K, workers=args.cores)
            perp = lda.log_perplexity(samples_test)
            print('\t {}: {}'.format(K, perp))
            Ks.append(K)
            perps.append(perp)

    print(perps, Ks)

    # One row per (fold, topic count) so seaborn can group by topic count.
    perp_df = pd.DataFrame({'log_perplexity': perps, 'num_topics': Ks})
    ax = sns.boxplot(x='num_topics', y='log_perplexity', data=perp_df)
    plot = ax.get_figure()
    plot.savefig(args.output_file)
# GENSIM start = time.time() lda_gensim_mc = LdaMulticore(gensim_tr_corpus, id2word=id2word, decay=decay, offset=offset, num_topics=NB_TOPICS, passes=max_iterations, batch=False, chunksize=batch_size, iterations=max_e_steps, eval_every=eval_every) gn_time = time.time() - start log_prep_gensim_mc = lda_gensim_mc.log_perplexity(gensim_te_corpus) preplexity_gensim_mc = np.exp(-1. * log_prep_gensim_mc) print("gensim run time and perplexity: {}, {}".format(gn_time, preplexity_gensim_mc)) print("sklearn run time and perplexity: {}, {}".format(sk_time, sklearn_perplexity)) # Lets have a look to the topics topic_words = dict() gensim_topics = lda_gensim_mc.show_topics(formatted=False) def sklearn_show_topics(model, feature_names, n_top_words): sk_topics = [] for topic_idx, topic in enumerate(model.components_):
def fit_tm_gensim(
    corpus: 'gensim.corpus' = None,
    dictionary: 'Dictionary' = None,
    text: list = None,
    range_topics: list = None,
    passes: int = 10,
    per_word_topics: bool = True,
) -> dict:
    """
    fit topic modeling model gensim, multicore (using LdaMulticore)

    One model is trained per entry of `range_topics`; each model is scored
    with c_v coherence and with `log_perplexity` on the training corpus.

    Parameters
    ----------
    corpus : 'gensim.corpus' aka {iterable of list of (int, float), scipy.sparse.csc}
        Stream of document vectors or sparse matrix of shape
        (`num_terms`, `num_documents`)
    dictionary: 'Dictionary' aka {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
        Mapping from word IDs to words. It is used to determine the vocabulary
        size, as well as for debugging and topic printing.
    text : list
        list text tokens like text at `corpora.Dictionary`
    range_topics: list
        list range topics, like `[20, 25, 30, 35, 40]`.
        `None` (the default) is treated as "nothing to fit".
    passes: int (default = 10)
        number of passes through the corpus during training
    per_word_topics : bool (default = True)
        if True, the model also computes a list of topics, sorted in
        descending order of most likely topics for each word, along with
        their phi values multiplied by the feature length (i.e. word count)

    Returns
    -------
    meta_model : dict
        maps each number of topics to a dict with the items:
        `model`, `coherence`, `perplexity` and `time_fit`.
        `time_fit` holds `hours`, `min` (minutes past the hour) and `sec`
        (the total elapsed whole seconds, not the remainder after minutes).
    """
    meta_model = {}
    if range_topics is None:
        return meta_model

    for num_topic in range_topics:
        print(f'#topic {num_topic} ..........')
        time_start = datetime.now()

        # fit models TM
        model_gensim = LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topic,
            passes=passes,
            # Forward the caller's choice instead of a hard-coded True.
            per_word_topics=per_word_topics,
        )

        # evaluation
        # coherence
        coherence_model = CoherenceModel(
            model=model_gensim, texts=text, dictionary=dictionary, coherence='c_v'
        )
        coherence_value = coherence_model.get_coherence()
        print(f'\tcoherence score: {coherence_value}')

        # perplexity
        perplexity_value = model_gensim.log_perplexity(corpus)
        print(f'\tperplexity score: {perplexity_value}')

        # time
        elapsed = datetime.now() - time_start
        print(f'\n\ttime: {elapsed}')
        # total_seconds() keeps whole days, which timedelta.seconds discards.
        sec = int(elapsed.total_seconds())
        hours, remainder = divmod(sec, 3600)
        minutes = remainder // 60

        meta_model[num_topic] = {
            'model': model_gensim,
            'coherence': coherence_value,
            'perplexity': perplexity_value,
            'time_fit': {'hours': hours, 'min': minutes, 'sec': sec},
        }

    return meta_model
alpha='asymmetric', eval_every=eval_every, workers=3, random_state=seed) # Check resulting topics. listOfTopics = ldaModel.print_topics(num_topics=numberOfTopics, num_words=15) for index, i in enumerate(listOfTopics): string = str(i[1]) for c in "0123456789+*\".": string = string.replace(c, "") string = string.replace(" ", " ") print(string) # calculate & display perplexity print('\nPerplexity: ', ldaModel.log_perplexity( corpus)) # a measure of how good the model is. lower the better. # calculate & display coherence coherenceModel = CoherenceModel(model=ldaModel, texts=document, dictionary=dictionary, coherence='c_v') ldaCoherence = coherenceModel.get_coherence() print('\nCoherence Score: ', ldaCoherence) # assign a file name based on the loop number so that models aren't overridden during successive iterations. path = './models/both/nouns_only' if not os.path.exists(path): os.makedirs(path) ldaModel.save(f'./models/both/nouns_only/model1-{loopNum}.model')
class GensimMalletTopicExtractor:
    """
    End-to-end topic extraction pipeline built on gensim.

    The pipeline tokenizes raw documents, removes stop words, detects
    bigrams/trigrams, lemmatizes with spaCy, builds a dictionary and a
    bag-of-words corpus, and trains either an ``LdaMallet`` or an
    ``LdaMulticore`` model. Helpers are provided to scan a range of topic
    counts by c_v coherence and to summarize the dominant topic per document.
    """

    # Default MALLET install location used when ``enable_mallet`` is True.
    # Override these class attributes to point at another installation.
    DEFAULT_MALLET_HOME = r'C:/mallet-2.0.8/'
    DEFAULT_MALLET_PATH = 'C:\\mallet-2.0.8\\bin\\mallet'

    def __init__(self, language='english', stopwords_extent=None):
        """
        :param language: 'english', 'french' or 'spanish'
        :param stopwords_extent: extra stop word(s) to add to the NLTK list;
            either a single string or a list/tuple/set of strings
        :raises ValueError: if ``language`` is not supported
        """
        self.language2la = {
            'english': 'en',
            'french': 'fr',
            'spanish': 'es'
        }
        if language not in self.language2la:
            raise ValueError('Language must be "english", "french" or "spanish"')
        self.language = language
        self.stop_words = list(stopwords.words(self.language))
        # isinstance() is required here: comparing the value to the type
        # objects with ``is`` never matches, so extras were silently dropped.
        if isinstance(stopwords_extent, str):
            # A single word must be appended; extend() would add its letters.
            self.stop_words.append(stopwords_extent)
        elif isinstance(stopwords_extent, (list, tuple, set, frozenset)):
            self.stop_words.extend(stopwords_extent)
        self.df_topic_sents_keywords = None
        self.bigram = None
        self.bigram_phraser = None
        self.trigram = None
        self.trigram_phraser = None
        self.vis = None
        self.data = None
        self.data_words = None
        self.data_words_nostops = None
        self.data_words_bigrams = None
        self.data_words_trigrams = None
        self.nlp = None
        self.data_lemmatized = None
        self.id2word = None
        self.texts = None
        self.corpus = None
        self.mallet_path = None
        self.lda_model = None
        self.coherence_model_lda = None
        self.coherence_lda = None
        self.coherence_values = []
        self.model_list = []
        self.optimal_number_of_topics = None
        self.optimal_model = None
        self.optimal_topics = None

    @staticmethod
    def sent_to_words(sentences, remove_punctuation=True):
        """
        Lazily tokenize each sentence with ``simple_preprocess``.

        ``remove_punctuation`` is forwarded as the ``deacc`` argument.
        NOTE(review): per the gensim documentation ``deacc`` strips accent
        marks; confirm the parameter name reflects the intended behavior.
        """
        for sentence in sentences:
            yield(simple_preprocess(str(sentence), deacc=remove_punctuation))

    def remove_stopwords(self, texts):
        """Re-tokenize every document and drop tokens found in ``self.stop_words``."""
        # A set gives constant-time membership tests inside the nested loop.
        stop_set = set(self.stop_words)
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_set]
                for doc in texts]

    def make_bigrams(self, texts):
        """Train a bigram phraser on ``self.data_words`` and apply it to ``texts``."""
        self.bigram = Phrases(self.data_words, min_count=5, threshold=100)
        self.bigram_phraser = Phraser(self.bigram)
        return [self.bigram_phraser[doc] for doc in texts]

    def make_trigrams(self, texts):
        """
        Train a trigram phraser on the bigrammed ``texts`` and apply it.

        Requires ``make_bigrams`` to have been called first, because it uses
        ``self.bigram_phraser``.
        """
        tokens_ = self.bigram_phraser[texts]
        self.trigram = Phrases(tokens_, threshold=100)
        self.trigram_phraser = Phraser(self.trigram)
        return [self.trigram_phraser[self.bigram_phraser[doc]] for doc in texts]

    def lemmatization(self, texts, allowed_postags=None):
        """
        Lemmatize each document with ``self.nlp``, keeping only allowed POS tags.

        POS tag reference: https://spacy.io/api/annotation

        :param texts: iterable of token lists
        :param allowed_postags: POS tags to keep; defaults to
            ['NOUN', 'ADJ', 'VERB', 'ADV']
        :return: list of lemma lists, one per document
        """
        if allowed_postags is None:
            allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        texts_out = []
        for sent in texts:
            doc = self.nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    def view_terms_frequency(self, text_id, first_words=20):
        """
        Pretty-print (term, frequency) pairs of one or several documents.

        :param text_id: index of a single document, or a slice selecting
            several documents of ``self.corpus``
        :param first_words: maximum number of terms shown per document
        """
        selected = self.corpus[text_id]
        if not isinstance(text_id, slice):
            # A single index yields one document; wrap it so the nested
            # comprehension below always iterates over documents.
            selected = [selected]
        # Human readable format of corpus (term-frequency)
        list_ = [[(self.id2word[id_], freq) for id_, freq in text[:first_words]]
                 for text in selected]
        pprint(list_)

    def visualize_lda(self):
        """Prepare the pyLDAvis visualization of ``self.lda_model`` and print it."""
        # Visualize the topics
        # pyLDAvis.enable_notebook()
        self.vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        print(self.vis)

    def instanciate_model(self, num_topics, passes, iterations,
                          enable_mallet, optimize_interval, topic_threshold,
                          show_topics_on_creation=False):
        """
        Train a model on ``self.corpus`` and store it in ``self.lda_model``.

        :param num_topics: number of topics to fit
        :param passes: passes over the corpus (LdaMulticore only)
        :param iterations: iterations passed to the model
        :param enable_mallet: if exactly ``True`` an ``LdaMallet`` model is
            built, otherwise an ``LdaMulticore`` model
        :param optimize_interval: passed to ``LdaMallet``
        :param topic_threshold: passed to ``LdaMallet``
        :param show_topics_on_creation: print the topics after training
        """
        if enable_mallet is True:
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            os.environ.update({'MALLET_HOME': self.DEFAULT_MALLET_HOME})
            self.mallet_path = self.DEFAULT_MALLET_PATH
            self.lda_model = LdaMallet(self.mallet_path,
                                       corpus=self.corpus,
                                       num_topics=num_topics,
                                       id2word=self.id2word,
                                       iterations=iterations,
                                       optimize_interval=optimize_interval,
                                       topic_threshold=topic_threshold)
            print('Mallet LDA model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.show_topics(formatted=False))
        else:
            self.lda_model = LdaMulticore(corpus=self.corpus,
                                          id2word=self.id2word,
                                          num_topics=num_topics,
                                          random_state=100,
                                          chunksize=500,
                                          passes=passes,
                                          iterations=iterations,
                                          per_word_topics=True)
            print('LDA_MultiCore model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.print_topics())

    def extract_topics(self, data, num_topics, passes=10, iterations=500,
                       enable_mallet=True, optimize_interval=0, topic_threshold=0.0):
        """
        Run the whole preprocessing pipeline on ``data`` and train one model.

        Every intermediate result (tokens, n-grams, lemmas, dictionary,
        corpus, model, coherence) is stored on the instance.
        """
        self.data = data

        print('\nEXTRACTING ' + str(num_topics) + ' TOPICS')
        self.data_words = list(self.sent_to_words(self.data, True))

        # Remove Stop Words
        print('\nRemoving stopwords')
        self.data_words_nostops = self.remove_stopwords(self.data_words)

        # Form Bigrams
        print('Looking for bigrams')
        self.data_words_bigrams = self.make_bigrams(self.data_words_nostops)

        # Form Trigrams
        print('Looking for trigrams')
        self.data_words_trigrams = self.make_trigrams(self.data_words_nostops)

        # Initialize spacy model, keeping only tagger component (for efficiency)
        # python3 -m spacy download en
        print('Loading Spacy with ' + self.language + ' dictionary')
        self.nlp = spacy.load(self.language2la[self.language], disable=['parser', 'ner'])

        # Do lemmatization keeping only noun, adj, vb, adv
        print('Lemmatizing')
        self.data_lemmatized = self.lemmatization(self.data_words_trigrams,
                                                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

        # Create Dictionary
        print('Creating dictionary')
        self.id2word = corpora.Dictionary(self.data_lemmatized)

        # Create Corpus
        print('Creating corpus')
        self.texts = self.data_lemmatized

        # Term Document Frequency
        print('Computing document frequency')
        self.corpus = [self.id2word.doc2bow(text) for text in self.texts]

        # Build LDA model
        print('\nEnable_mallet is', enable_mallet, '\n')
        self.instanciate_model(num_topics, passes, iterations,
                               enable_mallet, optimize_interval, topic_threshold,
                               show_topics_on_creation=True)

        # print(self.lda_model[self.corpus])

        # The printed value is whatever log_perplexity() returns (a per-word
        # likelihood bound), not the perplexity itself. Only models that
        # expose log_perplexity are scored.
        if hasattr(self.lda_model, 'log_perplexity'):
            print('\nPerplexity: ', self.lda_model.log_perplexity(self.corpus))

        # Compute Coherence Score
        print('\nComputing coherence model')
        self.coherence_model_lda = CoherenceModel(model=self.lda_model,
                                                  texts=self.data_lemmatized,
                                                  dictionary=self.id2word,
                                                  coherence='c_v')
        print('Getting coherence')
        self.coherence_lda = self.coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', self.coherence_lda)

        if enable_mallet is False:
            self.visualize_lda()

    def view_optimal_topics(self, num_words=20):
        """Pretty-print the topics of ``self.optimal_model``."""
        pprint(self.optimal_model.print_topics(num_words=num_words))

    def compute_coherence_values(self, limit, start=2, step=3, passes=10, iterations=500,
                                 enable_mallet=True, optimize_interval=0, topic_threshold=0.0):
        """
        Compute c_v coherence for various number of topics

        Trains one model per value of ``range(start, limit, step)``, plots the
        coherence curve, and keeps the best-scoring model in
        ``self.optimal_model``. ``self.model_list`` and
        ``self.coherence_values`` are rebuilt on every call.

        Parameters:
        ----------
        limit : Max num of topics (exclusive)
        start : First number of topics to try
        step : Increment between numbers of topics

        Raises:
        ------
        ValueError : if ``range(start, limit, step)`` is empty
        """
        topic_counts = list(range(start, limit, step))
        if not topic_counts:
            raise ValueError('No topic counts to evaluate: range(start, limit, step) is empty')

        # Start from a clean slate so repeated calls do not mix runs; the
        # plotted x values must line up one-to-one with the coherence values.
        self.model_list = []
        self.coherence_values = []

        for num_topics in topic_counts:
            print('\n' + '*'*10 + ' COMPUTING COHERENCE FOR ' + str(num_topics) + ' TOPICS ' + '*'*10)

            self.instanciate_model(num_topics, passes, iterations,
                                   enable_mallet, optimize_interval, topic_threshold,
                                   show_topics_on_creation=False)
            self.model_list.append(self.lda_model)
            coherence_model = CoherenceModel(model=self.lda_model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            self.coherence_values.append(coherence_model.get_coherence())

        # Show graph
        plt.plot(topic_counts, self.coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        # Labels must be a sequence; a bare string is split into characters.
        plt.legend(("coherence_values",), loc='best')
        plt.show()

        # Print the coherence scores
        for m, cv in zip(topic_counts, self.coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

        optimal_model_index = self.coherence_values.index(max(self.coherence_values))
        # Look the count up directly so that ``step`` is taken into account.
        self.optimal_number_of_topics = topic_counts[optimal_model_index]
        self.optimal_model = self.model_list[optimal_model_index]
        print('\nOptimal number of topics is ' + str(self.optimal_number_of_topics) +
              ' with coherence score : ' + str(self.coherence_values[optimal_model_index]))
        self.optimal_topics = self.optimal_model.show_topics(num_topics=self.optimal_number_of_topics,
                                                             num_words=20, formatted=False)
        self.view_optimal_topics()

    def format_topics_sentences(self, ldamodel=None):
        """
        Build a table with the dominant topic of every document.

        :param ldamodel: model to use; defaults to ``self.optimal_model`` and
            then to ``self.lda_model``
        :return: DataFrame with columns 'Dominant_Topic', 'Perc_Contribution',
            'Topic_Keywords' plus the original documents in column ``0``
        :raises ValueError: if no model is available
        """
        if ldamodel is None:
            ldamodel = self.optimal_model if self.optimal_model is not None else self.lda_model
        if ldamodel is None:
            raise ValueError('No model available: pass ldamodel or train a model first')

        # Models trained with per_word_topics=True return a tuple per
        # document whose first element is the topic distribution.
        per_word = getattr(ldamodel, 'per_word_topics', False)

        rows = []
        doc_ids = []
        # Get main topic in each document
        for i, row in enumerate(ldamodel[self.corpus]):
            if per_word:
                row = row[0]
            if not row:
                # No topic assigned; the document keeps empty topic columns.
                continue
            # Dominant topic = highest proportion (first one wins on ties).
            topic_num, prop_topic = max(row, key=lambda x: x[1])
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            rows.append([int(topic_num), round(prop_topic, 4), topic_keywords])
            doc_ids.append(i)

        # Building the frame once avoids the removed DataFrame.append and
        # keeps each row aligned with its document index.
        sent_topics_df = pd.DataFrame(
            rows,
            columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'],
            index=doc_ids,
        )

        # Add original text to the end of the output
        contents = pd.Series(self.data)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_most_representative_documents(self):
        """Print, for each topic, the document with the highest contribution."""
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()

        # Keep the single best document of every dominant topic.
        best_per_topic = [
            grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
            for _, grp in self.df_topic_sents_keywords.groupby('Dominant_Topic')
        ]
        if not best_per_topic:
            return

        sent_topics_sorteddf_mallet = pd.concat(best_per_topic, axis=0)

        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

        # Format
        sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

        # Show
        for i, text in enumerate(sent_topics_sorteddf_mallet['Text']):
            print(i, text)

    def get_topic_distribution(self):
        """Print one row per topic: keywords, document count and document share."""
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()

        df = self.df_topic_sents_keywords

        # Number of Documents for Each Topic
        topic_counts = df['Dominant_Topic'].value_counts()

        # Percentage of Documents for Each Topic
        topic_contribution = (topic_counts / topic_counts.sum()).round(4)

        # Topic Number and Keywords, reduced to one row per topic so that it
        # shares the per-topic index of the two series above.
        topic_keywords = (df[['Dominant_Topic', 'Topic_Keywords']]
                          .dropna(subset=['Dominant_Topic'])
                          .drop_duplicates('Dominant_Topic')
                          .set_index('Dominant_Topic')['Topic_Keywords'])

        # Concatenate Column wise
        df_dominant_topics = pd.concat([topic_keywords, topic_counts, topic_contribution], axis=1)

        # Change Column names
        df_dominant_topics.columns = ['Topic_Keywords', 'Num_Documents', 'Perc_Documents']
        df_dominant_topics = df_dominant_topics.rename_axis('Dominant_Topic').reset_index()

        # Show
        print(df_dominant_topics)
# Wrap the test document-term matrix as a gensim corpus.
test_corpus = Sparse2Corpus(test_dtm, documents_columns=False)

# Time LdaMulticore training for each (workers, num_topics) combination and
# record the test perplexity of each resulting model.
# NOTE(review): the nesting below is an assumption — confirm which loop level
# the CSV export and exit() originally belonged to.
timing = []
for workers in [8, 16]:
    for num_topics in [10, 50]:
        print('start', workers, num_topics, end=' ')
        start = time()
        lda = LdaMulticore(corpus=train_corpus, num_topics=num_topics, id2word=id2word,
                           chunksize=1000, passes=1, eval_every=None, workers=workers,
                           random_state=42)
        duration = time() - start
        # Perplexity is 2 raised to the negated value from log_perplexity().
        test_perplexity = 2**(-lda.log_perplexity(test_corpus))
        timing.append([workers, num_topics, duration, test_perplexity])
        print(format_time(duration), test_perplexity)
    # Writes all rows collected so far; the file name carries the current
    # worker count.
    pd.DataFrame(timing, columns=[
        'workers', 'num_topics', 'duration', 'test_perplexity'
    ]).to_csv(f'timings_{workers}.csv', index=False)
# NOTE(review): if this exit() is at module level, nothing below it runs.
exit()

test_vocab = test_dtm.count_nonzero()
perplexity, coherence = [], []
for num_topics, passes in model_params:
    # One directory per (num_topics, passes) combination.
    model_path = vocab_path / str(num_topics) / str(passes)
    if not model_path.exists():
        model_path.mkdir(exist_ok=True, parents=True)