def __buildNMF(self, num_topics, chunksize, passes):
    self.__model = Nmf(self.__corpus,
                       id2word=self.__corpus.getDictionary(),
                       num_topics=num_topics,
                       chunksize=chunksize,
                       passes=passes,
                       eval_every=None,
                       random_state=10)
def topic_modeling(method, num_topics, corpus, dictionary):
    if method == 'LDA':
        # Build LDA model
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                random_state=42,
                                                update_every=1,
                                                chunksize=10,
                                                passes=10,
                                                alpha='symmetric',
                                                iterations=100,
                                                per_word_topics=True)
    elif method == 'NMF':
        # Build NMF model
        model = Nmf(corpus=corpus,
                    num_topics=num_topics,
                    id2word=dictionary,
                    chunksize=10,
                    passes=10,
                    kappa=0.5,
                    w_max_iter=200,
                    h_max_iter=50,
                    eval_every=1,
                    normalize=True,
                    random_state=42)
    else:
        raise ValueError('method is invalid')
    return model
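# Hedged usage sketch for topic_modeling() above. The tiny sample corpus and the
# variable names are illustrative assumptions, not taken from the original; the
# Dictionary/doc2bow calls are standard gensim.
from gensim.corpora import Dictionary

sample_docs = [['economy', 'inflation', 'rates'],
               ['football', 'league', 'match'],
               ['inflation', 'prices', 'economy']]

sample_dictionary = Dictionary(sample_docs)
sample_corpus = [sample_dictionary.doc2bow(doc) for doc in sample_docs]

nmf_model = topic_modeling('NMF', num_topics=2,
                           corpus=sample_corpus, dictionary=sample_dictionary)
for topic_id, topic_words in nmf_model.print_topics(num_topics=2):
    print(topic_id, topic_words)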
def load(self, model_name):
    self.__modelName = model_name
    if model_name == 'lda':
        self.__model = LdaMulticore.load(self.__modelFile)
    elif model_name == 'nmf':
        self.__model = Nmf.load(self.__modelFile)
def create_nmf_model(id_dict, corpus, num_topics):
    nmf_model = Nmf(corpus=corpus,
                    id2word=id_dict,
                    num_topics=num_topics,
                    random_state=100,
                    chunksize=100,
                    passes=50)
    return nmf_model
def __buildLDA(self, num_topics, chunksize, passes):
    self.__model = LdaMulticore(self.__corpus,
                                id2word=self.__corpus.getDictionary(),
                                num_topics=num_topics,
                                chunksize=chunksize,
                                passes=passes,
                                eval_every=None,
                                workers=40,
                                random_state=10)
def find_cv():
    # Find number of K
    for num in topic_nums:
        nmf = Nmf(corpus=corpus,
                  num_topics=num,
                  id2word=dictionary,
                  normalize=True)
        cm = CoherenceModel(model=nmf,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        coherence_scores.append(round(cm.get_coherence(), 5))
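# A hedged sketch of the module-level names find_cv() relies on (topic_nums,
# corpus, dictionary, texts, coherence_scores). The raw_documents placeholder and
# the candidate topic counts are assumptions for illustration only.
import numpy as np
from gensim.corpora import Dictionary

raw_documents = ['replace with your own documents']   # hypothetical input corpus
texts = [doc.split() for doc in raw_documents]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

topic_nums = list(np.arange(5, 40, 5))   # candidate numbers of topics to try
coherence_scores = []                    # filled in by find_cv()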
def __create_model(self, algo, topic_qtt):
    model = None
    if algo == TopicModelingAlgorithm.LDA:
        model = LdaModel(corpus=self.__corpus,
                         num_topics=topic_qtt,
                         id2word=self.__id2_words,
                         random_state=1)
    elif algo == TopicModelingAlgorithm.LSA:
        model = LsiModel(corpus=self.__corpus,
                         num_topics=topic_qtt,
                         id2word=self.__id2_words)
    elif algo == TopicModelingAlgorithm.NMF:
        model = Nmf(corpus=self.__corpus,
                    num_topics=topic_qtt,
                    random_state=1)
    return model
def nmf_search(texts, query, num_topics, passes=20, random_state=None):
    tfidf_model, dic, text_tfidf_weights = get_tfidfmodel_and_weights(texts)
    # Build the NMF model
    nmf_model = Nmf(corpus=text_tfidf_weights,
                    id2word=dic,
                    num_topics=num_topics,
                    passes=passes,
                    random_state=random_state)
    # Convert the TF-IDF document vectors into topic-based vectors
    nmf_weights = nmf_model[text_tfidf_weights]
    index = MatrixSimilarity(nmf_weights, num_features=len(dic))
    # Build the topic-based vector for the query
    query_bows = get_bows([query], dic)
    query_tfidf_weights = get_weights(query_bows, dic, tfidf_model)
    query_nmf_weights = nmf_model[query_tfidf_weights]
    # Rank documents by similarity to the query
    sims = index[query_nmf_weights[0]]
    return sorted(enumerate(sims), key=lambda x: x[1], reverse=True)
def nmf(filename):
    # tf-idf
    min_df = 10
    max_df = 0.60
    max_features = 5000
    # sklearn-model nmf

    # get data
    newspaper_input = pd.read_csv(filename, na_filter=False)
    newspaper_input['processed_text'] = newspaper_input['article'].apply(process_text)
    texts = newspaper_input['processed_text']

    # create dictionary to pass as input to gensim model
    dictionary = Dictionary(newspaper_input['processed_text'])
    # filter out words that are above or below the thresholds set
    dictionary.filter_extremes(no_below=10, no_above=0.60, keep_n=5000)
    # convert to bag of words (corpus) to pass to gensim nmf model
    # [[(word_id, # times word appears in document),...],...]
    corpus = [dictionary.doc2bow(text) for text in texts]

    # find optimal number of topics using gensim NMF https://radimrehurek.com/gensim/models/nmf.html
    # testing topic numbers 10,15,20...55 to find best number to fit the data
    topic_nums = list(np.arange(10, 56, 5))
    coherence_scores = []
    for num in topic_nums:
        # initialize NMF model
        nmf = Nmf(
            corpus=corpus,
            num_topics=num,
            id2word=dictionary,
            chunksize=500,  # Number of documents to be used in each training chunk
            passes=10,  # Number of full passes over the training corpus
            kappa=0.1,  # Gradient descent step size
            minimum_probability=0.001,
            w_max_iter=300,  # Maximum number of iterations to train W per each batch
            w_stop_condition=0.0001,  # If error difference gets less than that, training of W stops for the current batch
            h_max_iter=100,
            h_stop_condition=0.001,
            normalize=True,
            random_state=42)
        # initialize Coherence Model https://radimrehurek.com/gensim/models/coherencemodel.html
        # Calculate topic coherence for topic models
        cm = CoherenceModel(model=nmf,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        coherence_scores.append(round(cm.get_coherence(), 5))

    # get list of different topic numbers and their respective scores
    scores = list(zip(topic_nums, coherence_scores))
    # sort scores by score (not topic_num)
    scores = sorted(scores, key=itemgetter(1), reverse=True)
    # get the best number of topics
    best_num_topics, best_coherence_score = scores[0]
    # best_coherence_score = scores[0][1]
    print('scores: ', scores)
    print('num_topics: ', str(best_num_topics))
    print('coherence: ', str(best_coherence_score))
    # print(df.head())

    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),  # unigrams and bigrams
        max_df=0.60,
        min_df=10,
        max_features=5000,
        preprocessor=' '.join)
    # fit+transform: returns document-term matrix (frequency of word i in document j)
    tfidf = tfidf_vectorizer.fit_transform(texts)
    # all the words we'll be looking at
    tfidf_fn = tfidf_vectorizer.get_feature_names()

    # grid search for best alpha, l1_ratio combination
    # measured by lowest sum squared residual
    # l1_ratio: regularization mixing parameter (0 => l2 penalty, 1 => l1 penalty, (0,1) => mixture)
    # alpha: constant that multiplies the regularization terms (0 => no regularization)
    squared_residuals = []
    params = {}
    models = []
    sorted_articles_dfs = []
    complete_topics_dfs = []
    alphas = list(np.arange(0.0, 1.2, 0.2))
    l1_ratios = list(np.arange(0.0, 1.2, 0.2))
    count_params = 0
    successes = 0
    count_successes = {}
    for a in alphas:
        for b in l1_ratios:
            # print('alpha: {}, l1_ratio: {}'.format(a,b))
            # learn a model
            nmf = NMF(
                n_components=best_num_topics,
                init='nndsvd',  # Non-negative double singular value decomposition
                max_iter=500,
                l1_ratio=b,
                solver='cd',  # coordinate descent
                alpha=a,
                tol=0.0001,  # 0.001
                random_state=42).fit(tfidf)
            try:
                # transforms documents -> document-term matrix, transforms data according to model
                docweights = nmf.transform(tfidf)  # (articles x topics)

                # topic dataframe: (best_num_topics x 8)
                # (topic num : top 8 words that describe the topic)
                n_top_words = 8
                topic_df = topic_table(nmf, tfidf_fn, n_top_words).T

                # clean the topic words
                topic_df['topics'] = topic_df.apply(lambda x: [' '.join(x)], axis=1)
                topic_df['topics'] = topic_df['topics'].str[0]
                topic_df['topics'] = topic_df['topics'].apply(lambda x: whitespace_tokenizer(x))
                topic_df['topics'] = topic_df['topics'].apply(lambda x: unique_words(x))
                topic_df['topics'] = topic_df['topics'].apply(lambda x: [' '.join(x)])
                topic_df['topics'] = topic_df['topics'].str[0]

                # clean topic dataframe
                topic_df = topic_df['topics'].reset_index()
                topic_df.columns = ['topic_num', 'topics']
                topics = topic_df[['topic_num', 'topics']]

                # assign topics to each article
                title = newspaper_input['title'].tolist()
                df_temp = pd.DataFrame({
                    'title': title,
                    'topic_num': docweights.argmax(axis=1)
                })
                merged_topic = df_temp.merge(topic_df, on='topic_num', how='left')
                complete_df = merged_topic.merge(newspaper_input, on='title', how='left')
                # complete_df = complete_df.drop('processed_text', axis=1)  # maybe unecessary ?
                complete_df = complete_df.drop_duplicates(subset=['title'])
                sorted_articles = complete_df.sort_values(by=['topic_num'])

                # get num articles per topic
                num_articles_per_topic = []
                for topic in range(best_num_topics):
                    count = 0
                    for index, row in sorted_articles.iterrows():
                        if row['topic_num'] == topic:
                            count += 1
                    num_articles_per_topic.append(count)
                # keep track of how many articles are given each topic
                topics['num_articles'] = num_articles_per_topic

                # matrices from nmf (A = WH)
                mat_A = tfidf_vectorizer.transform(texts)
                mat_W = nmf.components_
                mat_H = nmf.transform(mat_A)

                # residuals: measurement of how well the topics approximate the data (observed value - predicted value)
                # 0 -> topic perfectly predicts data
                # residual = Frobenius norm tf-idf weights (A) - coefficients of topics (H) X coefficients of topics (W)
                r = np.zeros(mat_A.shape[0])  # num articles
                for row in range(mat_A.shape[0]):
                    r[row] = np.linalg.norm(mat_A[row, :] - mat_H[row, :].dot(mat_W), 'fro')
                sum_sqrt_res = round(sum(np.sqrt(r)), 3)
                squared_residuals.append(sum_sqrt_res)

                # add avg residual column to topics
                complete_df['resid'] = r
                sorted_articles = complete_df.sort_values(by=['topic_num'])
                resid_data = complete_df[['topic_num', 'resid']].groupby('topic_num').mean().sort_values(by='resid')
                complete_topics = topics.merge(resid_data, on='topic_num', how='left')

                # save results
                sorted_articles_dfs.append(sorted_articles)
                complete_topics_dfs.append(complete_topics)
                models.append(nmf)
                count_successes[count_params] = successes
                successes += 1
            except Exception as e:
                # print('test {}, error occurred'.format(count_params))
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
            # print('test {} complete'.format(count_params))
            params[count_params] = (a, b)
            count_params += 1

    # find best params
    params_test = np.arange(36)
    resid_scores = list(zip(params_test, squared_residuals))
    resid_scores = sorted(resid_scores, key=itemgetter(1))
    best_params = resid_scores[0][0]
    print('test #{} had best residual score'.format(best_params))
    print('params: a={}, b={}'.format(params[best_params][0], params[best_params][1]))
    print('residual scores: {}'.format(resid_scores))
    best_articles = sorted_articles_dfs[count_successes[best_params]]
    best_topics = complete_topics_dfs[count_successes[best_params]]

    # call function that uses svc model to predict category based on topic words
    best_topics = predict_category(best_topics)

    # save best topics
    for idx, row in best_topics.iterrows():
        new_words = ''
        topics_itr = row['topics'].split()
        for word in topics_itr:
            new_words += get_unstemmed_word(word)
            new_words += ' '
        best_topics.at[idx, 'topics'] = new_words

    categories = []
    for idx, row in best_articles.iterrows():
        topic_num = row['topic_num']
        topics = best_topics.at[topic_num, 'topics']
        categories.append(best_topics.at[topic_num, 'predicted_category'])
        best_articles.at[idx, 'topics'] = topics
    best_articles['predicted_category'] = categories

    best_articles = best_articles.drop('processed_text', axis=1)
    best_articles = best_articles.drop('Unnamed: 0', axis=1)
    best_articles.to_csv('../output/topic/articles_with_nmf_topics.csv', header=True, index=False)
    best_topics.to_csv('../output/topic/nmf_generated_topics.csv', header=True, index=False)

    # save model
    with open('nmf_model.pickle', 'wb') as output:
        pickle.dump(models[best_params], output)
    with open('nmf_tfidf.pickle', 'wb') as output:
        pickle.dump(tfidf_vectorizer, output)
# import argparse
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.nmf import Nmf
from gensim.models import TfidfModel
from codebase.utils import TweetRawCorpusStream
from codebase.topic_utilities import export_dtm

if __name__ == "__main__":
    corpora_path = "./corpora/"
    model_path = "./models/"
    num_topics = 50
    model_suffix = "-{}topics".format(num_topics)
    modelTag = "Seventh-and-EighthWeek-Tweets-Rolling"
    nmf = Nmf.load("{}{}{}.model".format(model_path, modelTag, model_suffix))
    fileTag_list = ["First-and-SecondWeek-Tweets-Rolling"]
    for fileTag in fileTag_list:
        tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag))
        export_dtm(nmf=nmf,
                   corpus=tfidf_corpus,
                   out_path="{}{}{}-dtm.csv".format(model_path, fileTag, model_suffix),
                   stop_at=None)
common_dictionary = Dictionary(docs)
common_corpus = [common_dictionary.doc2bow(text) for text in docs]

# for k in range(4, 10):
#     nmf = Nmf(common_corpus, num_topics=k)
#     c_model = CoherenceModel(model=nmf, corpus=common_corpus, dictionary=common_dictionary, texts=docs, coherence='c_v')
#     print(k, c_model.get_coherence())
#     x = PrettyTable()
#     x.field_names = [''] + ["t" + str(i + 1) for i in range(0, 10)]
#     for i in range(0, k):
#         x.add_row([i] + [common_dictionary[term] for (term, w) in nmf.get_topic_terms(i)])
#     print(x)

from gensim.matutils import jaccard
import random

nmf = Nmf(common_corpus, num_topics=9)

texts = random.choices(docs, k=20)
texts = [docs[0], docs[20], docs[80], docs[90], docs[200], docs[210]]  # [docs[i] for i in range(0, len(docs), 30)]


def get_most_likely_topic(doc):
    bow = common_dictionary.doc2bow(doc)
    topics, probabilities = zip(*nmf.get_document_topics(bow))
    max_p = max(probabilities)
    topic = topics[probabilities.index(max_p)]
    return topic


colors = ["skyblue", "pink", "red", "green", "yellow", "cyan", "purple", "magenta", "orange", "blue"]


def get_node_color(i):
    return colors[get_most_likely_topic(texts[i])]
    # return 'skyblue' if get_most_likely_topic(texts[i]) == 0 else 'pink'
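# A hedged sketch of how get_node_color() could be used to draw a document
# similarity graph; the networkx usage and the 0.9 jaccard-distance threshold are
# illustrative assumptions, not taken from the original snippet.
import networkx as nx
import matplotlib.pyplot as plt

bows = [common_dictionary.doc2bow(t) for t in texts]
G = nx.Graph()
G.add_nodes_from(range(len(texts)))
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        # connect documents whose bag-of-words vectors are reasonably close
        if jaccard(bows[i], bows[j]) < 0.9:
            G.add_edge(i, j)

nx.draw(G, node_color=[get_node_color(i) for i in range(len(texts))], with_labels=True)
plt.show()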
def compute_coherence_values(self, limit, start=2, step=3, model_type="lda",
                             corpus_type="bow", show_details=False):
    """
    Compute c_v coherence for various numbers of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics
    model_type : lda or mallet or nmf

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    self.get_term_doc_frequency()
    coherence_values = []
    model_list = []
    topics_num_arr = []
    os.environ['MALLET_HOME'] = 'C:\\Mallet\\'  # for windows make sure you put the mallet folder under C and unzip it
    mallet_path = 'C:\\Mallet\\bin\\mallet'
    corpus_to_train = self.corpus
    if corpus_type == 'tfidf':
        print("training on TFIDF")
        tfidf = models.TfidfModel(self.corpus)
        corpus_to_train = tfidf[self.corpus]
    else:
        print("training on BOW")
    # mallet_path = 'C:...mallet_unzipped\\mallet-2.0.8\\bin\\mallet'
    for num_topics in range(start, limit, step):
        if model_type == "lda":
            model = gensim.models.ldamodel.LdaModel(corpus=corpus_to_train,
                                                    id2word=self.id2word,
                                                    num_topics=num_topics,
                                                    random_state=1,
                                                    update_every=1,
                                                    chunksize=10,
                                                    passes=10,
                                                    alpha='auto',
                                                    per_word_topics=True)
        elif model_type == "mallet":
            model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                     corpus=corpus_to_train,
                                                     num_topics=num_topics,
                                                     id2word=self.id2word,
                                                     random_seed=1)
        elif model_type == "nmf":
            model = Nmf(corpus=corpus_to_train,
                        num_topics=num_topics,
                        id2word=self.id2word,
                        random_state=1)
        else:
            print('model {} is not supported. the models are *lda*, *mallet* and *nmf*'.format(model_type))
        model_list.append(model)
        topics_num_arr.append(num_topics)
        coherence_model = CoherenceModel(model=model,
                                         texts=self.data_lemmatized,
                                         dictionary=self.id2word,
                                         coherence='c_v')
        coherence_num = coherence_model.get_coherence()
        coherence_values.append(coherence_num)
        print(num_topics, ': - coherence:', coherence_num)

    optimal_idx = np.argmax(coherence_values)
    self.model = model_list[optimal_idx]
    self.num_topics = topics_num_arr[optimal_idx]
    print("optimal model has a coherence value of ",
          round(coherence_values[optimal_idx], 2),
          ' and # topics: ', (topics_num_arr[optimal_idx]))

    # Visualize
    topic_modeler.show_coherence_vals_graph(coherence_values, limit, start=start, step=step)

    if show_details:
        for m, cv in zip(topics_num_arr, coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

    return model_list, coherence_values
def main(query, output_filename, window=50, topicn=50):
    print('Training nmf model began')
    frame = inspect.currentframe()
    args, _, _, values = inspect.getargvalues(frame)
    query_parameters = [(i, values[i]) for i in args]
    document_collection_original = blacklab.search_blacklab(query, window=window, lemma=True, include_match=False)
    print("Search finished")
    document_collection = [match['complete_match'].strip() for match in document_collection_original[0:100]]
    # Use the phraser model
    phraser_model = Phraser(Phrases.load(constants.OUTPUT_FOLDER + 'phrase_model'))
    document_collection = [' '.join(phraser_model[match['complete_match'].strip().split()]) for match in document_collection_original]
    print("Phraser model done")
    # get rid of stop words
    document_collection_filtered = document_collection
    '''
    for text in document_collection:
        new_text = []
        for word in text.split():
            if (word not in set(stopwords.words('english')) and
                    (word[0] in string.ascii_uppercase + string.ascii_lowercase)):
                new_text.append(word)
        document_collection_filtered.append(' '.join(new_text))
    '''
    print("Filtering done")
    # build the corpus
    preprocessed_corpus = []
    for i, text in enumerate(document_collection_filtered):
        if i == 0:
            print(i)
            text = text.split()
            dct = gensim_utils.initialize_gensim_dictionary([text])
        else:
            print(i)
            text = text.split()
            gensim_utils.add_documents_to_gensim_dictionary(dct, [text])
    # Filter it here
    dct.filter_extremes(no_below=10, no_above=0.95)
    gensim_corpus = [dct.doc2bow(bag_of_word.split()) for bag_of_word in document_collection_filtered]
    # text = document_collection_filtered[0].split()
    nmf = Nmf(gensim_corpus, num_topics=50)
    words = list(dct.token2id.keys())
    topics = nmf.print_topics(50)
    for topic in topics:
        topic_words = topic[1].split('+')
        print_topic = []
        for topic_word in topic_words:
            print_topic.append(words[int(topic_word.split('*')[1][1:].strip()[:-1])])
        print(' '.join(print_topic))
    # get topic of a given document:
    nmf.get_document_topics(gensim_corpus[0])
    # dct.token2id.keys()
    # nmf.show_topic(10)
    # nmf.get_document_topics(dct.doc2bow(preprocessed_corpus[0]))
    pdb.set_trace()
class TopicModel(object):
    def __init__(self):
        self.__corpus = None
        self.__modelName = None
        self.__model = None
        self.__modelFile = 'results/model.bin'
        self.__coherenceModel = None

    def setCorpus(self, corpus):
        self.__corpus = corpus

    def getCoherence(self):
        return self.__coherenceModel.get_coherence()

    def getDocumentTopics(self, document, threshold=None):
        return self.__model.get_document_topics(document, threshold)

    def build(self, model_name, num_topics, chunksize, passes, corpus=None):
        self.__modelName = model_name
        # Update corpus if necessary
        if isinstance(corpus, Corpus):
            self.__corpus = corpus
        # Build topic model
        if model_name == 'lda':
            self.__buildLDA(num_topics, chunksize, passes)
        elif model_name == 'nmf':
            self.__buildNMF(num_topics, chunksize, passes)
        # Build coherence model
        self.__buildCoherenceModel()

    def __buildLDA(self, num_topics, chunksize, passes):
        self.__model = LdaMulticore(self.__corpus,
                                    id2word=self.__corpus.getDictionary(),
                                    num_topics=num_topics,
                                    chunksize=chunksize,
                                    passes=passes,
                                    eval_every=None,
                                    workers=40,
                                    random_state=10)

    def __buildNMF(self, num_topics, chunksize, passes):
        self.__model = Nmf(self.__corpus,
                           id2word=self.__corpus.getDictionary(),
                           num_topics=num_topics,
                           chunksize=chunksize,
                           passes=passes,
                           eval_every=None,
                           random_state=10)

    def __buildCoherenceModel(self):
        self.__coherenceModel = CoherenceModel(model=self.__model,
                                               texts=self.__corpus.getTexts(),
                                               coherence='c_v',
                                               processes=7)

    def __printTopics(self):
        print(' Topics')
        for idx, topic in self.__model.print_topics(-1):
            print(' {}: {}'.format(idx, topic))

    def save(self):
        self.__model.save(self.__modelFile)

    def load(self, model_name):
        self.__modelName = model_name
        if model_name == 'lda':
            self.__model = LdaMulticore.load(self.__modelFile)
        elif model_name == 'nmf':
            self.__model = Nmf.load(self.__modelFile)
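# Hedged usage sketch for TopicModel; the Corpus object (providing getDictionary()
# and getTexts()) comes from elsewhere in this codebase, so its construction below
# is hypothetical.
corpus = Corpus()                 # hypothetical: build the project's Corpus here

tm = TopicModel()
tm.build('nmf', num_topics=20, chunksize=2000, passes=5, corpus=corpus)
print('coherence:', tm.getCoherence())
tm.save()                         # persists to results/model.bin

tm_reloaded = TopicModel()
tm_reloaded.load('nmf')           # reload the saved NMF model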
dest="preFileTag", help='preFileTag used to select previous nmf model') args = parser.parse_args() num_topics = args.num_topics fileTag = args.fileTag preFileTag = args.preFileTag preDictTag = args.preDictTag corpora_path = "./corpora/" model_path = "./models/" model_suffix = "-{}topics".format(num_topics) #### Step 1, Load Corpus #### if (preFileTag == None) and (preDictTag == None): dct = Dictionary.load('{}{}.dict'.format(corpora_path, fileTag)) # for rolling way first built model elif (preFileTag == None) and (preDictTag != None): dct = Dictionary.load('{}{}.dict'.format(corpora_path, preDictTag)) tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag)) #### Step 2, train NMF to extract topic patterns #### if preFileTag == None: nmf = Nmf(tfidf_corpus, id2word=dct, num_topics=num_topics) elif preFileTag != None: nmf = Nmf.load("{}{}{}.model".format(model_path, preFileTag, model_suffix)) nmf.update(tfidf_corpus) #### Step 3, export model #### nmf.save("{}{}{}.model".format(model_path, fileTag, model_suffix))
def train(self,
          data=AbstractModel.ROOT + '/data/test.txt',
          num_topics=20,
          preprocessing=False,
          passes=1,
          kappa=1.0,
          minimum_probability=0.01,
          w_max_iter=200,
          w_stop_condition=0.0001,
          h_max_iter=50,
          h_stop_condition=0.001,
          eval_every=10,
          normalize=True,
          random_state=None):
    """
    Train the model and generate the results on the corpus
    :param data: The training corpus as path or list of strings
    :param int num_topics: The desired number of topics
    :param bool preprocessing: If true, apply preprocessing to the corpus
    :param int passes: Number of full passes over the training corpus. Leave at default passes=1 if your input is an iterator.
    :param float kappa: Gradient descent step size. Larger value makes the model train faster, but could lead to non-convergence if set too large.
    :param float minimum_probability: If normalize is True, topics with smaller probabilities are filtered out. If normalize is False, topics with smaller factors are filtered out. If set to None, a value of 1e-8 is used to prevent 0s.
    :param float w_max_iter: Maximum number of iterations to train W per each batch.
    :param float w_stop_condition: If error difference gets less than that, training of W stops for the current batch.
    :param float h_max_iter: Maximum number of iterations to train h per each batch.
    :param float h_stop_condition: If error difference gets less than that, training of h stops for the current batch.
    :param int eval_every: Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.
    :param bool normalize: Whether to normalize the result. Allows for estimation of perplexity, coherence, e.t.c.
    :param int random_state: Seed for random generator. Needed for reproducibility.
    """
    frequency = defaultdict(int)
    data = input_to_list_string(data, preprocessing)
    for text in data:
        for token in text.split(' '):
            frequency[token] += 1

    if preprocessing:
        data = map(preprocess, data)

    texts = [
        [token for token in text.split(' ') if frequency[token] > 1 and len(token) > 0]
        for text in data
    ]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    nmf_model = Nmf(corpus,
                    id2word=dictionary,
                    num_topics=num_topics,
                    kappa=kappa,
                    minimum_probability=minimum_probability,
                    w_max_iter=w_max_iter,
                    w_stop_condition=w_stop_condition,
                    h_max_iter=h_max_iter,
                    h_stop_condition=h_stop_condition,
                    eval_every=eval_every,
                    normalize=normalize,
                    random_state=random_state)

    self.model = nmf_model
    self.dictionary = dictionary
    self.corpus_predictions = nmf_model[corpus]

    return 'success'
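# Hedged usage sketch for train(); the wrapper class name GensimNMFModel and the
# sample documents are illustrative assumptions; only the train() signature above
# is taken from the snippet.
sample_data = [
    'inflation and interest rates moved higher this month',
    'the league match ended with a late goal',
    'prices and inflation expectations keep rising',
]

model = GensimNMFModel()          # hypothetical subclass exposing train()
print(model.train(data=sample_data, num_topics=2, passes=5, random_state=42))  # 'success'
print(model.model.show_topics(num_topics=2, num_words=5))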
# Create a list of the topic numbers we want to try
topic_nums = list(np.arange(5, 75 + 1, 5))

# Run the NMF model and calculate the coherence score
# for each number of topics
coherence_scores = []
for num in tqdm(topic_nums):
    nmf = Nmf(corpus=corpus,
              num_topics=num,
              id2word=dictionary,
              chunksize=2000,
              passes=5,
              kappa=.1,
              minimum_probability=0.01,
              w_max_iter=300,
              w_stop_condition=0.0001,
              h_max_iter=100,
              h_stop_condition=0.001,
              eval_every=10,
              normalize=True,
              random_state=42)
    time.sleep(0.5)
    # Run the coherence model to get the score
    cm = CoherenceModel(model=nmf,
                        texts=texts,
                        dictionary=dictionary,
                        coherence='c_v')
    coherence_scores.append(round(cm.get_coherence(), 5))
# TODO Several useful descriptive insight methods in this article
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#15visualizethetopicskeywords
# https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/#14.-pyLDAVis

if not os.path.exists("img"):
    os.mkdir("img")

#####################################################################################
# Wordcloud by topic
#####################################################################################

# Load model(s)
# lda = LdaModel.load("models/lda")
dct = gensim.utils.SaveLoad.load("models/dct")
corpus = gensim.corpora.MmCorpus("models/corpus")
nmf = Nmf.load("models/nmf")

# Generate word cloud for final model
labels = {
    1: "Economic activity",
    2: "Policy action",
    3: "Economic outlook",
    4: "Employment",
    5: "Financial Markets",
    6: "Inflation"
}

for topic in range(0, NUM_TOPICS):
    termsnmf = nmf.show_topic(topic, topn=50)
    # Model returns list of tuples, wordcloud wants a dictionary instead
    wordcloudnmf = WordCloud(background_color="white").generate_from_frequencies(dict(termsnmf))
# Imports this function relies on
import operator
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from gensim import corpora
from gensim.models.nmf import Nmf
from gensim.models import CoherenceModel


def nmf_coherence_scores(text_l, min_df, max_df):
    '''
    Build Gensim NMF model, calculate coherence scores for various numbers of topics,
    and plot coherence scores against number of topics
    '''
    texts = [word_tokenize(text) for text in text_l]

    # Create a dictionary
    dictionary = corpora.Dictionary(texts)

    # Filter out extremes to limit the number of features
    dictionary.filter_extremes(no_below=min_df, no_above=max_df)

    # Create the bag-of-words format (list of (token_id, token_count))
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Create a list of the topic numbers we want to try
    topic_nums = list(np.arange(5, 75 + 1, 5))

    # Run NMF model and calculate coherence score for each number of topics
    coherence_scores = []
    for num in topic_nums:
        nmf = Nmf(corpus=corpus,
                  num_topics=num,
                  id2word=dictionary,
                  chunksize=2000,
                  passes=5,
                  kappa=.1,
                  minimum_probability=0.01,
                  w_max_iter=300,
                  w_stop_condition=0.0001,
                  h_max_iter=100,
                  h_stop_condition=0.001,
                  eval_every=10,
                  normalize=True,
                  random_state=42)
        cm = CoherenceModel(model=nmf,
                            texts=texts,
                            dictionary=dictionary,
                            coherence='c_v')
        coherence_scores.append(round(cm.get_coherence(), 5))

    # Get the number of topics with the highest coherence score
    scores = list(zip(topic_nums, coherence_scores))
    best_num_topics = sorted(scores, key=operator.itemgetter(1), reverse=True)[0][0]
    print(scores)
    print(best_num_topics)

    # Plot coherence scores
    plt.figure(figsize=(8, 5))
    plt.plot(topic_nums, coherence_scores, color='r', linewidth=2)
    plt.title('NMF Model Optimization: Coherence Scores', fontsize=16)
    plt.xlabel('Number of topics', fontsize=14)
    plt.ylabel('Coherence score', fontsize=14)
    plt.xticks(np.arange(5, 80, 5), fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()
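# Illustrative call using a public dataset; the 20 Newsgroups corpus here is an
# assumption for demonstration, not the data used by the original author.
from sklearn.datasets import fetch_20newsgroups

text_l = fetch_20newsgroups(subset='train',
                            remove=('headers', 'footers', 'quotes')).data[:500]
nmf_coherence_scores(text_l, min_df=10, max_df=0.60)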