def search_models(tfidf_corpus, min_topics, max_topics, threshold=.333):
    """
    Fits one NMF model per topic count and records quality metrics for each.

    Displays a progress bar. A KeyboardInterrupt stops the search early; the
    metrics of every fully evaluated model are still returned.

    :param tfidf_corpus: The TF-IDF matrix to factorize. It is indexed with a
        boolean row mask and densified through ``.A`` (presumably a scipy
        sparse matrix -- confirm against the caller).
    :param min_topics: Smallest number of topics to try (inclusive).
    :param max_topics: Largest number of topics to try (inclusive).
    :param threshold: Minimum column-normalized weight in W for a row to be
        counted as belonging to a topic.
    :return: A tuple of seven lists of equal length, one entry per evaluated
        model: (costs, H_similarities, W_similarities, tfidf_similarities,
        max_strength, min_strength, avg_strength)
    """
    g.debug("Building NMF topics...")

    costs = []
    H_similarities = []
    W_similarities = []
    tfidf_similarities = []
    max_strength = []
    min_strength = []
    avg_strength = []
    metrics = (costs, H_similarities, W_similarities, tfidf_similarities,
               max_strength, min_strength, avg_strength)

    n_models = max_topics - min_topics + 1
    g.progress_bar(0, n_models)

    try:
        for n_topics in range(min_topics, max_topics + 1):
            nmf, W, H = nmf_model(tfidf_corpus, n_topics,
                                  max_iter=666, no_output=True)

            cost = nmf.reconstruction_err_**2
            H_similarity = 1 - pairwise_distances(
                H, metric="cosine", n_jobs=-1).mean()
            W_similarity = 1 - pairwise_distances(
                W, metric="cosine", n_jobs=-1).mean()

            # Scale every topic column so that its strongest row is 1.
            W_normalized = W / W.max(axis=0)
            per_topic = []
            for topic_i in range(n_topics):
                members = W_normalized[:, topic_i] > threshold
                if not members.any():
                    # No row passes the threshold for this topic.
                    continue
                per_topic.append(
                    pairwise_distances(tfidf_corpus[members].A,
                                       metric="cosine", n_jobs=-1).mean())
            # NOTE(review): unlike the H/W metrics this is not subtracted
            # from 1, so it is a mean cosine *distance* -- confirm intended.
            tfidf_similarity = np.mean(per_topic)

            # All non-zero weights of W, gathered in one indexing operation.
            values = np.asarray(W[W.nonzero()]).ravel()

            costs.append(cost)
            H_similarities.append(H_similarity)
            W_similarities.append(W_similarity)
            tfidf_similarities.append(tfidf_similarity)
            max_strength.append(values.max())
            min_strength.append(values.min())
            avg_strength.append(values.mean())

            g.progress_bar(n_topics - min_topics + 1, n_models,
                           text=f"{nmf.n_iter_} iterations")
    except KeyboardInterrupt:
        # The interrupt can land between two appends, so trim every list to
        # the number of models for which *all* metrics were recorded.
        completed = min(len(metric) for metric in metrics)
        for metric in metrics:
            del metric[completed:]

    return (costs, H_similarities, W_similarities, tfidf_similarities,
            max_strength, min_strength, avg_strength)
def _save_word_cloud(weights, ranked_word_indices, word_list, path):
    """Renders the non-zero ``weights`` as a word cloud and saves it to ``path``."""
    wc = WordCloud(background_color="black", max_words=333,
                   width=1000, height=500)
    wc.fit_words({
        word_list[word_i]: weights[word_i]
        for word_i in ranked_word_indices
        if weights[word_i]
    })
    wc.to_file(path)


def build_word_clouds(corpus_tfidf, corpus_topics, H, word_list, table_name):
    """
    Generates an NMF and a TF-IDF word cloud image for every topic.

    Displays a progress bar. Images are written to
    ``../output/<table_name>/nmf/``. The TF-IDF cloud is skipped for topics
    whose TF-IDF weights sum to zero.

    :param corpus_tfidf: Passed to ``get_tfidf_topic_weights``.
    :param corpus_topics: Passed to ``get_tfidf_topic_weights``.
    :param H: Topic-by-word weight matrix; ``H.shape[0]`` is the topic count.
    :param word_list: Maps a word index (column of H) to the word itself.
    :param table_name: Name of the output sub-directory.
    """
    g.debug("Generating topic word clouds...")
    n_topics = H.shape[0]
    g.progress_bar(0, n_topics)

    topic_tfidf_weights = get_tfidf_topic_weights(corpus_tfidf, corpus_topics,
                                                  n_topics)
    # Word indices of every topic, ordered from heaviest to lightest.
    topic_top_tfidf_words_i = np.argsort(topic_tfidf_weights, axis=1)[:, ::-1]
    topic_top_nmf_words_i = np.argsort(H, axis=1)[:, ::-1]

    for topic_i in range(n_topics):
        # nmf wordcloud
        _save_word_cloud(
            H[topic_i], topic_top_nmf_words_i[topic_i], word_list,
            f"../output/{table_name}/nmf/{topic_i}_nmf_wordcloud.png")

        # tf-idf wordcloud (nothing to draw for an empty topic)
        if topic_tfidf_weights[topic_i].sum():
            _save_word_cloud(
                topic_tfidf_weights[topic_i],
                topic_top_tfidf_words_i[topic_i], word_list,
                f"../output/{table_name}/nmf/{topic_i}_tfidf_wordcloud.png")

        # Advance for every topic, including empty ones, so that the bar
        # always finishes at n_topics.
        g.progress_bar(topic_i + 1, n_topics)

    g.debug(f" -> {n_topics} word clouds generated!", 1)
def cache_wordclouds(corpus, vocabulary, H, W):
    """
    Writes an NMF and a TF-IDF word cloud image for every topic.

    Displays a progress bar that advances once per written image. Images go
    to ``output/wordclouds/`` and are named after the zero-padded topic index.
    A topic whose TF-IDF weights sum to zero gets a placeholder cloud.

    :param corpus: Object whose ``tfidf_corpus`` attribute is passed to
        ``get_tfidf_topic_weights``.
    :param vocabulary: Passed to ``build_word_cloud`` together with the
        weights of a topic.
    :param H: Per-topic weights; ``H.shape[0]`` is the number of topics.
    :param W: Passed to ``get_tfidf_topic_weights``.
    """
    n_topics = H.shape[0]
    g.debug(f"Caching word clouds for {n_topics} topics...")
    tfidf_weights = get_tfidf_topic_weights(corpus.tfidf_corpus, W)

    # Two images are produced per topic.
    n_images = n_topics * 2
    g.progress_bar(0, n_images)

    for topic_i in range(n_topics):
        label = str(topic_i).rjust(3, '0')

        # nmf wordcloud
        nmf_cloud = build_word_cloud(H[topic_i], vocabulary)
        nmf_cloud.to_file(f"output/wordclouds/{label}_nmf.png")
        g.progress_bar(2 * topic_i + 1, n_images)

        # tfidf wordcloud
        if not topic_tfidf_is_empty(tfidf_weights, topic_i):
            tfidf_cloud = build_word_cloud(tfidf_weights[topic_i], vocabulary)
        else:
            # an empty topic...
            tfidf_cloud = build_word_cloud([1], ["This topic was empty"])
        tfidf_cloud.to_file(f"output/wordclouds/{label}_tfidf.png")
        g.progress_bar(2 * topic_i + 2, n_images)

    g.debug(" -> Done", 1)


def topic_tfidf_is_empty(tfidf_weights, topic_i):
    """Tells whether the TF-IDF weights of topic ``topic_i`` sum to zero."""
    return not tfidf_weights[topic_i].sum()
def sumarize_corpus(corpus, vectorizer, n_sentences=10):
    """
    Builds a summary for every document of a corpus.
    Reports its progress on a progress bar.

    :param corpus: The documents to summarize; must support ``len()``.
    :param vectorizer: The TF-IDF vectorizer handed to ``summarize_doc``.
    :param n_sentences: Number of sentences requested for each summary.
    :return: A list holding one summary per document, in corpus order
    """
    g.debug("Summarizing documents...")

    total = len(corpus)
    summaries = []
    for done, document in enumerate(corpus, start=1):
        summaries.append(summarize_doc(document, vectorizer, n_sentences))
        g.progress_bar(done, total, 1)

    g.debug(f" -> {len(summaries)} documents summarized!", 1)
    return summaries