def topic_purity_maximizer(tfidf, tfidf_vectorizer, k_values=None):
    '''Rank candidate topic counts by their mean purity metric.

    For each k in k_values an NMF model is fit with fit_nmf, the W matrix is
    taken from get_H_W(nmf, tfidf), and get_metric(W, k).mean() is recorded.

    Parameters
    ----------
    tfidf : matrix passed through to fit_nmf and get_H_W.
    tfidf_vectorizer : vectorizer passed through to fit_nmf.
    k_values : optional iterable of topic counts to evaluate. When omitted
        the original sweep of 40..59 (inclusive) is used.

    Returns
    -------
    list of (mean_purity, k) tuples sorted by ascending mean_purity.
    An empty k_values yields an empty list.
    '''
    if k_values is None:
        # range() instead of xrange() so the default works on Python 2 and 3.
        k_values = range(40, 60)

    ntopic_pur_list = []
    for k in k_values:
        # fit_nmf also returns a topic dictionary, which is not needed here.
        nmf, _ = fit_nmf(tfidf, k, tfidf_vectorizer)
        # get_H_W yields three values; only the middle one (W) is used.
        _, W, _ = get_H_W(nmf, tfidf)
        purity_metric_l = get_metric(W, k)
        ntopic_pur_list.append((purity_metric_l.mean(), k))

    ntopic_pur_list.sort(key=lambda pair: pair[0])
    return ntopic_pur_list
def frob_norm(k):
    '''Fit NMF with k topics and report the model's reconstruction error.

    The tf-idf matrix and its vectorizer are read from their pickle files on
    every call, the model is fit through fit_nmf, and the value of
    nmf.reconstruction_err_ (described by the original author as the
    Frobenius norm) is printed next to k.

    Returns the tuple (reconstruction_err_, k).
    '''
    # inputs come from disk rather than from arguments
    matrix = joblib.load('bills_tfidf_sparse.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')

    # the topic dictionary returned by fit_nmf is not used here
    model, _ = fit_nmf(matrix, k, vectorizer)

    err = model.reconstruction_err_
    # same "<err> <k>" text as a two-item print statement
    print('%s %s' % (err, k))
    return (err, k)
def run_cos_H(k):
    '''Return the mean pairwise cosine distance between the rows of H.

    The tf-idf matrix and vectorizer are loaded from their pickle files, an
    NMF model with k topics is fit through fit_nmf, and H is taken from
    nmf.components_.

    Note that pairwise_distances(..., metric='cosine') produces cosine
    *distances*, not similarities, so the value reported here is a distance.
    NOTE(review): assumes pairwise_distances is scikit-learn's, where cosine
    distance is 1 - cosine similarity -- confirm against the file's imports.

    Prints "<mean_distance> <k>" and returns (mean_distance, k).
    '''
    tfidf = joblib.load('bills_tfidf_sparse.pkl')
    tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

    # the topic dictionary returned by fit_nmf is not needed here
    nmf, _ = fit_nmf(tfidf, k, tfidf_vectorizer)
    H = nmf.components_

    d = pairwise_distances(H, metric='cosine')
    # Strictly-lower triangle: every unordered pair of rows exactly once,
    # leaving out the diagonal (each row against itself).
    idx = np.tril_indices(d.shape[0], k=-1)

    # Compute the mean once and reuse it for both the log line and the result.
    mean_dist = d[idx].mean()
    # Formatted so the output is the same on Python 2 and Python 3.
    print('%s %s' % (mean_dist, k))
    return (mean_dist, k)
def run_topic_coherence(tfidf, reverse_lookup, vectorizer=None, k_values=None):
    '''Compute the average topic coherence for several topic counts.

    For each k an NMF model with k topics is fit through fit_nmf; every
    topic's space-separated word string is scored with topic_coherence and
    the scores are averaged.

    Parameters
    ----------
    tfidf : matrix passed to fit_nmf and topic_coherence.
    reverse_lookup : mapping passed to topic_coherence.
    vectorizer : optional vectorizer passed to fit_nmf. When omitted, the
        module-level name tfidf_vectorizer is used, which is what the
        original implementation read implicitly.
    k_values : optional iterable of topic counts; defaults to 10, 15, ..., 40.

    Returns
    -------
    list of (average_coherence, k) tuples in the order the k values were
    evaluated.
    '''
    if vectorizer is None:
        # Backward-compatible fallback to the global the original relied on.
        vectorizer = tfidf_vectorizer
    if k_values is None:
        k_values = range(10, 41, 5)

    average_coherence_k = []
    for k in k_values:
        print('running for {} topics...'.format(k))
        # Fit with k topics; the original always fit 10 regardless of k, so
        # every entry in the result described the same model size.
        nmf, nmf_topic_dict = fit_nmf(tfidf, k, vectorizer)

        score_list = []
        for topic_words in nmf_topic_dict.values():
            val = topic_coherence(tfidf, reverse_lookup, topic_words.split())
            score_list.append(val)
            # Printed as one tuple, matching the Python 2 output of the
            # original `print (val, topic_words)`.
            print((val, topic_words))

        # score_list is a plain list, which has no .mean() method.
        average_coherence_k.append((np.mean(score_list), k))
    return average_coherence_k
def run_cos_W(k):
    '''Fit NMF with k topics and summarise get_cos_dist_W over its W matrix.

    The tf-idf matrix and vectorizer are read from their pickle files, the
    model is fit through fit_nmf, W is obtained with nmf.transform(tfidf),
    and the mean of get_cos_dist_W(W, k, tfidf) is printed next to k.

    NOTE(review): the original docstring calls the value the average
    inter-topic cosine similarity, while the helper's name suggests a
    distance -- confirm which one get_cos_dist_W actually returns.

    Returns the tuple (mean_value, k).
    '''
    # inputs come from disk rather than from arguments
    matrix = joblib.load('bills_tfidf_sparse.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')

    # the topic dictionary returned by fit_nmf is not used here
    model, _ = fit_nmf(matrix, k, vectorizer)

    # W matrix of the fitted model
    doc_topic = model.transform(matrix)

    per_topic = get_cos_dist_W(doc_topic, k, matrix)
    mean_value = per_topic.mean()

    # same "<mean> <k>" text as a two-item print statement
    print('%s %s' % (mean_value, k))
    return (mean_value, k)
# NOTE(review): the next two calls are the tail of a definition whose
# beginning lies outside the visible source; they are carried through as-is.
plt.legend()
plt.show()


if __name__ == '__main__':
    # Script entry point: load the pickled inputs, then run the topic-count
    # experiments below in sequence.

    # tf-idf matrix and vectorizer from the "_full" pickles.
    tfidf = joblib.load('bills_tfidf_sparse_full.pkl')
    tfidf_vectorizer = joblib.load('tfidf_vectorizer_full.pkl')
    # tf matrix and vectorizer; not referenced again in this block (the
    # fit_lda call further down is commented out).
    tf = joblib.load('bills_tf_sparse.pkl')
    tf_vectorizer = joblib.load('tf_vectorizer.pkl')
    n_topics = 300
    # Purity sweep over candidate topic counts.
    ntopic_pur_list = topic_purity_maximizer(tfidf, tfidf_vectorizer)
    #tf = joblib.load('bills_tf_sparse.pkl')
    #tf_vectorizer = joblib.load('tf_vectorizer.pkl')
    # Previously saved 300-topic artefacts.
    W = joblib.load('W_300_full.pkl')
    nmf = joblib.load('nmf_300_full.pkl')
    nmf_topic_dict = joblib.load('nmf_topic_dict_300_full.pkl')
    # NOTE(review): this refit rebinds nmf and nmf_topic_dict, so the two
    # pickle loads just above have no lasting effect, while W still comes
    # from the pickle -- confirm which source is intended.
    nmf, nmf_topic_dict = fit_nmf(tfidf, n_topics, tfidf_vectorizer)
    H = nmf.components_
    #lda, lda_topic_dict = fit_lda(tf)
    # NOTE(review): topic_purity_maximizer unpacks three values from
    # get_H_W; here the whole return value is bound to a single name --
    # confirm that is intended.
    mean_topic_con_nmf = get_H_W(nmf, tfidf)
    # Map each feature name of the vectorizer to its index.
    reverse_lookup = {word: idx for idx, word in enumerate(np.array(tfidf_vectorizer.get_feature_names()))}
    average_coherence_k = run_topic_coherence(tfidf, reverse_lookup)
    # Sweeps over topic counts in steps of 5: 205..520, 355..520 and 30..205.
    avg_d_k_H_525 = parallel_run_cos_H(range(205,525,5))
    avg_d_k_W_525 = parallel_run_cos_W(range(355,525,5))
    frob_norm_list = parallel_frob_norm(range(30,206,5))