Пример #1
0
def topic_purity_maximizer(tfidf, tfidf_vectorizer):
	'''Fit NMF for every topic count in 40..59 and rank the counts.

	For each topic count an NMF model is fitted with fit_nmf, the W matrix is
	obtained from get_H_W, and the mean of get_metric(W, k) is recorded.

	Returns a list of (mean metric, number of topics) tuples sorted in
	ascending order of the mean metric.
	'''
	scored = []
	for n_topics in xrange(40, 60):
		nmf, nmf_topic_dict = fit_nmf(tfidf, n_topics, tfidf_vectorizer)
		# only W is used below; H and the third return value are ignored
		H, W, mean_topic_con_nmf = get_H_W(nmf, tfidf)
		purity = get_metric(W, n_topics)
		scored.append((purity.mean(), n_topics))
	# order by the metric only, leaving ties in their insertion order
	return sorted(scored, key=lambda pair: pair[0])
Пример #2
0
def frob_norm(k):
	'''Return the frobenius norm for a given number of topics'''

	#load tfidf and vectorizer
	tfidf = joblib.load('bills_tfidf_sparse.pkl')
	tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

	#run nmf
	nmf, nmf_topic_dict = fit_nmf(tfidf, k, tfidf_vectorizer)
	print nmf.reconstruction_err_, k
	return (nmf.reconstruction_err_, k)
Пример #3
0
def run_cos_H(k):
	'''Returns the mean cosine similarity between topics of the H matrix'''
	
	tfidf = joblib.load('bills_tfidf_sparse.pkl')
	tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
	nmf, nmf_topic_dict = fit_nmf(tfidf, k, tfidf_vectorizer)
	H = nmf.components_
	d = pairwise_distances(H, metric='cosine')
	idx = np.tril_indices(d.shape[0], k=-1)
	print d[idx].mean(), k
	return (d[idx].mean(), k)
Пример #4
0
def run_topic_coherence(tfidf, reverse_lookup):
	'''Compute the average topic coherence for a range of topic counts.

	For k = 10, 15, ..., 40 an NMF model with k topics is fitted, every
	topic's word string is split on whitespace and scored with
	topic_coherence, and the scores are averaged.

	tfidf          -- document-term matrix passed to fit_nmf / topic_coherence
	reverse_lookup -- mapping passed through to topic_coherence
	                  (presumably word -> column index; verify against caller)

	Returns a list of (mean coherence, k) tuples in increasing order of k.

	NOTE: relies on a module-level `tfidf_vectorizer` being defined, since it
	is not a parameter of this function.
	'''
	average_coherence_k = []
	for k in xrange(10,41,5):
		print ('running for {} topics...'.format(k))
		score_list = []
		# fit with the current k; this was hard-coded to 10, which made every
		# iteration evaluate the same model
		nmf, nmf_topic_dict = fit_nmf(tfidf, k, tfidf_vectorizer)
		for topic_words in nmf_topic_dict.values():
			val = topic_coherence(tfidf, reverse_lookup, topic_words.split())
			score_list.append(val)
			print (val, topic_words)
		# score_list is a plain list and has no .mean() method
		average_coherence_k.append((np.mean(score_list), k))
	return average_coherence_k
Пример #5
0
def run_cos_W(k):
	'''Returns the average intertopic cosine similarity and the corresponding k value'''

	#load tfidf and vectorizer
	tfidf = joblib.load('bills_tfidf_sparse.pkl')
	tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

	#run nmf
	nmf, nmf_topic_dict = fit_nmf(tfidf, k, tfidf_vectorizer)

	#get W matrix
	W = nmf.transform(tfidf) 

	#get the mean cosine similarities per topic
	d = get_cos_dist_W(W, k, tfidf)
	print d.mean(), k
	return (d.mean(), k) 
Пример #6
0
	plt.legend()
	plt.show()

if __name__ == '__main__':
    # Exploratory driver: loads the pickled matrices/vectorizers, fits NMF and
    # runs the topic-count diagnostics. The whole body uses 4-space indentation;
    # it previously mixed spaces and tabs, which is an indentation error.
    tfidf = joblib.load('bills_tfidf_sparse_full.pkl')
    tfidf_vectorizer = joblib.load('tfidf_vectorizer_full.pkl')
    tf = joblib.load('bills_tf_sparse.pkl')
    tf_vectorizer = joblib.load('tf_vectorizer.pkl')
    n_topics = 300
    ntopic_pur_list = topic_purity_maximizer(tfidf, tfidf_vectorizer)

    # Cached artefacts from an earlier 300-topic run.
    W = joblib.load('W_300_full.pkl')
    nmf = joblib.load('nmf_300_full.pkl')
    nmf_topic_dict = joblib.load('nmf_topic_dict_300_full.pkl')
    # NOTE(review): this refit replaces the nmf / nmf_topic_dict loaded just
    # above -- keep whichever of the two is actually wanted.
    nmf, nmf_topic_dict = fit_nmf(tfidf, n_topics, tfidf_vectorizer)
    H = nmf.components_
    #lda, lda_topic_dict = fit_lda(tf)

    # get_H_W is unpacked as a 3-tuple elsewhere in this file; keep only the
    # third element here instead of binding the whole tuple to this name.
    _, _, mean_topic_con_nmf = get_H_W(nmf, tfidf)

    # map each vocabulary word to its index in the vectorizer's feature list
    reverse_lookup = {word: idx for idx, word in enumerate(np.array(tfidf_vectorizer.get_feature_names()))}
    average_coherence_k = run_topic_coherence(tfidf, reverse_lookup)
    avg_d_k_H_525 = parallel_run_cos_H(range(205,525,5))
    avg_d_k_W_525 = parallel_run_cos_W(range(355,525,5))
    frob_norm_list = parallel_frob_norm(range(30,206,5))