# Example no. 1
0
def get_top_words(wc, train_sample_size, top_num=10):
	"""Return, per decade, the words whose likelihood is most decade-specific.

	For every non-stopword in ``wc`` and every decade the word appears in,
	the score is ``p / p_min`` where ``p`` is
	``P2.p_x_given_y(wc, word, decade, train_sample_size)`` and ``p_min`` is
	the smallest such value over the *other* decades of that word, capped
	at 1 (so a word seen in a single decade is scored as ``p / 1``).

	Args:
		wc: Nested mapping indexed as ``wc[word][decade]``
			(presumably word counts per decade -- confirm against caller).
		train_sample_size: Passed through unchanged to ``P2.p_x_given_y``.
		top_num: Maximum number of words kept per decade.

	Returns:
		dict mapping each decade to a dict ``{word: ratio}`` holding at most
		``top_num`` entries with the highest ratios seen. When ``top_num``
		is not positive the per-decade dicts are empty.
	"""
	# Build the stopword lookup once: calling stopwords.words() per word
	# re-creates the list and does a linear scan on every iteration.
	stop_words = set(stopwords.words())

	top_words = {}
	for word in wc:
		if word in stop_words:
			continue

		# One probability per decade, reused below for both p and p_min.
		# NOTE(review): assumes P2.p_x_given_y is a pure function -- confirm.
		probs = {}
		for decade in wc[word]:
			probs[decade] = P2.p_x_given_y(wc, word, decade, train_sample_size)

		for decade, p in probs.items():
			# Smallest probability among the other decades, never above 1.
			p_min = 1
			for other, q in probs.items():
				if other != decade:
					p_min = min(q, p_min)

			# NOTE(review): raises ZeroDivisionError if the helper can return 0
			# for a decade present in wc[word] -- confirm it is smoothed.
			ratio = p / p_min

			best = top_words.setdefault(decade, {})
			if len(best) < top_num:
				# Always store the ratio (the first entry used to store the
				# raw count, which made later comparisons meaningless).
				best[word] = ratio
			elif best:
				# Table is full: evict the weakest entry if this one beats it.
				weakest = min(best, key=best.get)
				if ratio > best[weakest]:
					del best[weakest]
					best[word] = ratio
	return top_words