def get_top_words(wc, train_sample_size, top_num=10):
    """Pick, for every decade, the words most distinctive of that decade.

    A word's score for a decade is ``p / p_min`` where ``p`` is
    ``P2.p_x_given_y(wc, word, decade, train_sample_size)`` and ``p_min`` is
    the smallest such value over the *other* decades the word is listed
    under, capped at 1. A word listed under a single decade therefore
    scores ``p``.

    Args:
        wc: Mapping ``word -> {decade: value}``. Only its keys are read
            here; the values are consumed by ``P2.p_x_given_y``
            (presumably word counts per decade -- confirm against P2).
        train_sample_size: Passed through unchanged to
            ``P2.p_x_given_y``.
        top_num: Maximum number of words kept per decade. With
            ``top_num <= 0`` every decade maps to an empty dict.

    Returns:
        Dict ``decade -> {word: score}`` holding at most ``top_num`` of
        the highest-scoring non-stop-words seen for each decade.

    Raises:
        ZeroDivisionError: If ``P2.p_x_given_y`` returns 0 for one of the
            other decades of a word.
    """
    # Build the stop-word lookup once; calling stopwords.words() per word
    # re-reads the list and does a linear scan every time.
    stop_words = set(stopwords.words())

    top_words = {}
    for word, per_decade in wc.items():
        if word in stop_words:
            continue

        # One probability per decade, reused for both numerator and
        # denominator below.
        # NOTE(review): assumes P2.p_x_given_y is deterministic for fixed
        # arguments -- confirm.
        probs = {
            decade: P2.p_x_given_y(wc, word, decade, train_sample_size)
            for decade in per_decade
        }

        for decade, p in probs.items():
            rivals = [q for other, q in probs.items() if other != decade]
            # The trailing 1 caps the denominator and covers the case of a
            # word that occurs in a single decade only.
            p_min = min(rivals + [1])
            ratio = p / p_min

            bucket = top_words.setdefault(decade, {})
            if len(bucket) < top_num:
                # Always store the ratio (never the raw wc value) so all
                # entries of a bucket are comparable.
                bucket[word] = ratio
            elif bucket:
                # Bucket is full: replace the weakest entry if beaten.
                weakest = min(bucket, key=bucket.get)
                if ratio > bucket[weakest]:
                    del bucket[weakest]
                    bucket[word] = ratio
    return top_words