def gen():
    """Yield ``(lang, k, deptype, mean, count)`` rows for a skip-PMI sweep.

    For every ``k`` in ``range(max_k)`` this calls
    ``skip_pmi(c, sentences, k, by_deptype=True)`` and keeps the first
    element of the returned pair, which is iterated as a mapping from
    dependency type to a collection of PMI values.  One row is yielded per
    dependency type, carrying ``mean(pmis)`` and ``count(pmis)``.

    ``max_k`` and ``c`` are not parameters of this function; they are
    resolved from the surrounding scope.
    """
    # NOTE: the sweep is currently restricted to the English corpus.  To
    # cover every language, iterate over corpora.ud_corpora.items() instead.
    lang = 'en'
    corpus = corpora.ud_corpora[lang]

    # Materialised once because the same sentences are reused for every k.
    sentences = list(corpus.sentences(fix_content_head=False))

    for k in range(max_k):
        pmis_by_deptype, _ = skip_pmi(c, sentences, k, by_deptype=True)
        for deptype, pmis in pmis_by_deptype.items():
            yield lang, k, deptype, mean(pmis), count(pmis)
def hdmi_sweep():
    """Collect per-language statistics into a DataFrame, one row per language.

    For each ``(lang, corpus)`` in ``corpora.ud_corpora`` the row holds:

    * ``hd_mi`` / ``gd_mi`` / ``ss_mi``: results of ``hdmi``, ``gdmi`` and
      ``ssmi`` called with ``cond.get_pos`` (presumably mutual-information
      estimates; confirm against those helpers).
    * ``hd_n`` / ``gd_n`` / ``ss_n``: ``count(...)`` over ``hd``, ``gd`` and
      ``ss`` called with ``cond.nothing``.
    * ``a_<i>_mi`` / ``a_<i>_n`` for ``i`` in ``0..24``: the same pair of
      measurements from ``adjacents_mi`` / ``adjacents`` with ``i`` passed
      as the third argument.

    Returns:
        A ``pandas.DataFrame`` indexed by language, with an extra ``lang``
        column that duplicates the index.
    """
    stats_by_lang = {}
    for lang, corpus in corpora.ud_corpora.items():
        # Materialised once: every measure below traverses the sentences.
        sentences = list(corpus.sentences(fix_content_head=False))

        # Keys are inserted in a fixed order so the resulting columns keep
        # a stable layout.
        row = {}
        row['hd_mi'] = hdmi(cond.get_pos, sentences)
        row['hd_n'] = count(hd(cond.nothing, sentences))
        row['gd_mi'] = gdmi(cond.get_pos, sentences)
        row['gd_n'] = count(gd(cond.nothing, sentences))
        row['ss_mi'] = ssmi(cond.get_pos, sentences)
        row['ss_n'] = count(ss(cond.nothing, sentences))
        for idx in range(25):
            row['a_%d_mi' % idx] = adjacents_mi(cond.get_pos, sentences, idx)
            row['a_%d_n' % idx] = count(adjacents(cond.nothing, sentences, idx))

        stats_by_lang[lang] = row

    # The outer dict maps lang -> row, so the raw frame has one column per
    # language; transpose to get one row per language.
    df = pd.DataFrame(stats_by_lang).T
    df['lang'] = df.index
    return df
def entropy(self, choices):
    """Return ``log(count(choices))``.

    This is the entropy of a uniform distribution over ``count(choices)``
    outcomes.  The logarithm base is whatever the ``log`` in scope uses,
    and ``count`` is a helper defined elsewhere (presumably the number of
    items in ``choices``; confirm against its definition).
    """
    n_choices = count(choices)
    return log(n_choices)