def test(self):
    """Check that a dev dataset built from the train dataset shares its tables.

    The dev set is constructed with the train set as its second argument;
    the test then expects both to expose equal ``pos`` and ``words``
    attributes, an out-of-vocabulary token to resolve to ``UNK``, and the
    tag ``"ADP"`` to have index 6.
    """
    train_path = 'data/en-ud-train.conllu'
    dev_path = 'data/en-ud-dev.conllu'
    train_ds = UDDataSet(train_path)
    test_ds = UDDataSet(dev_path, train_ds)

    # A made-up token must fall back to the UNK entry.
    unknown_idx = train_ds.lookup_word('UNKDSF')
    self.assertEqual(UNK, unknown_idx)

    # Both datasets must agree on the tag and word tables.
    self.assertEqual(train_ds.pos, test_ds.pos)
    self.assertEqual(train_ds.words, test_ds.words)

    adp_idx = train_ds.lookup_pos("ADP")
    self.assertEqual(6, adp_idx)
def test_init(self):
    """Verify the state bookkeeping of an HMM built from the train set.

    Expects 17 states, ``bos_idx`` 17, ``eos_idx`` 18, and the same
    emission value stored under ``(state, -1)`` for every state.
    """
    model = HMM(UDDataSet('data/en-ud-train.conllu'))

    self.assertEqual(17, model.num_state)
    self.assertEqual(17, model.bos_idx)
    self.assertEqual(18, model.eos_idx)

    # NOTE(review): index -1 is presumably the unknown-word slot -- confirm
    # against the HMM implementation.
    expected_value = -12.228919653600784
    for key in ((state, -1) for state in range(model.num_state)):
        self.assertAlmostEqual(expected_value, model.emission_counter[key])
def test_cond_prop(self):
    """Check that the exponentials of ``HMM.cond_prob`` values sum to one.

    Exercised once with raw indices and once with indices looked up from
    the dataset (the word "often" and the tags "ADJ" and "PROPN").
    """
    hmm = HMM(UDDataSet("data/en-ud-train.conllu"))

    def total_mass(log_values):
        # Exponentiate each entry and add them up in iteration order.
        return sum([np.exp(value) for value in log_values])

    # Case 1: hard-coded indices.
    self.assertAlmostEqual(1.00, total_mass(hmm.cond_prob(4, 3, 7)))

    # Case 2: indices resolved through the dataset's lookup helpers.
    dataset = hmm.dataset
    often = dataset.lookup_word("often")
    adj = dataset.lookup_pos("ADJ")
    propn = dataset.lookup_pos("PROPN")
    self.assertAlmostEqual(1.00, total_mass(hmm.cond_prob(often, adj, propn)))
def test_normalize(self):
    """Check that ``HMM.normalize`` leaves log-probabilities in the dict.

    Counts 1, 2, 3, 4 (total 10) are passed in; after the call the test
    expects the same dict to hold ``log(0.1)`` .. ``log(0.4)``.
    """
    hmm = HMM(UDDataSet("data/en-ud-train.conllu"))
    counter = {'a': 1, 'b': 2, 'c': 3, 'd': 4}

    hmm.normalize(counter)

    expected = {
        'a': np.log(0.1),
        'b': np.log(0.2),
        'c': np.log(0.3),
        'd': np.log(0.4),
    }
    # Compare approximately: the exact bits of a log-probability depend on
    # how normalize computes it (e.g. log(c / total) vs log(c) - log(total)),
    # so strict float equality would make this test needlessly brittle.
    for key, want in expected.items():
        self.assertAlmostEqual(want, counter[key])
def test_sample(self):
    """Smoke-test Gibbs sampling on one dev sentence and print its tags.

    Samples the dev sentence at position 50 with a second argument of 10
    and prints the result mapped through ``train_ds.idx2pos``. Nothing is
    asserted; the test only fails if an exception is raised.
    """
    train_ds = UDDataSet("data/en-ud-train.conllu")
    dev_ds = UDDataSet("data/en-ud-dev.conllu", train_ds)

    sampler = Gibbs(HMM(train_ds))
    sentence = dev_ds.sentences()[50]
    tag_indices = sampler.sample(sentence, 10)

    # Turn the sampled indices into tag names for readable output.
    tag_names = list(map(train_ds.idx2pos, tag_indices))
    print(tag_names)
def test_find_word(self):
    """Print the word stored at index 121 and the index of the tag "PART".

    Debugging aid: nothing is asserted.
    """
    train_ds = UDDataSet('data/en-ud-train.conllu')

    word = train_ds.idx2word(121)
    print(word)

    part_idx = train_ds.lookup_pos("PART")
    print(part_idx)
import numpy as np

from dataset import UDDataSet
from gibbs import Gibbs
from hmm import HMM

# Module-level state shared by predict(): the dev set is built with the
# train set as its second argument, and the sampler wraps an HMM built from
# the train set.
train_ds = UDDataSet('data/en-ud-train.conllu')
dev_ds = UDDataSet('data/en-ud-dev.conllu', train_ds)
hmm = HMM(train_ds)
gibbs = Gibbs(hmm)


def predict(iteration):
    """Return the fraction of dev-set tags the Gibbs sampler gets right.

    For every dev sentence the sampler is called with ``iteration`` as its
    second argument (presumably the number of sampling iterations -- confirm
    against ``Gibbs.sample``). The result is compared element-wise with the
    second field of each entry in ``sentence.words``, which is treated as
    the reference tag.

    Raises ZeroDivisionError if the dev set yields no tokens.
    """
    correct = 0
    total = 0
    for sentence in dev_ds.sentences():
        sampled = gibbs.sample(sentence, iteration)
        gold = np.array([entry[1] for entry in sentence.words])
        total += len(sentence)
        # Element-wise comparison; summing the boolean array counts matches.
        correct += (sampled == gold).sum()
    return correct / total


# Emit one LaTeX table row per iteration budget.
k = [1, 2, 5, 10, 100, 500, 1000, 2000]
for ite in k:
    accuracy = predict(ite)
    print("%d & %.4f\\\\\\hline" % (ite, accuracy))
def test_error_cond_prop(self):
    """Check that ``cond_prob(121, 4, 16)`` still exponentiates to a sum of 1.

    NOTE(review): judging by the test name this input presumably once
    triggered a failure -- confirm with the history of ``HMM.cond_prob``.
    """
    model = HMM(UDDataSet("data/en-ud-train.conllu"))
    log_probs = model.cond_prob(121, 4, 16)

    mass = sum([np.exp(value) for value in log_probs])
    self.assertAlmostEqual(1.00, mass)
from dataset import UDDataSet
from hmm import HMM

train_ds = UDDataSet('data/en-ud-train.conllu')
# NOTE(review): dev_ds is loaded without the train set and is not referenced
# again in this script -- confirm whether it is still needed.
dev_ds = UDDataSet('data/en-ud-dev.conllu')
hmm = HMM(train_ds)

# Ten highest-valued emission entries whose first key component is ADJ,
# printed as LaTeX table rows (word, value).
adj_idx = train_ds.pos2idx('ADJ')
adj_words = [
    (key[1], hmm.emission_counter[key])
    for key in hmm.emission_counter
    if key[0] == adj_idx
]
adj_words.sort(key=lambda pair: pair[1], reverse=True)
adj_words = adj_words[:10]

adj_rows = [
    "%s & %.6f \\\\\\hline" % (train_ds.idx2word(w[0]), w[1])
    for w in adj_words
]
print("\n".join(adj_rows))

# Five highest-valued transition entries whose first key component is PROPN,
# printed as LaTeX table rows (tag, value).
propn_idx = train_ds.pos2idx('PROPN')
transit = [
    (key[1], hmm.transition_counter[key])
    for key in hmm.transition_counter
    if key[0] == propn_idx
]
transit.sort(key=lambda pair: pair[1], reverse=True)
transit = transit[:5]

transit_rows = [
    "%s & %.6f\\\\\\hline" % (train_ds.idx2pos(t[0]), t[1])
    for t in transit
]
print("\n".join(transit_rows))