예제 #1
0
    def test(self):
        """Check vocabulary handling of the train and dev datasets.

        The dev dataset is constructed with the train dataset as its second
        argument; the assertions expect both to expose equal ``pos`` and
        ``words`` attributes.
        """
        train = UDDataSet('data/en-ud-train.conllu')
        dev = UDDataSet('data/en-ud-dev.conllu', train)

        # A token the test treats as out-of-vocabulary must map to UNK.
        unknown_idx = train.lookup_word('UNKDSF')
        self.assertEqual(UNK, unknown_idx)

        self.assertEqual(train.pos, dev.pos)
        self.assertEqual(train.words, dev.words)

        # Index this test expects for the "ADP" tag in the training file.
        adp_idx = train.lookup_pos("ADP")
        self.assertEqual(6, adp_idx)
예제 #2
0
    def test_init(self):
        """Check the state indices and emission entries of a new HMM."""
        model = HMM(UDDataSet('data/en-ud-train.conllu'))

        self.assertEqual(17, model.num_state)
        self.assertEqual(17, model.bos_idx)
        self.assertEqual(18, model.eos_idx)

        # Every state is expected to hold this same value under the key
        # (state, -1) of the emission table.
        # NOTE(review): -1 presumably stands for the unknown word - confirm.
        expected = -12.228919653600784
        for state in range(model.num_state):
            actual = model.emission_counter[(state, -1)]
            self.assertAlmostEqual(expected, actual)
예제 #3
0
    def test_cond_prop(self):
        """Check that exponentiated ``cond_prob`` values sum to 1.

        Verified once with raw integer arguments and once with indices
        looked up from the dataset attached to the model.
        """
        model = HMM(UDDataSet("data/en-ud-train.conllu"))

        def total_mass(log_values):
            # Exponentiate each entry and add them up with the builtin sum.
            return sum([np.exp(value) for value in log_values])

        first = model.cond_prob(4, 3, 7)
        self.assertAlmostEqual(1.00, total_mass(first))

        dataset = model.dataset
        word_idx = dataset.lookup_word("often")
        adj_idx = dataset.lookup_pos("ADJ")
        propn_idx = dataset.lookup_pos("PROPN")
        second = model.cond_prob(word_idx, adj_idx, propn_idx)
        self.assertAlmostEqual(1.00, total_mass(second))
예제 #4
0
    def test_normalize(self):
        """Check that ``normalize`` turns counts into log relative frequencies.

        ``normalize`` is called for its in-place effect on ``counter``: the
        counts 1, 2, 3, 4 (total 10) are expected to become the natural logs
        of 0.1, 0.2, 0.3 and 0.4.
        """
        hmm = HMM(UDDataSet("data/en-ud-train.conllu"))
        counter = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
        hmm.normalize(counter)

        expected = {'a': 0.1, 'b': 0.2, 'c': 0.3, 'd': 0.4}
        for key, prob in expected.items():
            # These are floating-point results, so compare approximately:
            # exact equality would pin the test to one particular way of
            # computing the logarithm (log(c / n) versus log(c) - log(n)).
            self.assertAlmostEqual(np.log(prob), counter[key], msg=key)
예제 #5
0
    def test_sample(self):
        """Sample tags for one dev sentence and print their POS names.

        NOTE(review): this test makes no assertions; it only exercises
        ``Gibbs.sample`` and prints the result.
        """
        train = UDDataSet("data/en-ud-train.conllu")
        dev = UDDataSet("data/en-ud-dev.conllu", train)

        sampler = Gibbs(HMM(train))

        sentence = dev.sentences()[50]
        tag_indices = sampler.sample(sentence, 10)
        tag_names = [train.idx2pos(idx) for idx in tag_indices]
        print(tag_names)
예제 #6
0
 def test_find_word(self):
     """Print the word stored at index 121 and the index of the "PART" tag.

     NOTE(review): print-only helper; nothing is asserted.
     """
     dataset = UDDataSet('data/en-ud-train.conllu')
     word = dataset.idx2word(121)
     print(word)
     part_idx = dataset.lookup_pos("PART")
     print(part_idx)
예제 #7
0
import numpy as np

from dataset import UDDataSet
from gibbs import Gibbs
from hmm import HMM

# Load the training corpus, then the dev corpus with the training dataset
# passed as the second constructor argument.
train_ds = UDDataSet('data/en-ud-train.conllu')
dev_ds = UDDataSet('data/en-ud-dev.conllu', train_ds)

# HMM built from the training dataset, wrapped by the Gibbs sampler that
# predict() draws tag sequences from.
hmm = HMM(train_ds)
gibbs = Gibbs(hmm)


def predict(iteration):
    """Return the tag accuracy of the Gibbs sampler over the dev sentences.

    Each sentence from ``dev_ds.sentences()`` is passed to
    ``gibbs.sample(sentence, iteration)``; the result is compared
    element-wise with the second field of every entry in ``sentence.words``.

    Args:
        iteration: Forwarded unchanged to ``gibbs.sample``.

    Returns:
        The number of matching tags divided by the summed sentence lengths.
    """
    total = 0
    correct = 0

    for sentence in dev_ds.sentences():
        sampled = gibbs.sample(sentence, iteration)
        gold = np.array([entry[1] for entry in sentence.words])
        total += len(sentence)
        correct += (sampled == gold).sum()

    return correct / total


# Values passed to predict(); one "count & accuracy" table row is printed
# for each of them.
iteration_counts = [1, 2, 5, 10, 100, 500, 1000, 2000]

for count in iteration_counts:
    accuracy = predict(count)
    print("%d & %.4f\\\\\\hline" % (count, accuracy))
예제 #8
0
    def test_error_cond_prop(self):
        """Check that ``cond_prob(121, 4, 16)`` exponentiates to a sum of 1."""
        model = HMM(UDDataSet("data/en-ud-train.conllu"))

        log_values = model.cond_prob(121, 4, 16)
        total = sum([np.exp(value) for value in log_values])
        self.assertAlmostEqual(1.00, total)
예제 #9
0
from dataset import UDDataSet
from hmm import HMM

def _top_entries(counter, first_idx, limit):
    """Return the ``limit`` highest-valued ``(key[1], value)`` pairs.

    Only entries of ``counter`` whose key has ``first_idx`` in position 0
    are considered; the pairs are ordered by value, largest first.
    """
    pairs = [(key[1], counter[key]) for key in counter if key[0] == first_idx]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:limit]


train_ds = UDDataSet('data/en-ud-train.conllu')
# NOTE(review): dev_ds is loaded here but never used in this script.
dev_ds = UDDataSet('data/en-ud-dev.conllu')

hmm = HMM(train_ds)

# Ten highest-valued emission entries for the ADJ tag, one table row each.
adj_idx = train_ds.pos2idx('ADJ')
adj_words = _top_entries(hmm.emission_counter, adj_idx, 10)
adj_rows = (
    "%s & %.6f \\\\\\hline" % (train_ds.idx2word(word_idx), value)
    for word_idx, value in adj_words
)
print("\n".join(adj_rows))

# Five highest-valued transition entries whose key starts with PROPN.
propn_idx = train_ds.pos2idx('PROPN')
transit = _top_entries(hmm.transition_counter, propn_idx, 5)
transit_rows = (
    "%s & %.6f\\\\\\hline" % (train_ds.idx2pos(pos_idx), value)
    for pos_idx, value in transit
)

print("\n".join(transit_rows))