def test_unknown(self):
    hmm = MLHMM(2, self.tagged_sents)

    known = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón'}
    for w in known:
        self.assertFalse(hmm.unknown(w))

    unknown = {'perro', 'salame'}
    for w in unknown:
        self.assertTrue(hmm.unknown(w))
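# For reference, a minimal sketch of the check that test_unknown() exercises,
# assuming the model keeps the set of words seen in training. The name
# `vocab` is illustrative, not necessarily MLHMM's actual attribute.
def sketch_unknown(vocab, w):
    """Return True iff w was never seen during training."""
    return w not in vocab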
def test_tag_prob_2gram(self):
    hmm = MLHMM(2, self.tagged_sents, addone=False)

    y = 'D N V N P'.split()
    p = hmm.tag_prob(y)
    # V after N and P after N have prob 0.5; the rest is 1.0.
    tag_prob = 0.5 * 0.5
    self.assertAlmostEqual(p, tag_prob)

    lp = hmm.tag_log_prob(y)
    self.assertAlmostEqual(lp, log2(tag_prob))
def test_tag_prob_1gram(self):
    hmm = MLHMM(1, self.tagged_sents, addone=False)

    y = 'D N V N P'.split()
    p = hmm.tag_prob(y)
    # D, V, P and </s> have prob 2.0 / 12.0; N has prob 4.0 / 12.0.
    tag_prob = (2.0 / 12.0) ** 4 * (4.0 / 12.0) ** 2
    self.assertAlmostEqual(p, tag_prob)

    lp = hmm.tag_log_prob(y)
    self.assertAlmostEqual(lp, log2(tag_prob))
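# For reference, a hedged sketch of how tag_prob can be computed from
# trans_prob for an n-gram HMM: pad the sequence with <s>/</s> markers and
# multiply the conditional probabilities. This reproduces the expected
# values in the two tests above, but is not necessarily MLHMM's exact code.
def sketch_tag_prob(trans_prob, y, n):
    """Probability of tag sequence y under an n-gram HMM."""
    tags = ['<s>'] * (n - 1) + list(y) + ['</s>']
    p = 1.0
    for i in range(n - 1, len(tags)):
        prev = tuple(tags[i - n + 1:i])  # the n - 1 previous tags
        p *= trans_prob(tags[i], prev)
    return p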
def test_tcount_1gram(self):
    hmm = MLHMM(1, self.tagged_sents)

    tcount = {
        (): 12,
        ('D',): 2,
        ('N',): 4,
        ('V',): 2,
        ('P',): 2,
        ('</s>',): 2,
    }
    for gram, c in tcount.items():
        self.assertEqual(hmm.tcount(gram), c, gram)
def test_trans_prob_2gram(self):
    hmm = MLHMM(2, self.tagged_sents, addone=False)

    probs = {
        ('D', ('<s>',)): 1.0,
        ('N', ('D',)): 1.0,
        ('V', ('N',)): 0.5,
        ('N', ('V',)): 1.0,
        ('P', ('N',)): 0.5,
        ('</s>', ('P',)): 1.0,
    }
    for params, p in probs.items():
        self.assertAlmostEqual(hmm.trans_prob(*params), p, msg=params)
def test_prob_2gram(self):
    hmm = MLHMM(2, self.tagged_sents, addone=False)

    x = 'el gato come pescado .'.split()
    y = 'D N V N P'.split()
    p = hmm.prob(x, y)
    # V after N and P after N have prob 0.5; the rest is 1.0.
    tag_prob = 0.5 * 0.5
    # Probs for el/D gato/N come/V pescado/N ./P
    # (el is one of two D tokens; each noun is one of four N tokens).
    out_prob = 0.5 * 0.25 * 1.0 * 0.25 * 1.0
    self.assertAlmostEqual(p, tag_prob * out_prob)

    lp = hmm.log_prob(x, y)
    self.assertAlmostEqual(lp, log2(tag_prob) + log2(out_prob))
def test_prob_1gram(self):
    hmm = MLHMM(1, self.tagged_sents, addone=False)

    x = 'el gato come pescado .'.split()
    y = 'D N V N P'.split()
    p = hmm.prob(x, y)
    # D, V, P and </s> have prob 2.0 / 12.0; N has prob 4.0 / 12.0.
    tag_prob = (2.0 / 12.0) ** 4 * (4.0 / 12.0) ** 2
    # Probs for el/D gato/N come/V pescado/N ./P
    # (el is one of two D tokens; each noun is one of four N tokens).
    out_prob = 0.5 * 0.25 * 1.0 * 0.25 * 1.0
    self.assertAlmostEqual(p, tag_prob * out_prob)

    lp = hmm.log_prob(x, y)
    self.assertAlmostEqual(lp, log2(tag_prob) + log2(out_prob))
def test_viterbi_tagger(self):
    hmm = MLHMM(2, self.tagged_sents, addone=False)
    # XXX: or directly test hmm.tag?
    tagger = ViterbiTagger(hmm)

    y = tagger.tag('el gato come pescado .'.split())

    pi = {
        0: {
            ('<s>',): (0.0, []),
        },
        1: {
            # 0.5 for el/D
            ('D',): (log2(0.5), ['D']),
        },
        2: {
            # 0.25 for gato/N
            ('N',): (log2(0.5 * 0.25), ['D', 'N']),
        },
        3: {
            # 0.5 for V after N
            ('V',): (log2(0.5 * 0.25 * 0.5), ['D', 'N', 'V']),
        },
        4: {
            # 0.25 for pescado/N
            ('N',): (log2(0.5 * 0.25 * 0.5 * 0.25), ['D', 'N', 'V', 'N']),
        },
        5: {
            # 0.5 for P after N
            ('P',): (log2(0.5 * 0.25 * 0.5 * 0.25 * 0.5),
                     ['D', 'N', 'V', 'N', 'P']),
        },
    }
    self.assertEqualPi(tagger._pi, pi)

    self.assertEqual(y, 'D N V N P'.split())
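# For reference, a hedged sketch of the dynamic program behind the _pi table
# checked above: pi[k][prev] holds the best log2-probability of tagging the
# first k words ending with tag history prev, plus its backpointer list.
# This is illustrative, not ViterbiTagger's actual code; tagset() and
# out_prob() are assumed accessor names on the model.
def sketch_viterbi_pi(hmm, sent, n):
    from math import log2
    pi = {0: {('<s>',) * (n - 1): (0.0, [])}}
    for k, w in enumerate(sent, start=1):
        pi[k] = {}
        for prev, (lp, tags) in pi[k - 1].items():
            for t in hmm.tagset():
                tp = hmm.trans_prob(t, prev)
                op = hmm.out_prob(w, t)
                if tp <= 0.0 or op <= 0.0:
                    continue  # prune zero-probability extensions
                new_prev = (prev + (t,))[1:]  # keep the last n - 1 tags
                new_lp = lp + log2(tp) + log2(op)
                if new_prev not in pi[k] or new_lp > pi[k][new_prev][0]:
                    pi[k][new_prev] = (new_lp, tags + [t])
    return pi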
files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files)
sents = list(corpus.tagged_sents())

# model type
m = str(opts['-m'])

# train the model
filename = opts['-o']
if m == 'base':
    print('Baseline Model selected')
    model = BaselineTagger(tagged_sents=sents)
elif m == 'mlhmm':
    n = int(opts['-n'])
    print('Maximum Likelihood Hidden Markov Model selected, n=%d' % n)
    model = MLHMM(n=n, tagged_sents=sents, addone=True)
elif m == 'memm':
    n = int(opts['-n'])
    c = str(opts['-c'])
    if c not in ['logreg', 'nb', 'svc']:
        print('Bad classifier type, use --help option for help')
        exit()
    print('Maximum Entropy Markov Model selected, n=%d, c=%s' % (n, c))
    model = MEMM(n=n, tagged_sents=sents, classifier=c)
else:
    print('Bad model type, use --help option for help')
    exit()

# save it
with open(filename, 'wb') as f:
    pickle.dump(model, f)
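# Hedged sketch of reading the model back, the inverse of the dump above.
# load_model is an illustrative helper, not part of the original script.
def load_model(filename):
    """Load a tagger pickled by this training script."""
    with open(filename, 'rb') as f:
        return pickle.load(f)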
def MLHMM_trainer(tagged_sents):
    # n is a free variable here: it must be bound in the enclosing scope
    # before the trainer is called.
    return MLHMM(n, tagged_sents)
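# Hedged usage sketch (names are illustrative): with n bound, the trainer
# acts as a one-argument model factory for an evaluation loop, e.g.:
#
#     n = 2
#     hmm = MLHMM_trainer(train_sents)
#     tagger = ViterbiTagger(hmm)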