def test_cond_prob_normalization_2gram_no_addone(self):
    """Check P(token | 'el') against hand-computed back-off values.

    Builds a 2-gram ``BackOffNGram`` over ``self.sents`` with ``beta=0.5``
    and ``addone=False``, verifies ``alpha(('el',))``, compares
    ``cond_prob`` for every listed token with its expected value, and
    finally checks that the expected values add up to one.
    """
    model = BackOffNGram(2, self.sents, beta=0.5, addone=False)

    expected_alpha = 1.0 - (1.0 - 0.5) / 1.0
    norm = model.denom(('el',))
    self.assertAlmostEqual(model.alpha(('el',)), expected_alpha)

    # 'gato' is the only token given the discounted bigram value directly.
    expected = {('gato', 'el'): (1.0 - 0.5) / 1.0}

    # back-off to the unigrams: (token, numerator) pairs.  The 12.0 below is
    # presumably the total unigram count (these numerators plus one for
    # 'gato' add up to 12) -- confirm against the fixture.
    backoff_counts = (
        ('el', 1.0),
        ('come', 2.0),
        ('pescado', 1.0),
        ('.', 2.0),
        ('</s>', 2.0),
        ('la', 1.0),
        ('gata', 1.0),
        ('salmón', 1.0),
    )
    for word, count in backoff_counts:
        expected[(word, 'el')] = expected_alpha * count / (12.0 * norm)

    for key, prob in expected.items():
        token, prev = key
        self.assertAlmostEqual(
            model.cond_prob(token, (prev,)), prob, msg=key)

    # the sum is one:
    total = sum(expected.values())
    self.assertAlmostEqual(total, 1.0)
def test_cond_prob_normalization_2gram_no_addone(self):
    """Check P(token | 'el') against hand-computed back-off values.

    Builds a 2-gram ``BackOffNGram`` over ``self.sents`` with ``beta=0.5``
    and ``addone=False``, verifies ``alpha(('el',))``, compares
    ``cond_prob`` for every listed token with its expected value, and
    checks that the expected values add up to one.

    All comparisons are between computed floats, so they use
    ``assertAlmostEqual`` rather than exact equality: the model may
    evaluate the same formula in a different operation order and differ
    in the last bits.

    NOTE(review): this source contains another method with the same name;
    if both live in the same class, the later definition shadows the
    earlier one -- confirm and rename or remove one of them.
    """
    model = BackOffNGram(2, self.sents, beta=0.5, addone=False)

    alpha = 1.0 - (1.0 - 0.5) / 1.0
    denom = model.denom(('el',))
    self.assertAlmostEqual(model.alpha(('el',)), alpha)

    probs = {
        ('gato', 'el'): (1.0 - 0.5) / 1.0,
        # back-off to the unigrams (12.0 is presumably the total unigram
        # count; the numerators here plus one for 'gato' add up to 12):
        ('el', 'el'): alpha * 1.0 / (12.0 * denom),
        ('come', 'el'): alpha * 2.0 / (12.0 * denom),
        ('pescado', 'el'): alpha * 1.0 / (12.0 * denom),
        ('.', 'el'): alpha * 2.0 / (12.0 * denom),
        ('</s>', 'el'): alpha * 2.0 / (12.0 * denom),
        ('la', 'el'): alpha * 1.0 / (12.0 * denom),
        ('gata', 'el'): alpha * 1.0 / (12.0 * denom),
        ('salmón', 'el'): alpha * 1.0 / (12.0 * denom),
    }
    for (token, prev), p in probs.items():
        # The previous tokens are passed as a list, as in the original call.
        self.assertAlmostEqual(
            model.cond_prob(token, [prev]), p, msg=(token, prev))

    # the sum is one (same 1e-10 tolerance as before, but a failure now
    # reports both values and the difference):
    prob_sum = sum(probs.values())
    self.assertAlmostEqual(prob_sum, 1.0, delta=1e-10, msg=prob_sum)
def test_init_2gram(self):
    """Check ``A``, ``alpha`` and ``denom`` of a 2-gram back-off model.

    Builds ``BackOffNGram(2, self.sents, beta=0.5)`` and, for every
    one-token context in the table below, verifies:

    * ``model.A(context)`` equals the expected follower set,
    * ``model.alpha(context)`` equals ``len(followers) * 0.5 / divisor``,
    * ``model.denom(context)`` equals one minus the sum of
      ``model.cond_prob(w)`` over the followers ``w``.
    """
    model = BackOffNGram(2, self.sents, beta=0.5)

    # context -> (divisor of the expected alpha, followers).  The divisor
    # is presumably the number of occurrences of the context in the
    # fixture -- confirm against self.sents.
    table = {
        ('<s>',): (2, ('el', 'la')),
        ('el',): (1, ('gato',)),
        ('gato',): (1, ('come',)),
        ('come',): (2, ('pescado', 'salmón')),
        ('pescado',): (1, ('.',)),
        ('.',): (2, ('</s>',)),
        ('la',): (1, ('gata',)),
        ('gata',): (1, ('come',)),
        ('salmón',): (1, ('.',)),
    }

    for context, (_, followers) in table.items():
        self.assertEqual(model.A(context), set(followers), context)

    # missing probability mass
    for context, (divisor, followers) in table.items():
        expected_alpha = len(followers) * 0.5 / divisor
        self.assertAlmostEqual(
            model.alpha(context), expected_alpha, msg=context)

    # normalization factor
    def remaining_mass(words):
        # Subtract left to right so the float arithmetic matches
        # ``1.0 - p(w1) - p(w2)``.
        rest = 1.0
        for word in words:
            rest -= model.cond_prob(word)
        return rest

    expected_denom = {
        context: remaining_mass(followers)
        for context, (_, followers) in table.items()
    }
    for context, value in expected_denom.items():
        self.assertAlmostEqual(model.denom(context), value, msg=context)
def test_init_2gram(self):
    """Check ``A``, ``alpha`` and ``denom`` of a 2-gram back-off model.

    Builds ``BackOffNGram(2, self.sents, beta=0.5)`` and, for every
    one-token context, compares ``model.A``, ``model.alpha`` and
    ``model.denom`` with hand-written expectations.

    The set comparison is exact.  ``alpha`` and ``denom`` are floats
    (``denom`` is derived from ``model.cond_prob`` results), so they are
    compared with ``assertAlmostEqual`` instead of exact equality to avoid
    failures caused only by rounding in the last bits.

    NOTE(review): this source contains another method with the same name;
    if both live in the same class, the later definition shadows the
    earlier one -- confirm and rename or remove one of them.
    """
    model = BackOffNGram(2, self.sents, beta=0.5)

    A = {
        ('<s>',): {'el', 'la'},
        ('el',): {'gato'},
        ('gato',): {'come'},
        ('come',): {'pescado', 'salmón'},
        ('pescado',): {'.'},
        ('.',): {'</s>'},
        ('la',): {'gata'},
        ('gata',): {'come'},
        ('salmón',): {'.'},
    }
    for tokens, Aset in A.items():
        self.assertEqual(model.A(tokens), Aset, msg=tokens)

    # missing probability mass: each entry reads
    # <size of the A set> * beta / <divisor>; the divisor is presumably
    # the context's count in the fixture -- confirm against self.sents.
    alpha = {
        ('<s>',): 2 * 0.5 / 2,
        ('el',): 1 * 0.5 / 1,
        ('gato',): 1 * 0.5 / 1,
        ('come',): 2 * 0.5 / 2,
        ('pescado',): 1 * 0.5 / 1,
        ('.',): 1 * 0.5 / 2,
        ('la',): 1 * 0.5 / 1,
        ('gata',): 1 * 0.5 / 1,
        ('salmón',): 1 * 0.5 / 1,
    }
    for tokens, a in alpha.items():
        self.assertAlmostEqual(model.alpha(tokens), a, msg=tokens)

    # normalization factor: one minus cond_prob (called without previous
    # tokens) of every word in the context's A set.
    denom = {
        ('<s>',): 1.0 - model.cond_prob('el') - model.cond_prob('la'),
        ('el',): 1.0 - model.cond_prob('gato'),
        ('gato',): 1.0 - model.cond_prob('come'),
        ('come',): (1.0 - model.cond_prob('pescado')
                    - model.cond_prob('salmón')),
        ('pescado',): 1.0 - model.cond_prob('.'),
        ('.',): 1.0 - model.cond_prob('</s>'),
        ('la',): 1.0 - model.cond_prob('gata'),
        ('gata',): 1.0 - model.cond_prob('come'),
        ('salmón',): 1.0 - model.cond_prob('.'),
    }
    for tokens, d in denom.items():
        self.assertAlmostEqual(model.denom(tokens), d, msg=tokens)