def test_cond_prob_normalization_2gram_no_addone(self):
        """Check bigram back-off probabilities given 'el' against hand values.

        Builds the expected conditional probability of every listed token
        after 'el' (beta=0.5, no add-one smoothing), compares each one with
        the model's answer, and finally checks that the expected values add
        up to one.
        """
        context = ('el', )
        model = BackOffNGram(2, self.sents, beta=0.5, addone=False)

        # discounted probability of the only bigram listed as seen after 'el'
        seen_prob = (1.0 - 0.5) / 1.0
        # mass that remains to be distributed through back-off
        expected_alpha = 1.0 - seen_prob
        norm = model.denom(context)
        self.assertAlmostEqual(model.alpha(context), expected_alpha)

        # back-off to the unigrams: (token, numerator of its unigram estimate)
        # 12.0 below is the denominator of those unigram estimates.
        backed_off = [
            ('el', 1.0),
            ('come', 2.0),
            ('pescado', 1.0),
            ('.', 2.0),
            ('</s>', 2.0),
            ('la', 1.0),
            ('gata', 1.0),
            ('salmón', 1.0),
        ]
        expected = {('gato', 'el'): seen_prob}
        for word, numerator in backed_off:
            expected[(word, 'el')] = (
                expected_alpha * numerator / (12.0 * norm))

        for pair, want in expected.items():
            word, prev = pair
            got = model.cond_prob(word, (prev, ))
            self.assertAlmostEqual(got, want, msg=pair)

        # the sum is one:
        total = sum(expected.values())
        self.assertAlmostEqual(total, 1.0)
    # Example #2
    # 0
    def test_cond_prob_normalization_2gram_no_addone(self):
        """Check bigram back-off probabilities after 'el' and their sum.

        Every probability is a float produced by division and multiplication,
        so values are compared with a tolerance (assertAlmostEqual) rather
        than with exact equality, which would make the test depend on the
        order in which the model performs its arithmetic.
        """
        model = BackOffNGram(2, self.sents, beta=0.5, addone=False)

        # mass left for back-off once the seen bigram has been discounted
        alpha = 1.0 - (1.0 - 0.5) / 1.0
        denom = model.denom(('el',))
        self.assertAlmostEqual(model.alpha(('el',)), alpha)

        probs = {
            ('gato', 'el'): (1.0 - 0.5) / 1.0,
            # back-off to the unigrams:
            ('el', 'el'): alpha * 1.0 / (12.0 * denom),
            ('come', 'el'): alpha * 2.0 / (12.0 * denom),
            ('pescado', 'el'): alpha * 1.0 / (12.0 * denom),
            ('.', 'el'): alpha * 2.0 / (12.0 * denom),
            ('</s>', 'el'): alpha * 2.0 / (12.0 * denom),
            ('la', 'el'): alpha * 1.0 / (12.0 * denom),
            ('gata', 'el'): alpha * 1.0 / (12.0 * denom),
            ('salmón', 'el'): alpha * 1.0 / (12.0 * denom),
        }
        for (token, prev), p in probs.items():
            # the previous tokens are passed as a list, as this test always did
            self.assertAlmostEqual(model.cond_prob(token, [prev]), p,
                                   msg=(token, prev))

        # the sum is one (same 1e-10 tolerance as the former manual check):
        prob_sum = sum(probs.values())
        self.assertAlmostEqual(prob_sum, 1.0, delta=1e-10, msg=prob_sum)
    # Example #3
    # 0
    def test_cond_prob_2gram_no_discount_no_addone(self):
        """Check bigram probabilities after 'come' with beta=0.0, no add-one.

        Probabilities are floats, so they are compared with a tolerance
        instead of exact equality. The failure message names both the token
        and the conditioning word so a failing case can be identified.
        """
        model = BackOffNGram(2, self.sents, beta=0.0, addone=False)

        probs = {
            ('pescado', 'come'): 1.0 / 2.0,
            ('salmón', 'come'): 1.0 / 2.0,
            ('salame', 'come'): 0.0,  # back-off to the unigram that is 0.0
        }
        for (token, prev), p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token, [prev]), p,
                                   msg=(token, prev))
    # Example #4
    # 0
    def test_cond_prob_2gram_no_discount_no_addone(self):
        """Check bigram probabilities after 'come' with beta=0.0, no add-one."""
        model = BackOffNGram(2, self.sents, beta=0.0, addone=False)
        prev_tokens = ('come',)

        # (token, expected probability of the token given 'come')
        cases = [
            ('pescado', 1.0 / 2.0),
            ('salmón', 1.0 / 2.0),
            # back-off to the unigram that is 0.0
            ('salame', 0.0),
        ]
        for word, want in cases:
            got = model.cond_prob(word, prev_tokens)
            self.assertAlmostEqual(got, want, msg=word)
    def test_cond_prob_1gram_no_addone(self):
        """Check unigram probabilities of a 1-gram model without add-one.

        The expected values are plain relative frequencies over 12.0, i.e.
        the model behaves just like an unsmoothed n-gram here.
        """
        model = BackOffNGram(1, self.sents, beta=0.5, addone=False)

        denominator = 12.0
        cases = [(word, numerator / denominator)
                 for word, numerator in (('pescado', 1), ('come', 2))]
        cases.append(('salame', 0.0))

        for word, want in cases:
            got = model.cond_prob(word)
            self.assertAlmostEqual(got, want, msg=word)
    # Example #6
    # 0
    def test_cond_prob_1gram_no_addone(self):
        """Check unigram probabilities of a 1-gram model without add-one.

        Expected values such as 1 / 12.0 are not exactly representable as
        floats, so they are compared with a tolerance instead of exact
        equality; the token is reported when an assertion fails.
        """
        model = BackOffNGram(1, self.sents, beta=0.5, addone=False)

        # behaves just like unsmoothed n-gram
        probs = {
            'pescado': 1 / 12.0,
            'come': 2 / 12.0,
            'salame': 0.0,
        }
        for token, p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token), p, msg=token)
    def test_init_2gram(self):
        """Check A(), alpha() and denom() of a bigram model with beta=0.5.

        One table drives the three groups of checks. For each one-token
        context it lists the tokens expected in A(context), in the order
        their unigram probabilities are subtracted for the normalization
        factor, and the divisor used in the expected alpha value.
        """
        model = BackOffNGram(2, self.sents, beta=0.5)

        # (context, expected members of A(context), divisor for alpha)
        # NOTE(review): the divisor is presumably the count of the context
        # in the fixture sentences - confirm against self.sents.
        table = [
            (('<s>', ), ('el', 'la'), 2),
            (('el', ), ('gato', ), 1),
            (('gato', ), ('come', ), 1),
            (('come', ), ('pescado', 'salmón'), 2),
            (('pescado', ), ('.', ), 1),
            (('.', ), ('</s>', ), 2),
            (('la', ), ('gata', ), 1),
            (('gata', ), ('come', ), 1),
            (('salmón', ), ('.', ), 1),
        ]

        for context, followers, _ in table:
            self.assertEqual(model.A(context), set(followers), context)

        # missing probability mass
        for context, followers, divisor in table:
            want = len(followers) * 0.5 / divisor
            self.assertAlmostEqual(model.alpha(context), want, msg=context)

        # normalization factor: one minus the unigram probability of every
        # token in A(context); all expectations are computed before any
        # denom() call is made.
        expected_denoms = []
        for context, followers, _ in table:
            remaining = 1.0
            for word in followers:
                remaining = remaining - model.cond_prob(word)
            expected_denoms.append((context, remaining))

        for context, want in expected_denoms:
            self.assertAlmostEqual(model.denom(context), want, msg=context)
    # Example #8
    # 0
    def test_init_2gram(self):
        """Check A(), alpha() and denom() of a bigram model with beta=0.5.

        The sets returned by A() are compared exactly. alpha() and denom()
        return floats (denom expectations are built by chained subtraction),
        so they are compared with a tolerance instead of exact equality.
        """
        model = BackOffNGram(2, self.sents, beta=0.5)

        # tokens expected in A() for each one-token context
        A = {
            ('<s>',): {'el', 'la'},
            ('el',): {'gato'},
            ('gato',): {'come'},
            ('come',): {'pescado', 'salmón'},
            ('pescado',): {'.'},
            ('.',): {'</s>'},
            ('la',): {'gata'},
            ('gata',): {'come'},
            ('salmón',): {'.'},
        }
        for tokens, Aset in A.items():
            self.assertEqual(model.A(tokens), Aset, tokens)

        # missing probability mass
        alpha = {
            ('<s>',): 2 * 0.5 / 2,
            ('el',): 1 * 0.5 / 1,
            ('gato',): 1 * 0.5 / 1,
            ('come',): 2 * 0.5 / 2,
            ('pescado',): 1 * 0.5 / 1,
            ('.',): 1 * 0.5 / 2,
            ('la',): 1 * 0.5 / 1,
            ('gata',): 1 * 0.5 / 1,
            ('salmón',): 1 * 0.5 / 1,
        }
        for tokens, a in alpha.items():
            self.assertAlmostEqual(model.alpha(tokens), a, msg=tokens)

        # normalization factor
        denom = {
            ('<s>',): 1.0 - model.cond_prob('el') - model.cond_prob('la'),
            ('el',): 1.0 - model.cond_prob('gato'),
            ('gato',): 1.0 - model.cond_prob('come'),
            ('come',): 1.0 - model.cond_prob('pescado') - model.cond_prob('salmón'),
            ('pescado',): 1.0 - model.cond_prob('.'),
            ('.',): 1.0 - model.cond_prob('</s>'),
            ('la',): 1.0 - model.cond_prob('gata'),
            ('gata',): 1.0 - model.cond_prob('come'),
            ('salmón',): 1.0 - model.cond_prob('.'),
        }
        for tokens, d in denom.items():
            # the model may subtract in a different order than the line above
            self.assertAlmostEqual(model.denom(tokens), d, msg=tokens)