def test_sent_log_prob_3and4gram(self):
    """Check ``sent_log_prob`` on 3-gram and 4-gram models.

    Builds ``NGram(3, self.sents2)`` and ``NGram(4, self.sents2)`` and
    asserts that both models return the same expected log-probability
    (base 2) for every tokenized sentence in the table below.
    """
    # Bind log2 inside the test, as the sibling tests do, so this method
    # does not rely on a module-level name being present.
    from math import log2

    models = (NGram(3, self.sents2), NGram(4, self.sents2))

    sents = {
        'el gato come pescado nuevo .': float('-inf'),  # 'nuevo' unseen
        'la la la': float('-inf'),  # 'la' after 'la' unseen
        # after 'pescado': 'viejo' and 'fresco' have prob 0.5.
        'el gato come pescado fresco . ': log2(0.5),
        'el gato come pescado viejo . ': log2(0.5),
    }

    for sent, prob in sents.items():
        tokens = sent.split()
        # Same expectation for both orders; the 3-gram model is checked first.
        for model in models:
            self.assertAlmostEqual(model.sent_log_prob(tokens), prob, msg=sent)
def test_sent_log_prob_1gram(self):
    """Check ``sent_log_prob`` on a unigram model built from ``self.sents``.

    Each key of the table is a sentence that is split on whitespace and
    passed to ``NGram(1, self.sents).sent_log_prob``; the value is the
    expected base-2 log-probability.
    """
    ngram = NGram(1, self.sents)

    def log2(x):
        # Base-2 logarithm; a named function rather than a lambda bound
        # to a variable.
        return log(x, 2)

    sents = {
        # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
        'el gato come pescado .': 3 * log2(1 / 6.0) + 3 * log2(1 / 12.0),
        'la gata come salmón .': 3 * log2(1 / 6.0) + 3 * log2(1 / 12.0),
        'el gato come salame .': float('-inf'),  # 'salame' unseen
        'la la la': log2(1 / 6.0) + 3 * log2(1 / 12.0),
    }

    for sent, prob in sents.items():
        self.assertAlmostEqual(ngram.sent_log_prob(sent.split()), prob, msg=sent)
def test_sent_log_prob_1gram(self):
    """Verify unigram sentence log-probabilities against a fixed table.

    The model is ``NGram(1, self.sents)``. Every sentence in the table is
    tokenized with ``str.split`` and its ``sent_log_prob`` is compared to
    the expected base-2 value with ``assertAlmostEqual``.
    """
    def log2(x):
        return log(x, 2)

    model = NGram(1, self.sents)

    # 'come', '.' and '</s>' have prob 1/6, the rest have 1/12.
    expected = {
        'el gato come pescado .': 3 * log2(1 / 6.0) + 3 * log2(1 / 12.0),
        'la gata come salmón .': 3 * log2(1 / 6.0) + 3 * log2(1 / 12.0),
        # 'salame' unseen
        'el gato come salame .': -inf,
        'la la la': log2(1 / 6.0) + 3 * log2(1 / 12.0),
    }

    for sentence, log_prob in expected.items():
        tokens = sentence.split()
        actual = model.sent_log_prob(tokens)
        self.assertAlmostEqual(actual, log_prob, msg=sentence)
def test_sent_log_prob_2gram(self):
    """Check ``sent_log_prob`` on a bigram model built from ``self.sents``.

    Each key of the table is a sentence that is split on whitespace and
    passed to ``NGram(2, self.sents).sent_log_prob``; the value is the
    expected base-2 log-probability.
    """
    ngram = NGram(2, self.sents)

    def log2(x):
        # Base-2 logarithm; a named function rather than a lambda bound
        # to a variable.
        return log(x, 2)

    sents = {
        # after '<s>': 'el' and 'la' have prob 0.5.
        # after 'come': 'pescado' and 'salmón' have prob 0.5.
        'el gato come pescado .': 2 * log2(0.5),
        'la gata come salmón .': 2 * log2(0.5),
        'el gato come salmón .': 2 * log2(0.5),
        'la gata come pescado .': 2 * log2(0.5),
        'el gato come salame .': float('-inf'),  # 'salame' unseen
        'la la la': float('-inf'),  # 'la' after 'la' unseen
    }

    for sent, prob in sents.items():
        self.assertAlmostEqual(ngram.sent_log_prob(sent.split()), prob, msg=sent)
def test_sent_log_prob_2gram(self):
    """Verify bigram sentence log-probabilities against a fixed table.

    The model is ``NGram(2, self.sents)``. Every sentence in the table is
    tokenized with ``str.split`` and its ``sent_log_prob`` is compared to
    the expected base-2 value with ``assertAlmostEqual``.
    """
    def log2(x):
        return log(x, 2)

    model = NGram(2, self.sents)

    # after '<s>': 'el' and 'la' have prob 0.5.
    # after 'come': 'pescado' and 'salmón' have prob 0.5.
    expected = {
        'el gato come pescado .': 2 * log2(0.5),
        'la gata come salmón .': 2 * log2(0.5),
        'el gato come salmón .': 2 * log2(0.5),
        'la gata come pescado .': 2 * log2(0.5),
        # 'salame' unseen
        'el gato come salame .': -inf,
        # 'la' after 'la' unseen
        'la la la': -inf,
    }

    for sentence, log_prob in expected.items():
        tokens = sentence.split()
        actual = model.sent_log_prob(tokens)
        self.assertAlmostEqual(actual, log_prob, msg=sentence)
def test_sent_log_prob_3gram(self):
    """Check ``sent_log_prob`` on a trigram model built from ``self.sents``.

    Each key of the table is a sentence that is split on whitespace and
    passed to ``NGram(3, self.sents).sent_log_prob``; the value is the
    expected base-2 log-probability.
    """
    ngram = NGram(3, self.sents)

    def log2(x):
        # Base-2 logarithm; a named function rather than a lambda bound
        # to a variable.
        return log(x, 2)

    sents = {
        # after '<s>': 'el' and 'la' have prob 0.5.
        # after 'come': 'pescado' and 'salmón' have prob 0.5.
        'el gato come pescado .': log2(0.5),
        'la gata come salmón .': log2(0.5),
        # NOTE(review): the next two are expected to be impossible here,
        # presumably because they mix contexts never seen together in
        # self.sents -- confirm against the fixture.
        'el gato come salmón .': float('-inf'),
        'la gata come pescado .': float('-inf'),
        'el gato come salame .': float('-inf'),  # 'salame' unseen
        'la la la': float('-inf'),  # 'la' after 'la' unseen
    }

    for sent, prob in sents.items():
        self.assertAlmostEqual(ngram.sent_log_prob(sent.split()), prob, msg=sent)