class LidstoneBigramTests(unittest.TestCase):
    """unit tests for Lidstone class"""

    score_tests = [
        # count(d | c) = 1
        # *count(d | c) = 1.1
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 0.8 = 14.8
        # count("a") = 2
        # *count("a") = 2.1
        ("a", None, 2.1 / 14.8),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 0.1
        ("z", None, 0.1 / 14.8),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 3.1
        ("y", None, 3.1 / 14.8),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        self.assertEqual(0.1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a  = 0.3929, -1.3479
        # a, c    = 0.0357, -4.8074
        # c, UNK  = 0.0(5), -4.1699
        # UNK, d  = 0.0263, -5.2479
        # d, c    = 0.0357, -4.8074
        # c, </s> = 0.0(5), -4.1699
        # TOTAL logscore: -24.5504
        # - AVG logscore: 4.0917
        H = 4.0917
        perplexity = 17.0504
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
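
# Optional sanity check, not part of the test suite or of NLTK's API.
# The expected values in ``score_tests`` above follow the Lidstone formula
# (count + gamma) / (context_count + gamma * vocab_size).  The helper below is
# a hypothetical sketch that only restates that arithmetic with gamma = 0.1
# and vocab_size = 8, the values used by the fixtures.
def _lidstone_score_sketch(count, context_total, gamma=0.1, vocab_size=8):
    """Hypothetical helper mirroring the expected values in score_tests."""
    return (count + gamma) / (context_total + gamma * vocab_size)

# For example:
#   _lidstone_score_sketch(1, 1)   -> 1.1 / 1.8    (score of "d" given "c")
#   _lidstone_score_sketch(2, 14)  -> 2.1 / 14.8   (unigram score of "a")
#   _lidstone_score_sketch(3, 14)  -> 3.1 / 14.8   (unigram score of "<UNK>")
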
class TestLidstoneBigram(metaclass=ParametrizedTests):
    """Unit tests for Lidstone class"""

    score_tests = [
        # count(d | c) = 1
        # *count(d | c) = 1.1
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 1.8
        ("d", ["c"], 1.1 / 1.8),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 0.8 = 14.8
        # count("a") = 2
        # *count("a") = 2.1
        ("a", None, 2.1 / 14.8),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 0.1
        ("z", None, 0.1 / 14.8),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 3.1
        ("y", None, 3.1 / 14.8),
    ]

    @classmethod
    def setup_method(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Lidstone(0.1, 2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        assert 0.1 == self.model.gamma

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE this should be able to handle completely novel ngrams
        # Ngram = score, log score
        # <s>, a  = 0.3929, -1.3479
        # a, c    = 0.0357, -4.8074
        # c, UNK  = 0.0(5), -4.1699
        # UNK, d  = 0.0263, -5.2479
        # d, c    = 0.0357, -4.8074
        # c, </s> = 0.0(5), -4.1699
        # TOTAL logscore: -24.5504
        # - AVG logscore: 4.0917
        H = 4.0917
        perplexity = 17.0504
        assert pytest.approx(self.model.entropy(text), 1e-4) == H
        assert pytest.approx(self.model.perplexity(text), 1e-4) == perplexity
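
# Optional hand check of the entropy/perplexity expectations above.  This is
# only a sketch and is not used by the tests: it recomputes H and perplexity
# from the per-ngram log scores already listed in the comments, assuming the
# usual definitions entropy = -(average log2 score), perplexity = 2 ** entropy.
def _entropy_perplexity_hand_check():
    """Hypothetical sketch reproducing H ~ 4.0917 and perplexity ~ 17.05."""
    log_scores = [-1.3479, -4.8074, -4.1699, -5.2479, -4.8074, -4.1699]
    H = -sum(log_scores) / len(log_scores)  # ~4.0917
    return H, 2 ** H                        # perplexity ~17.05
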