import unittest

from nltk.lm import Laplace

# NB: _prepare_test_data is a helper defined elsewhere in this test module.


class LaplaceBigramTests(unittest.TestCase):
    """Unit tests for the Laplace class."""

    score_tests = [
        # basic sanity-check:
        # count(d | c) = 1
        # *count(d | c) = 2
        # Count(w | c for w in vocab) = 1
        # *Count(w | c for w in vocab) = 9
        ("d", ["c"], 2.0 / 9),
        # Total unigrams: 14
        # Vocab size: 8
        # Denominator: 14 + 8 = 22
        # count("a") = 2
        # *count("a") = 3
        ("a", None, 3.0 / 22),
        # in vocabulary but unseen
        # count("z") = 0
        # *count("z") = 1
        ("z", None, 1.0 / 22),
        # out of vocabulary should use "UNK" score
        # count("<UNK>") = 3
        # *count("<UNK>") = 4
        ("y", None, 4.0 / 22),
    ]

    def setUp(self):
        vocab, training_text = _prepare_test_data(2)
        self.model = Laplace(2, vocabulary=vocab)
        self.model.fit(training_text)

    def test_gamma(self):
        # Make sure the gamma is set to 1 (Laplace = Lidstone with gamma=1).
        self.assertEqual(1, self.model.gamma)

    def test_entropy_perplexity(self):
        text = [
            ("<s>", "a"),
            ("a", "c"),
            ("c", "<UNK>"),
            ("<UNK>", "d"),
            ("d", "c"),
            ("c", "</s>"),
        ]
        # Unlike MLE, this should be able to handle completely novel ngrams.
        # Ngram     = score,  log score
        # <s>, a    = 0.2,    -2.3219
        # a, c      = 0.1,    -3.3219
        # c, UNK    = 0.(1),  -3.1699
        # UNK, d    = 0.(09), -3.4594
        # d, c      = 0.1,    -3.3219
        # c, </s>   = 0.(1),  -3.1699
        # Total logscores: -18.7651
        # Entropy = -(total / 6) = 3.1275
        H = 3.1275
        perplexity = 8.7393  # 2 ** H
        self.assertAlmostEqual(H, self.model.entropy(text), places=4)
        self.assertAlmostEqual(perplexity, self.model.perplexity(text), places=4)
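# A minimal, self-contained sketch (not part of the test suite) reproducing the
# first score_tests entry with the public nltk.lm API. The training corpus
# ("abcd", "egadbe") and the explicit vocabulary below are assumptions inferred
# from the counts in the comments above (14 padded unigrams, vocab size 8
# including <UNK>); the real _prepare_test_data is not shown in this file.
from nltk.lm import Vocabulary
from nltk.lm.preprocessing import pad_both_ends
from nltk.util import everygrams

_vocab = Vocabulary(["a", "b", "c", "d", "z", "<s>", "</s>"], unk_cutoff=1)
_train = [
    list(everygrams(list(pad_both_ends(sent, n=2)), max_len=2))
    for sent in (list("abcd"), list("egadbe"))  # "e" and "g" fall back to <UNK>
]
_lm = Laplace(2, vocabulary=_vocab)
_lm.fit(_train)
# Laplace score: (count("d" | "c") + 1) / (count("c") + |V|) = (1 + 1) / (1 + 8)
assert abs(_lm.score("d", ["c"]) - 2.0 / 9) < 1e-12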
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline


def compute_pp(self, n, tokenized_train, tokenized_test):
    """Average per-sentence perplexity of an order-n Laplace LM on the test set."""
    train_data, train_vocab = padded_everygram_pipeline(n, tokenized_train)
    # Second pipeline call for the test side; its vocab iterator is not needed,
    # and it must not overwrite the training vocab used below.
    test_data, _ = padded_everygram_pipeline(n, tokenized_test)

    model = Laplace(n)  # model order matches the everygram pipeline order
    model.fit(train_data, train_vocab)

    # Average the per-sentence perplexities rather than computing one
    # corpus-level perplexity over the concatenated test stream.
    perplexities = [model.perplexity(test) for test in test_data]
    return sum(perplexities) / len(perplexities)
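# Hypothetical usage sketch: compute_pp takes self, so it presumably lives on a
# larger test/evaluation class. "Evaluator" and the toy corpora below are made
# up for illustration only.
class Evaluator:
    compute_pp = compute_pp  # reuse the module-level function above as a method


tokenized_train = [["the", "cat", "sat"], ["the", "dog", "ran"]]
tokenized_test = [["the", "cat", "ran"], ["a", "dog", "sat"]]
avg_pp = Evaluator().compute_pp(2, tokenized_train, tokenized_test)
print(f"average per-sentence perplexity: {avg_pp:.2f}")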
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline


def vary_ngram(train_corpus, test_corpus, n_gram_orders):
    '''
    Use nltk.lm.Laplace for training.
    Returns a dictionary of perplexity values for LMs of different n-gram orders.

    :param train_corpus: list of list of str, corpus to train the language model on.
    :param test_corpus: list of list of str, corpus to test the language model on.
    :param n_gram_orders: list of ints, orders of n-grams desired.
    :returns: a dictionary of perplexities at different orders, key=order, value=perplexity.
    :rtype: dict

    Hint: Follow the same LM training procedure as in the notebook at the end of Exercise 1.
    '''
    # Flatten the test corpus into a single padded token stream.
    test = sum([['<s>'] + x + ['</s>'] for x in test_corpus], [])
    ret = {}
    for order in n_gram_orders:
        train, vocab = padded_everygram_pipeline(order, train_corpus)
        lm = Laplace(order)
        lm.fit(train, vocab)
        ret[order] = lm.perplexity(test)
    return ret
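# Hypothetical usage with a toy corpus; the sentences below are made up for
# illustration only.
train_corpus = [["i", "like", "cheese"], ["i", "like", "tea"]]
test_corpus = [["i", "like", "coffee"]]
print(vary_ngram(train_corpus, test_corpus, [1, 2, 3]))
# Caveat (an assumption about nltk.lm internals, worth verifying): perplexity()
# expects an iterable of ngram tuples; handing it a flat list of word strings,
# as the notebook procedure above does, makes it slice each string
# character-wise. For strictly word-level scoring one would pass e.g.
# list(nltk.ngrams(test, order)) instead.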