Пример #1
0
class TestSequenceFunctions(unittest.TestCase):
    def setUp(self):
        self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.6, \
                                      kn_discount = 0.1,
                                      kn_concentration = 1.0,
                                      dirichlet_alpha=0.1)

    def test_vocab(self):
        self.lm.train_seen("a", 300)

        self.lm.train_seen("b")
        self.lm.train_seen("c")
        self.lm.finalize()

        # Infrequent words should look the same
        self.assertEqual(self.lm.vocab_lookup("b"), self.lm.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(self.lm.vocab_lookup("b"), self.lm.vocab_lookup("d"),
                         "")

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.lm.vocab_lookup("a"),
                            self.lm.vocab_lookup("b"))

    def test_censor(self):
        self.lm.train_seen("a", 300)

        self.lm.train_seen("b")
        self.lm.train_seen("c")
        self.lm.finalize()

        censored_a = list(self.lm.tokenize_and_censor("a b d"))
        censored_b = list(self.lm.tokenize_and_censor("d b a"))
        censored_c = list(self.lm.tokenize_and_censor("a b d"))
        censored_d = list(self.lm.tokenize_and_censor("b d a"))

        self.assertEqual(censored_a, censored_c)
        self.assertEqual(censored_b, censored_d)

        # Should add start and end tag
        print("#".join(self.lm.tokenize("a b d")))
        print(censored_a)
        self.assertEqual(len(censored_a), 5)
        self.assertEqual(censored_a[0], censored_b[0])
        self.assertEqual(censored_a[-1], censored_b[-1])
        self.assertEqual(censored_a[1], censored_b[3])
        self.assertEqual(censored_a[2], censored_b[1])

    def test_lm(self):
        self.lm.train_seen("a", 300)
        self.lm.finalize()

        self.lm.add_train("a a b")

        # Test MLE
        word_start = self.lm.vocab_lookup(kSTART)
        word_end = self.lm.vocab_lookup(kEND)
        word_a = self.lm.vocab_lookup("a")
        word_b = self.lm.vocab_lookup("b")
        word_c = self.lm.vocab_lookup("c")

        self.assertAlmostEqual(self.lm.mle(word_start, word_b), kNEG_INF)
        self.assertAlmostEqual(self.lm.mle(word_start, word_a), lg(1.0))
        self.assertAlmostEqual(self.lm.mle(word_a, word_a), lg(0.5))
        self.assertAlmostEqual(self.lm.mle(word_a, word_b), lg(0.5))
        self.assertAlmostEqual(self.lm.mle(word_a, word_c), lg(0.5))

        # Test Add one
        self.assertAlmostEqual(self.lm.laplace(word_start, word_b),
                               lg(1.0 / 5.0))
        self.assertAlmostEqual(self.lm.laplace(word_start, word_a),
                               lg(2.0 / 5.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_a), lg(2.0 / 6.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_b), lg(2.0 / 6.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_c), lg(2.0 / 6.0))

        # Test Dirichlet
        self.assertAlmostEqual(self.lm.dirichlet(word_start, word_b),
                               lg(0.1 / 1.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_start, word_a),
                               lg(1.1 / 1.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_a),
                               lg(1.1 / 2.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_b),
                               lg(1.1 / 2.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_c),
                               lg(1.1 / 2.4))

        # Test Kneser-Ney
        self.assertAlmostEqual(self.lm.kneser_ney(word_start, word_a),
                               lg(0.69475))
        self.assertAlmostEqual(self.lm.kneser_ney(word_start, word_b),
                               lg(0.13475))
        self.assertAlmostEqual(self.lm.kneser_ney(word_start, word_end),
                               lg(0.13475))

        # Test Jelinek Mercer
        self.assertAlmostEqual(self.lm.jelinek_mercer(word_start, word_end),
                               lg(0.1))
        self.assertAlmostEqual(self.lm.jelinek_mercer(word_start, word_a),
                               lg(0.8))
Пример #2
0
 def setUp(self):
     self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.6, \
                                   kn_discount = 0.1,
                                   kn_concentration = 1.0,
                                   dirichlet_alpha=0.1)
Пример #3
0
 def setUp(self):
     self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.5, \
                                       dirichlet_alpha=0.1)
Пример #4
0
 def setUp(self):
     self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.5, \
                                       dirichlet_alpha=0.1)
Пример #5
0
class TestSequenceFunctions(unittest.TestCase):

    def setUp(self):
        self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.5, \
                                          dirichlet_alpha=0.1)

    def test_vocab(self):
        self.lm.train_seen("a", 300)

        self.lm.train_seen("b")
        self.lm.train_seen("c")
        self.lm.finalize()

        # Infrequent words should look the same
        self.assertEqual(self.lm.vocab_lookup("b"),
                         self.lm.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(self.lm.vocab_lookup("b"),
                         self.lm.vocab_lookup("d"),
                         "")

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.lm.vocab_lookup("a"),
                            self.lm.vocab_lookup("b"))

    def test_censor(self):
        self.lm.train_seen("a", 300)

        self.lm.train_seen("b")
        self.lm.train_seen("c")
        self.lm.finalize()

        censored_a = list(self.lm.tokenize_and_censor("a b d"))
        censored_b = list(self.lm.tokenize_and_censor("d b a"))
        censored_c = list(self.lm.tokenize_and_censor("a b d"))
        censored_d = list(self.lm.tokenize_and_censor("b d a"))

        self.assertEqual(censored_a, censored_c)
        self.assertEqual(censored_b, censored_d)

        # Should add start and end tag
        print("#".join(self.lm.tokenize("a b d")))
        print(censored_a)
        self.assertEqual(len(censored_a), 5)
        self.assertEqual(censored_a[0], censored_b[0])
        self.assertEqual(censored_a[-1], censored_b[-1])
        self.assertEqual(censored_a[1], censored_b[3])
        self.assertEqual(censored_a[2], censored_b[1])

    def test_lm(self):
        self.lm.train_seen("a", 300)
        self.lm.finalize()

        self.lm.add_train("a a b")

        # Test MLE
        word_start = self.lm.vocab_lookup(kSTART)
        word_end = self.lm.vocab_lookup(kEND)
        word_a = self.lm.vocab_lookup("a")
        word_b = self.lm.vocab_lookup("b")
        word_c = self.lm.vocab_lookup("c")

        self.assertAlmostEqual(self.lm.mle(word_start, word_b), kNEG_INF)
        self.assertAlmostEqual(self.lm.mle(word_start, word_a), lg(1.0))
        self.assertAlmostEqual(self.lm.mle(word_a, word_a), lg(0.5))
        self.assertAlmostEqual(self.lm.mle(word_a, word_b), lg(0.5))
        self.assertAlmostEqual(self.lm.mle(word_a, word_c), lg(0.5))

        # Test Add one
        self.assertAlmostEqual(self.lm.laplace(word_start, word_b),
                               lg(1.0 / 5.0))
        self.assertAlmostEqual(self.lm.laplace(word_start, word_a),
                               lg(2.0 / 5.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_a),
                               lg(2.0 / 6.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_b),
                               lg(2.0 / 6.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_c),
                               lg(2.0 / 6.0))

        # Test Dirichlet
        self.assertAlmostEqual(self.lm.dirichlet(word_start, word_b),
                               lg(0.1 / 1.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_start, word_a),
                               lg(1.1 / 1.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_a),
                               lg(1.1 / 2.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_b),
                               lg(1.1 / 2.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_c),
                               lg(1.1 / 2.4))

        # Test Kneser-Ney
        self.assertAlmostEqual(self.lm.kneser_ney(word_start, word_a),
                               lg(0.95))
        self.assertAlmostEqual(self.lm.kneser_ney(word_start, word_b),
                               lg(0.025))
        self.assertAlmostEqual(self.lm.kneser_ney(word_start, word_end),
                               lg(0.025))

        # Test Jelinek Mercer
        self.assertAlmostEqual(self.lm.jelinek_mercer(word_start, word_end),
                               lg(0.1))
        self.assertAlmostEqual(self.lm.jelinek_mercer(word_start, word_a),
                               lg(0.8))
Пример #6
0
class TestSequenceFunctions(unittest.TestCase):

    def setUp(self):
        self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.5, \
                                          dirichlet_alpha=0.1)

    def test_vocab(self):
        self.lm.train_seen("a", 300)

        self.lm.train_seen("b")
        self.lm.train_seen("c")
        self.lm.finalize()

        # Infrequent words should look the same
        self.assertEqual(self.lm.vocab_lookup("b"),
                         self.lm.vocab_lookup("c"))

        # Infrequent words should look the same as never seen words
        self.assertEqual(self.lm.vocab_lookup("b"),
                         self.lm.vocab_lookup("d"),
                         "")

        # The frequent word should be different from the infrequent word
        self.assertNotEqual(self.lm.vocab_lookup("a"),
                            self.lm.vocab_lookup("b"))

    def test_censor(self):
        self.lm.train_seen("a", 300)

        self.lm.train_seen("b")
        self.lm.train_seen("c")
        self.lm.finalize()

        censored_a = list(self.lm.censor(["a", "b", "d"]))
        censored_b = list(self.lm.censor(["d", "b", "a"]))
        censored_c = list(self.lm.censor(["a", "d", "b"]))

        self.assertEqual(censored_a, censored_c)

        # Should add start and end tag
        self.assertEqual(len(censored_a), 5)
        self.assertEqual(censored_a[0], censored_b[0])
        self.assertEqual(censored_a[-1], censored_b[-1])
        self.assertEqual(censored_a[1], censored_b[3])
        self.assertEqual(censored_a[2], censored_b[2])
        self.assertEqual(censored_a[3], censored_c[3])

    def test_lm(self):
        self.lm.train_seen("a", 300)
        self.lm.finalize()

        self.lm.add_train(["a", "a", "b"])

        # Test MLE
        word_start = self.lm.vocab_lookup(kSTART)
        word_end = self.lm.vocab_lookup(kEND)
        word_a = self.lm.vocab_lookup("a")
        word_b = self.lm.vocab_lookup("b")
        word_c = self.lm.vocab_lookup("c")

        self.assertAlmostEqual(self.lm.mle(word_start, word_b), kNEG_INF)
        self.assertAlmostEqual(self.lm.mle(word_start, word_a), log(1.0))
        self.assertAlmostEqual(self.lm.mle(word_a, word_a), log(0.5))
        self.assertAlmostEqual(self.lm.mle(word_a, word_b), log(0.5))
        self.assertAlmostEqual(self.lm.mle(word_a, word_c), log(0.5))

        # Test Add one
        self.assertAlmostEqual(self.lm.laplace(word_start, word_b),
                               log(1.0 / 5.0))
        self.assertAlmostEqual(self.lm.laplace(word_start, word_a),
                               log(2.0 / 5.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_a),
                               log(2.0 / 6.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_b),
                               log(2.0 / 6.0))
        self.assertAlmostEqual(self.lm.laplace(word_a, word_c),
                               log(2.0 / 6.0))

        # Test Dirichlet
        self.assertAlmostEqual(self.lm.dirichlet(word_start, word_b),
                               log(0.1 / 1.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_start, word_a),
                               log(1.1 / 1.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_a),
                               log(1.1 / 2.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_b),
                               log(1.1 / 2.4))
        self.assertAlmostEqual(self.lm.dirichlet(word_a, word_c),
                               log(1.1 / 2.4))
Пример #7
0
 def setUp(self):
     self.lm = BigramLanguageModel(kUNK_CUTOFF, jm_lambda=0.6, \
                                   kn_discount = 0.1,
                                   kn_concentration = 1.0,
                                   dirichlet_alpha=0.1)