示例#1
0
    def test_cond_prob_normalization_2gram_no_addone(self):
        """Check the bigram back-off distribution conditioned on 'el'.

        Builds a 2-gram model with beta=0.5 and addone=False, verifies the
        value of ``alpha(('el',))``, then compares ``cond_prob`` against the
        hand-computed table below and checks that the table sums to one.

        Floating-point results are compared with a tolerance rather than
        with exact equality, so harmless rounding differences in the model's
        arithmetic do not produce spurious failures.
        """
        model = BackOffNGram(2, self.sents, beta=0.5, addone=False)

        alpha = 1.0 - (1.0 - 0.5) / 1.0
        denom = model.denom(('el',))
        self.assertAlmostEqual(model.alpha(('el',)), alpha)

        probs = {
            # discounted bigram estimate:
            ('gato', 'el'): (1.0 - 0.5) / 1.0,
            # back-off to the unigrams:
            ('el', 'el'): alpha * 1.0 / (12.0 * denom),
            ('come', 'el'): alpha * 2.0 / (12.0 * denom),
            ('pescado', 'el'): alpha * 1.0 / (12.0 * denom),
            ('.', 'el'): alpha * 2.0 / (12.0 * denom),
            ('</s>', 'el'): alpha * 2.0 / (12.0 * denom),
            ('la', 'el'): alpha * 1.0 / (12.0 * denom),
            ('gata', 'el'): alpha * 1.0 / (12.0 * denom),
            ('salmón', 'el'): alpha * 1.0 / (12.0 * denom),
        }
        for (token, prev), p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token, [prev]), p,
                                   msg=(token, prev))

        # the sum is one (same 1e-10 tolerance the test always used):
        prob_sum = sum(probs.values())
        self.assertAlmostEqual(prob_sum, 1.0, delta=1e-10, msg=prob_sum)
    def test_count_2gram(self):
        """Check unigram and bigram counts of 2-gram models.

        The same expected counts are asserted for every combination of
        ``beta`` and ``addone`` built below.
        """
        expected_counts = {
            (): 12,
            # unigrams
            ('el', ): 1,
            ('gato', ): 1,
            ('come', ): 2,
            ('pescado', ): 1,
            ('.', ): 2,
            ('</s>', ): 2,
            ('la', ): 1,
            ('gata', ): 1,
            ('salmón', ): 1,
            # bigrams
            ('<s>', 'el'): 1,
            ('el', 'gato'): 1,
            ('gato', 'come'): 1,
            ('come', 'pescado'): 1,
            ('pescado', '.'): 1,
            ('.', '</s>'): 2,
            ('<s>', 'la'): 1,
            ('la', 'gata'): 1,
            ('gata', 'come'): 1,
            ('come', 'salmón'): 1,
            ('salmón', '.'): 1,
        }

        # same test for different values of beta and addone:
        settings = (
            {'beta': 0.5},
            {'beta': 0.5, 'addone': False},
            {'beta': 0.0},
            {'beta': 0.0, 'addone': False},
        )
        models = [BackOffNGram(2, self.sents, **kwargs) for kwargs in settings]

        for model in models:
            for ngram, expected in expected_counts.items():
                self.assertEqual(model.count(ngram), expected)
    def test_cond_prob_normalization_2gram_no_addone(self):
        """Check the bigram back-off distribution conditioned on 'el'.

        Uses a 2-gram model with beta=0.5 and addone=False: asserts the value
        of alpha for the context, compares ``cond_prob`` with hand-computed
        expectations, and checks that those expectations sum to one.
        """
        model = BackOffNGram(2, self.sents, beta=0.5, addone=False)

        context = ('el', )
        alpha = 1.0 - (1.0 - 0.5) / 1.0
        denom = model.denom(context)
        self.assertAlmostEqual(model.alpha(context), alpha)

        # the only bigram entry in the table; everything else backs off
        probs = {('gato', 'el'): (1.0 - 0.5) / 1.0}
        # back-off to the unigrams (token, numerator used for the unigram):
        backed_off = (
            ('el', 1.0),
            ('come', 2.0),
            ('pescado', 1.0),
            ('.', 2.0),
            ('</s>', 2.0),
            ('la', 1.0),
            ('gata', 1.0),
            ('salmón', 1.0),
        )
        for word, numerator in backed_off:
            probs[(word, 'el')] = alpha * numerator / (12.0 * denom)

        for pair, expected in probs.items():
            token, prev = pair
            actual = model.cond_prob(token, (prev, ))
            self.assertAlmostEqual(actual, expected, msg=(token, prev))

        # the sum is one:
        total = sum(probs.values())
        self.assertAlmostEqual(total, 1.0)
    def test_norm_3gram(self):
        """Check that 3-gram conditional probabilities never sum above one.

        For each beta/addone combination and each two-token context built
        below, the probabilities of all tokens are summed and checked with
        ``assertAlmostLessEqual`` against 1.0.
        """
        models = [
            BackOffNGram(3, self.sents, beta=beta, addone=addone)
            for addone in (False, True)
            for beta in (0.0, 0.5)
        ]

        tokens = {
            'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón',
            '</s>'
        }
        prev_tokens = {
            'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '<s>'
        }

        # contexts: sentence start, start followed by a token, any token pair
        prevs = [('<s>', '<s>')]
        prevs.extend(('<s>', second) for second in prev_tokens)
        prevs.extend(
            (first, second) for first in prev_tokens for second in prev_tokens)

        for model in models:
            for prev in prevs:
                prob_sum = sum(model.cond_prob(token, prev) for token in tokens)
                # prob_sum < 1.0 or almost equal to 1.0:
                self.assertAlmostLessEqual(prob_sum, 1.0, msg=prev)
示例#5
0
    def test_cond_prob_2gram_no_discount_no_addone(self):
        """Check bigram probabilities after 'come' with beta=0.0, addone=False."""
        model = BackOffNGram(2, self.sents, beta=0.0, addone=False)

        expected = {
            ('pescado', 'come'): 1.0 / 2.0,
            ('salmón', 'come'): 1.0 / 2.0,
            # back-off to the unigram that is 0.0
            ('salame', 'come'): 0.0,
        }
        for pair, prob in expected.items():
            token, prev = pair
            actual = model.cond_prob(token, (prev,))
            self.assertAlmostEqual(actual, prob, msg=token)
示例#6
0
    def test_cond_prob_2gram_no_discount_no_addone(self):
        """Check bigram probabilities after 'come' with beta=0.0, addone=False.

        Probabilities are floats, so they are compared with a tolerance
        instead of exact equality; the failure message names both the token
        and its context.
        """
        model = BackOffNGram(2, self.sents, beta=0.0, addone=False)

        probs = {
            ('pescado', 'come'): 1.0 / 2.0,
            ('salmón', 'come'): 1.0 / 2.0,
            ('salame', 'come'): 0.0,  # back-off to the unigram that is 0.0
        }
        for (token, prev), p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token, [prev]), p,
                                   msg=(token, prev))
    def test_cond_prob_1gram_no_addone(self):
        """Check unigram probabilities of a 1-gram model without add-one."""
        model = BackOffNGram(1, self.sents, beta=0.5, addone=False)

        # behaves just like unsmoothed n-gram
        expected = (
            ('pescado', 1 / 12.0),
            ('come', 2 / 12.0),
            ('salame', 0.0),
        )
        for word, prob in expected:
            actual = model.cond_prob(word)
            self.assertAlmostEqual(actual, prob, msg=word)
示例#8
0
    def test_cond_prob_1gram_no_addone(self):
        """Check unigram probabilities of a 1-gram model without add-one.

        Expected values such as ``1 / 12.0`` are not exactly representable
        in binary floating point, so the comparison uses a tolerance, and the
        offending token is reported on failure.
        """
        model = BackOffNGram(1, self.sents, beta=0.5, addone=False)

        # behaves just like unsmoothed n-gram
        probs = {
            'pescado': 1 / 12.0,
            'come': 2 / 12.0,
            'salame': 0.0,
        }
        for token, p in probs.items():
            self.assertAlmostEqual(model.cond_prob(token), p, msg=token)
示例#9
0
    def test_norm_1gram(self):
        """Check that unigram probabilities never sum above one.

        Runs the same check for every beta/addone combination built below.
        """
        tokens = ['el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '</s>']

        models = [
            BackOffNGram(1, self.sents, beta=beta, addone=addone)
            for addone in (False, True)
            for beta in (0.0, 0.5)
        ]

        for model in models:
            total = sum(model.cond_prob(word) for word in tokens)
            # total < 1.0 or almost equal to 1.0:
            self.assertAlmostLessEqual(total, 1.0)
    def test_held_out(self):
        """Check the unigram counts of a default 1-gram back-off model."""
        model = BackOffNGram(1, self.sents)

        # only first sentence (second sentence is held-out data)
        expected = {(): 6}
        for word in ('el', 'gato', 'come', 'pescado', '.', '</s>'):
            expected[(word, )] = 1

        for ngram in expected:
            self.assertEqual(model.count(ngram), expected[ngram], ngram)
示例#11
0
    def test_held_out(self):
        """Check the unigram counts of a default 1-gram back-off model."""
        model = BackOffNGram(1, self.sents)

        # only first sentence (second sentence is held-out data)
        expected_counts = [
            ((), 6),
            (('el',), 1),
            (('gato',), 1),
            (('come',), 1),
            (('pescado',), 1),
            (('.',), 1),
            (('</s>',), 1),
        ]
        for ngram, expected in expected_counts:
            actual = model.count(ngram)
            self.assertEqual(actual, expected, ngram)
示例#12
0
    def test_norm_2gram(self):
        """Check that bigram conditional probabilities never sum above one.

        For each beta/addone combination and each single-token context, the
        probabilities of all tokens are summed and checked against 1.0.
        """
        tokens = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '</s>'}
        prevs = {'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón', '<s>'}

        models = [
            BackOffNGram(2, self.sents, beta=beta, addone=addone)
            for addone in (False, True)
            for beta in (0.0, 0.5)
        ]

        for model in models:
            for prev in prevs:
                context = (prev,)
                total = sum(model.cond_prob(word, context) for word in tokens)
                # total < 1.0 or almost equal to 1.0:
                self.assertAlmostLessEqual(total, 1.0, msg=prev)
示例#13
0
    # order of the model
    n = int(opts['-n'])
    # model type
    m = str(opts['-m'])
    # path of the file the pickled model is written to
    filename = opts['-o']

    # train the model selected with -m
    if m == "ngram":
        print("NGram Model selected")
        model = NGram(n, sents)
    elif m == "addone":
        print("AddOne NGram Model selected")
        model = AddOneNGram(n, sents)
    elif m == "interpolated":
        print("Interpolated NGram Model selected")
        model = InterpolatedNGram(n, sents, addone=True)
    elif m == "backoff":
        print("BackOff NGram Model selected")
        model = BackOffNGram(n, sents, addone=True)
    else:
        print("Bad Model Type")
        # NOTE(review): `help` is presumably a usage function defined in this
        # script; the builtin `help()` would open the interactive helper and
        # return None - confirm.
        print(help())
        exit()

    print("n: %d\nOutput file: %s\n" % (n, filename))
    # save it; the context manager closes the file even if pickling fails
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
    def test_init_2gram(self):
        """Check the A sets, alpha and denom values of a 2-gram model.

        The model is built with beta=0.5; expected values are listed per
        one-token context and compared against ``model.A``, ``model.alpha``
        and ``model.denom``.
        """
        model = BackOffNGram(2, self.sents, beta=0.5)
        unigram = model.cond_prob

        expected_A = {
            ('<s>', ): {'el', 'la'},
            ('el', ): {'gato'},
            ('gato', ): {'come'},
            ('come', ): {'pescado', 'salmón'},
            ('pescado', ): {'.'},
            ('.', ): {'</s>'},
            ('la', ): {'gata'},
            ('gata', ): {'come'},
            ('salmón', ): {'.'},
        }
        for context, followers in expected_A.items():
            self.assertEqual(model.A(context), followers, context)

        # missing probability mass
        expected_alpha = {
            ('<s>', ): 2 * 0.5 / 2,
            ('el', ): 1 * 0.5 / 1,
            ('gato', ): 1 * 0.5 / 1,
            ('come', ): 2 * 0.5 / 2,
            ('pescado', ): 1 * 0.5 / 1,
            ('.', ): 1 * 0.5 / 2,
            ('la', ): 1 * 0.5 / 1,
            ('gata', ): 1 * 0.5 / 1,
            ('salmón', ): 1 * 0.5 / 1,
        }
        for context, mass in expected_alpha.items():
            self.assertAlmostEqual(model.alpha(context), mass, msg=context)

        # normalization factor
        expected_denom = {
            ('<s>', ): 1.0 - unigram('el') - unigram('la'),
            ('el', ): 1.0 - unigram('gato'),
            ('gato', ): 1.0 - unigram('come'),
            ('come', ): 1.0 - unigram('pescado') - unigram('salmón'),
            ('pescado', ): 1.0 - unigram('.'),
            ('.', ): 1.0 - unigram('</s>'),
            ('la', ): 1.0 - unigram('gata'),
            ('gata', ): 1.0 - unigram('come'),
            ('salmón', ): 1.0 - unigram('.'),
        }
        for context, factor in expected_denom.items():
            self.assertAlmostEqual(model.denom(context), factor, msg=context)
示例#15
0
    def test_init_2gram(self):
        """Check the A sets, alpha and denom values of a 2-gram model.

        The model is built with beta=0.5. The A sets are compared exactly;
        alpha and denom are floating-point values, so they are compared with
        a tolerance instead of exact equality to avoid failures caused only
        by rounding differences in how the model computes them.
        """
        model = BackOffNGram(2, self.sents, beta=0.5)

        A = {
            ('<s>',): {'el', 'la'},
            ('el',): {'gato'},
            ('gato',): {'come'},
            ('come',): {'pescado', 'salmón'},
            ('pescado',): {'.'},
            ('.',): {'</s>'},
            ('la',): {'gata'},
            ('gata',): {'come'},
            ('salmón',): {'.'},
        }
        for tokens, Aset in A.items():
            self.assertEqual(model.A(tokens), Aset, tokens)

        # missing probability mass
        alpha = {
            ('<s>',): 2 * 0.5 / 2,
            ('el',): 1 * 0.5 / 1,
            ('gato',): 1 * 0.5 / 1,
            ('come',): 2 * 0.5 / 2,
            ('pescado',): 1 * 0.5 / 1,
            ('.',): 1 * 0.5 / 2,
            ('la',): 1 * 0.5 / 1,
            ('gata',): 1 * 0.5 / 1,
            ('salmón',): 1 * 0.5 / 1,
        }
        for tokens, a in alpha.items():
            self.assertAlmostEqual(model.alpha(tokens), a, msg=tokens)

        # normalization factor
        denom = {
            ('<s>',): 1.0 - model.cond_prob('el') - model.cond_prob('la'),
            ('el',): 1.0 - model.cond_prob('gato'),
            ('gato',): 1.0 - model.cond_prob('come'),
            ('come',): 1.0 - model.cond_prob('pescado') - model.cond_prob('salmón'),
            ('pescado',): 1.0 - model.cond_prob('.'),
            ('.',): 1.0 - model.cond_prob('</s>'),
            ('la',): 1.0 - model.cond_prob('gata'),
            ('gata',): 1.0 - model.cond_prob('come'),
            ('salmón',): 1.0 - model.cond_prob('.'),
        }
        for tokens, d in denom.items():
            self.assertAlmostEqual(model.denom(tokens), d, msg=tokens)
示例#16
0
                                   sent_tokenizer=sent_tokenizer)
    # sents holds the tokenized sentences of the corpus
    sents = corpus.sents()

    # train the model selected with -m, using order -n
    type_model = opts['-m']
    n = int(opts['-n'])
    if type_model == 'ngram':
        model = NGram(n, sents)
        print(str(n) + '-gram will be ready')
    elif type_model == 'addone':
        model = AddOneNGram(n, sents)
        print(str(n) + '-addone will be ready')
    elif type_model == 'interpolated':
        model = InterpolatedNGram(n, sents)
        print(str(n) + '-interpolated will be ready')
    elif type_model == 'backoff':
        model = BackOffNGram(n, sents)
        print(str(n) + '-backoff will be ready')
    else:
        print('modelo erroneo')
        # NOTE(review): an unknown model type exits with status 0, which
        # signals success to the caller - consider a non-zero status.
        exit(0)

    # save it: pickle.dump writes the object as bytes (read it back with
    # pickle.load); the context manager closes the file even if dumping fails
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
示例#17
0
import pickle
from docopt import docopt
from corpus.ancora import AncoraCorpusReader
from languagemodeling.ngram import BackOffNGram

if __name__ == '__main__':
    # Train a BackOffNGram of order -n on the Ancora corpus and pickle it
    # to the file given with -o.
    opts = docopt(__doc__)
    n = int(opts['-n'])
    path = '/home/alangb/Escritorio/ancora-3.0.1es/'
    corpus = AncoraCorpusReader(path)
    sents = list(corpus.sents())
    # split words with "_" (underscore) into their parts; str.split returns
    # the word itself when there is no underscore, so no membership test is
    # needed
    checked_sents = [
        [part for word in sent for part in word.split('_')]
        for sent in sents
    ]
    # build model
    model = BackOffNGram(n, checked_sents)
    # save it; the context manager closes the file even if pickling fails
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)