def test_count_2gram(self):
        """Check the unigram and bigram counts stored by a bigram model.

        The model is built over ``self.sents`` with an explicit ``gamma``;
        every listed n-gram must be reported with the listed count.
        """
        model = InterpolatedNGram(2, self.sents, gamma=1.0)

        # (n-gram, expected count) pairs; the empty tuple is the total count.
        expected = [
            ((), 12),
            # unigrams
            (('el',), 1),
            (('gato',), 1),
            (('come',), 2),
            (('pescado',), 1),
            (('.',), 2),
            (('</s>',), 2),
            (('la',), 1),
            (('gata',), 1),
            (('salmón',), 1),
            # bigrams
            (('<s>', 'el'), 1),
            (('el', 'gato'), 1),
            (('gato', 'come'), 1),
            (('come', 'pescado'), 1),
            (('pescado', '.'), 1),
            (('.', '</s>'), 2),
            (('<s>', 'la'), 1),
            (('la', 'gata'), 1),
            (('gata', 'come'), 1),
            (('come', 'salmón'), 1),
            (('salmón', '.'), 1),
        ]
        for gram, count in expected:
            self.assertEqual(model.count(gram), count, gram)
    def test_cond_prob_1gram_no_addone(self):
        """Check unigram probabilities of a model built with ``addone=False``.

        For n = 1 the expected values are the plain relative frequencies
        (count / 12), i.e. the same as an unsmoothed n-gram model; a token
        that is absent from the expected counts ('salame') must get 0.0.
        """
        unigram = InterpolatedNGram(1, self.sents, gamma=1.0, addone=False)

        # (token, expected probability) pairs.
        expected = [
            ('pescado', 1 / 12.0),
            ('come', 2 / 12.0),
            ('salame', 0.0),
        ]
        for word, prob in expected:
            self.assertAlmostEqual(unigram.cond_prob(word), prob, msg=word)
    def test_cond_prob_2gram_no_addone(self):
        """Check interpolated bigram probabilities with ``addone=False``.

        Each expected value mixes the bigram estimate and the unigram
        estimate with weight ``lam = c / (c + gamma)``, where ``c`` is the
        count of the conditioning token.
        """
        gamma = 1.0
        model = InterpolatedNGram(2, self.sents, gamma, addone=False)

        # Both conditioning tokens used below ('come' and '.') have count 2.
        prev_count = 2.0
        lam = prev_count / (prev_count + gamma)

        # (token, previous token, expected probability) triples.
        expected = [
            ('pescado', 'come', lam * 0.5 + (1.0 - lam) * 1 / 12.0),
            ('salmón', 'come', lam * 0.5 + (1.0 - lam) * 1 / 12.0),
            ('salame', 'come', 0.0),
            ('</s>', '.', lam * 1.0 + (1.0 - lam) * 2 / 12.0),
        ]
        for word, previous, prob in expected:
            actual = model.cond_prob(word, [previous])
            self.assertAlmostEqual(actual, prob, msg=word)
    def test_held_out(self):
        """Check the counts of a model built without an explicit gamma.

        The expected counts cover only the first sentence of the fixture;
        the second sentence is treated as held-out data.
        """
        unigram = InterpolatedNGram(1, self.sents)

        # (n-gram, expected count) pairs for the first sentence only.
        expected = [
            ((), 6),
            (('el',), 1),
            (('gato',), 1),
            (('come',), 1),
            (('pescado',), 1),
            (('.',), 1),
            (('</s>',), 1),
        ]
        for gram, count in expected:
            self.assertEqual(unigram.count(gram), count, gram)
    def test_count_1gram(self):
        """Check the counts stored by a unigram model built with a gamma.

        Every listed unigram, plus the empty tuple holding the total, must
        be reported with the listed count.
        """
        unigram = InterpolatedNGram(1, self.sents, gamma=1.0)

        # (n-gram, expected count) pairs; the empty tuple is the total count.
        expected = [
            ((), 12),
            (('el',), 1),
            (('gato',), 1),
            (('come',), 2),
            (('pescado',), 1),
            (('.',), 2),
            (('</s>',), 2),
            (('la',), 1),
            (('gata',), 1),
            (('salmón',), 1),
        ]
        for gram, count in expected:
            self.assertEqual(unigram.count(gram), count, gram)
示例#6
0
    def test_norm_1gram(self):
        """Check that unigram probabilities never sum to more than 1.

        The check is repeated for several gamma values, first with
        ``addone=False`` and then with ``addone=True``.
        """
        gammas = (1.0, 5.0, 10.0, 50.0, 100.0)
        # All models are built up front: every gamma without add-one
        # smoothing first, then every gamma with it.
        models = [
            InterpolatedNGram(1, self.sents, gamma=gamma, addone=addone)
            for addone in (False, True)
            for gamma in gammas
        ]

        tokens = {
            'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón',
            '</s>'
        }

        for model in models:
            total = sum(map(model.cond_prob, tokens))
            # The total must be below 1.0 or almost equal to it.
            self.assertAlmostLessEqual(total, 1.0)
示例#7
0
    tokenizer = RegexpTokenizer(pattern)

    root = '.'
    # Raw string: '\.' is a regex-escaped dot, not a Python string escape
    # (the non-raw form triggers an invalid-escape-sequence warning).
    corpus = PlaintextCorpusReader(
        root, r'books\.txt', word_tokenizer=tokenizer)

    sents = corpus.sents()

    # Train the model selected with -m; any unrecognised value falls back
    # to the plain NGram model.
    n = int(opts['-n'])
    model_type = opts['-m']

    if model_type == 'addone':
        model = AddOneNGram(n, sents)
    elif model_type in ('inter', 'interaddone'):
        # -n needs an int() conversion above, so -g is presumably a string
        # as well; convert it so the model receives a number. None is
        # passed through unchanged, exactly as before.
        gamma = opts['-g']
        if gamma is not None:
            gamma = float(gamma)
        # The fourth positional argument toggles add-one smoothing.
        model = InterpolatedNGram(n, sents, gamma,
                                  model_type == 'interaddone')
    else:
        model = NGram(n, sents)

    # Save the trained model; the context manager guarantees the file is
    # flushed and closed even if pickling fails.
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
示例#8
0
                                   word_tokenizer=tokenizer,
                                   sent_tokenizer=sent_tokenizer)
    # Sentences of the corpus, as returned by the corpus reader.
    sents = corpus.sents()

    # Train the model selected with -m.
    type_model = opts['-m']
    n = int(opts['-n'])
    if type_model == 'ngram':
        model = NGram(n, sents)
        print(str(n) + '-gram will be ready')
    elif type_model == 'addone':
        model = AddOneNGram(n, sents)
        print(str(n) + '-addone will be ready')
    elif type_model == 'interpolated':
        model = InterpolatedNGram(n, sents)
        print(str(n) + '-interpolated will be ready')
    elif type_model == 'backoff':
        model = BackOffNGram(n, sents)
        print(str(n) + '-backoff will be ready')
    else:
        # Unknown model type: report it and stop before writing anything.
        # NOTE(review): this exits with status 0 although it is an error
        # path - confirm whether a non-zero status is wanted.
        print('modelo erroneo')
        exit(0)

    # Save the trained model as a pickle (it can be restored later with
    # pickle.load). The context manager closes the file even when
    # pickle.dump raises, which the explicit close() did not.
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
示例#9
0
    # Order of the model.
    n = int(opts['-n'])
    # Model type.
    m = str(opts['-m'])
    # Path of the output pickle file.
    filename = opts['-o']

    # Train the model selected with -m.
    if m == "ngram":
        print("NGram Model selected")
        model = NGram(n, sents)
    elif m == "addone":
        print("AddOne NGram Model selected")
        model = AddOneNGram(n, sents)
    elif m == "interpolated":
        print("Interpolated NGram Model selected")
        model = InterpolatedNGram(n, sents, addone=True)
    elif m == "backoff":
        print("BackOff NGram Model selected")
        model = BackOffNGram(n, sents, addone=True)
    else:
        # Unknown model type: report it and stop before writing anything.
        print("Bad Model Type")
        # NOTE(review): `help` is not defined in the visible code; if it is
        # the builtin, this opens the interactive helper - confirm.
        print(help())
        exit()

    print("n: %d\nOutput file: %s\n" % (n, filename))
    # Save the trained model; the context manager closes the file even when
    # pickle.dump raises, which the explicit close() did not.
    with open(filename, 'wb') as f:
        pickle.dump(model, f)