def test_cond_prob_normalization_2gram_no_addone(self):
    """Probabilities conditioned on ('el',) must sum to one (beta=0.5, no add-one)."""
    model = BackOffNGram(2, self.sents, beta=0.5, addone=False)
    # 'el' occurs once with a single discounted follower, so the missing
    # mass is beta / count = 0.5 / 1.
    alpha = 1.0 - (1.0 - 0.5) / 1.0
    denom = model.denom(('el',))
    # float comparisons use assertAlmostEqual, not exact equality
    self.assertAlmostEqual(model.alpha(('el',)), alpha)
    probs = {
        ('gato', 'el'): (1.0 - 0.5) / 1.0,
        # back-off to the unigrams (12 tokens in the training data):
        ('el', 'el'): alpha * 1.0 / (12.0 * denom),
        ('come', 'el'): alpha * 2.0 / (12.0 * denom),
        ('pescado', 'el'): alpha * 1.0 / (12.0 * denom),
        ('.', 'el'): alpha * 2.0 / (12.0 * denom),
        ('</s>', 'el'): alpha * 2.0 / (12.0 * denom),
        ('la', 'el'): alpha * 1.0 / (12.0 * denom),
        ('gata', 'el'): alpha * 1.0 / (12.0 * denom),
        ('salmón', 'el'): alpha * 1.0 / (12.0 * denom),
    }
    for (token, prev), p in probs.items():
        self.assertAlmostEqual(model.cond_prob(token, (prev,)), p, msg=(token, prev))
    # the sum is one:
    prob_sum = sum(probs.values())
    self.assertAlmostEqual(prob_sum, 1.0, msg=prob_sum)
def test_count_2gram(self):
    """Unigram and bigram counts do not depend on the smoothing settings."""
    # the same expected counts must hold for every beta / addone combination
    configs = [
        dict(beta=0.5),
        dict(beta=0.5, addone=False),
        dict(beta=0.0),
        dict(beta=0.0, addone=False),
    ]
    expected = {
        (): 12,
        ('el',): 1,
        ('gato',): 1,
        ('come',): 2,
        ('pescado',): 1,
        ('.',): 2,
        ('</s>',): 2,
        ('la',): 1,
        ('gata',): 1,
        ('salmón',): 1,
        ('<s>', 'el'): 1,
        ('el', 'gato'): 1,
        ('gato', 'come'): 1,
        ('come', 'pescado'): 1,
        ('pescado', '.'): 1,
        ('.', '</s>'): 2,
        ('<s>', 'la'): 1,
        ('la', 'gata'): 1,
        ('gata', 'come'): 1,
        ('come', 'salmón'): 1,
        ('salmón', '.'): 1,
    }
    for config in configs:
        lm = BackOffNGram(2, self.sents, **config)
        for ngram, count in expected.items():
            self.assertEqual(lm.count(ngram), count)
def test_cond_prob_normalization_2gram_no_addone(self):
    """Probabilities conditioned on ('el',) must sum to one (beta=0.5, no add-one)."""
    model = BackOffNGram(2, self.sents, beta=0.5, addone=False)
    alpha = 1.0 - (1.0 - 0.5) / 1.0
    denom = model.denom(('el',))
    self.assertAlmostEqual(model.alpha(('el',)), alpha)
    # unigram counts of the back-off candidates; 12 tokens in total
    unigram_counts = {
        'el': 1.0,
        'come': 2.0,
        'pescado': 1.0,
        '.': 2.0,
        '</s>': 2.0,
        'la': 1.0,
        'gata': 1.0,
        'salmón': 1.0,
    }
    # 'gato' was seen after 'el', so its probability is discounted directly;
    # everything else backs off to the unigram model.
    probs = {('gato', 'el'): (1.0 - 0.5) / 1.0}
    for word, count in unigram_counts.items():
        probs[(word, 'el')] = alpha * count / (12.0 * denom)
    for (token, prev), p in probs.items():
        self.assertAlmostEqual(model.cond_prob(token, (prev,)), p, msg=(token, prev))
    # the sum is one:
    prob_sum = sum(probs.values())
    self.assertAlmostEqual(prob_sum, 1.0)
def test_norm_3gram(self):
    """For every two-token context, the trigram probabilities sum to at most one."""
    settings = [(0.0, False), (0.5, False), (0.0, True), (0.5, True)]
    tokens = {
        'el', 'gato', 'come', 'pescado', '.',
        'la', 'gata', 'salmón', '</s>'
    }
    prev_tokens = {
        'el', 'gato', 'come', 'pescado', '.',
        'la', 'gata', 'salmón', '<s>'
    }
    # every reachable two-token context, including sentence starts
    prevs = [('<s>', '<s>')]
    prevs.extend(('<s>', t) for t in prev_tokens)
    prevs.extend((t1, t2) for t1 in prev_tokens for t2 in prev_tokens)
    for beta, addone in settings:
        model = BackOffNGram(3, self.sents, beta=beta, addone=addone)
        for prev in prevs:
            prob_sum = sum(model.cond_prob(token, prev) for token in tokens)
            # prob_sum < 1.0 or almost equal to 1.0:
            self.assertAlmostLessEqual(prob_sum, 1.0, msg=prev)
def test_cond_prob_2gram_no_discount_no_addone(self):
    """With beta=0 and no add-one, cond_prob is plain MLE plus back-off."""
    model = BackOffNGram(2, self.sents, beta=0.0, addone=False)
    cases = [
        ('pescado', 'come', 1.0 / 2.0),
        ('salmón', 'come', 1.0 / 2.0),
        # 'salame' is unseen: back-off to its unigram, which is 0.0
        ('salame', 'come', 0.0),
    ]
    for token, prev, expected in cases:
        self.assertAlmostEqual(model.cond_prob(token, (prev,)), expected, msg=(token))
def test_cond_prob_2gram_no_discount_no_addone(self):
    """With beta=0 and no add-one, cond_prob is plain MLE plus back-off."""
    model = BackOffNGram(2, self.sents, beta=0.0, addone=False)
    probs = {
        ('pescado', 'come'): 1.0 / 2.0,
        ('salmón', 'come'): 1.0 / 2.0,
        ('salame', 'come'): 0.0,  # back-off to the unigram that is 0.0
    }
    for (token, prev), p in probs.items():
        # assertAlmostEqual: probabilities are floats, exact equality is fragile
        self.assertAlmostEqual(model.cond_prob(token, (prev,)), p, msg=(token, prev))
def test_cond_prob_1gram_no_addone(self):
    """A unigram back-off model with no add-one behaves like an unsmoothed n-gram."""
    model = BackOffNGram(1, self.sents, beta=0.5, addone=False)
    expected = [
        ('pescado', 1 / 12.0),
        ('come', 2 / 12.0),
        ('salame', 0.0),  # unseen word
    ]
    for token, p in expected:
        self.assertAlmostEqual(model.cond_prob(token), p, msg=token)
def test_cond_prob_1gram_no_addone(self):
    """A unigram back-off model with no add-one behaves like an unsmoothed n-gram."""
    model = BackOffNGram(1, self.sents, beta=0.5, addone=False)
    probs = {
        'pescado': 1 / 12.0,
        'come': 2 / 12.0,
        'salame': 0.0,  # unseen word
    }
    for token, p in probs.items():
        # assertAlmostEqual: probabilities are floats, exact equality is fragile
        self.assertAlmostEqual(model.cond_prob(token), p, msg=token)
def test_norm_1gram(self):
    """Unigram probabilities over the vocabulary sum to at most one."""
    settings = [(0.0, False), (0.5, False), (0.0, True), (0.5, True)]
    vocab = ['el', 'gato', 'come', 'pescado', '.',
             'la', 'gata', 'salmón', '</s>']
    for beta, addone in settings:
        model = BackOffNGram(1, self.sents, beta=beta, addone=addone)
        prob_sum = sum(model.cond_prob(word) for word in vocab)
        # prob_sum < 1.0 or almost equal to 1.0:
        self.assertAlmostLessEqual(prob_sum, 1.0)
def test_held_out(self):
    """Counts come from the first sentence only; the second is held-out data."""
    model = BackOffNGram(1, self.sents)
    expected = [
        ((), 6),
        (('el',), 1),
        (('gato',), 1),
        (('come',), 1),
        (('pescado',), 1),
        (('.',), 1),
        (('</s>',), 1),
    ]
    for gram, count in expected:
        self.assertEqual(model.count(gram), count, gram)
def test_held_out(self):
    """Counts come from the first sentence only; the second is held-out data."""
    model = BackOffNGram(1, self.sents)
    # total of 6 tokens in the first sentence
    self.assertEqual(model.count(()), 6, ())
    # every word of the first sentence occurs exactly once
    for word in ('el', 'gato', 'come', 'pescado', '.', '</s>'):
        self.assertEqual(model.count((word,)), 1, (word,))
def test_norm_2gram(self):
    """For every one-token context, the bigram probabilities sum to at most one."""
    settings = [(0.0, False), (0.5, False), (0.0, True), (0.5, True)]
    tokens = {'el', 'gato', 'come', 'pescado', '.',
              'la', 'gata', 'salmón', '</s>'}
    prevs = {'el', 'gato', 'come', 'pescado', '.',
             'la', 'gata', 'salmón', '<s>'}
    for beta, addone in settings:
        model = BackOffNGram(2, self.sents, beta=beta, addone=addone)
        for prev in prevs:
            prob_sum = sum(model.cond_prob(token, (prev,))
                           for token in tokens)
            # prob_sum < 1.0 or almost equal to 1.0:
            self.assertAlmostLessEqual(prob_sum, 1.0, msg=prev)
# order of the model
n = int(opts['-n'])
# model type
m = str(opts['-m'])
filename = opts['-o']

# train the model: choose the class matching the requested type
if m == "ngram":
    print("NGram Model selected")
    model = NGram(n, sents)
elif m == "addone":
    print("AddOne NGram Model selected")
    model = AddOneNGram(n, sents)
elif m == "interpolated":
    print("Interpolated NGram Model selected")
    model = InterpolatedNGram(n, sents, addone=True)
elif m == "backoff":
    print("BackOff NGram Model selected")
    model = BackOffNGram(n, sents, addone=True)
else:
    print("Bad Model Type")
    print(help())
    exit()

print("n: %d\nOutput file: %s\n" % (n, filename))

# save it; the context manager closes the file even if pickling fails
with open(filename, 'wb') as f:
    pickle.dump(model, f)
def test_init_2gram(self):
    """Follower sets (A), missing mass (alpha) and normalization (denom) after init."""
    model = BackOffNGram(2, self.sents, beta=0.5)

    # words seen following each one-token context
    followers = {
        '<s>': {'el', 'la'},
        'el': {'gato'},
        'gato': {'come'},
        'come': {'pescado', 'salmón'},
        'pescado': {'.'},
        '.': {'</s>'},
        'la': {'gata'},
        'gata': {'come'},
        'salmón': {'.'},
    }
    for word, Aset in followers.items():
        self.assertEqual(model.A((word,)), Aset, (word,))

    # missing probability mass: |followers| * beta / context count
    alphas = {
        '<s>': 2 * 0.5 / 2,
        'el': 1 * 0.5 / 1,
        'gato': 1 * 0.5 / 1,
        'come': 2 * 0.5 / 2,
        'pescado': 1 * 0.5 / 1,
        '.': 1 * 0.5 / 2,
        'la': 1 * 0.5 / 1,
        'gata': 1 * 0.5 / 1,
        'salmón': 1 * 0.5 / 1,
    }
    for word, a in alphas.items():
        self.assertAlmostEqual(model.alpha((word,)), a, msg=(word,))

    # normalization factor: one minus the unigram probs of the seen followers
    for word, Aset in followers.items():
        expected = 1.0 - sum(model.cond_prob(w) for w in Aset)
        self.assertAlmostEqual(model.denom((word,)), expected, msg=(word,))
def test_init_2gram(self):
    """Follower sets (A), missing mass (alpha) and normalization (denom) after init."""
    model = BackOffNGram(2, self.sents, beta=0.5)
    A = {
        ('<s>',): {'el', 'la'},
        ('el',): {'gato'},
        ('gato',): {'come'},
        ('come',): {'pescado', 'salmón'},
        ('pescado',): {'.'},
        ('.',): {'</s>'},
        ('la',): {'gata'},
        ('gata',): {'come'},
        ('salmón',): {'.'},
    }
    for tokens, Aset in A.items():
        self.assertEqual(model.A(tokens), Aset, tokens)

    # missing probability mass
    alpha = {
        ('<s>',): 2 * 0.5 / 2,
        ('el',): 1 * 0.5 / 1,
        ('gato',): 1 * 0.5 / 1,
        ('come',): 2 * 0.5 / 2,
        ('pescado',): 1 * 0.5 / 1,
        ('.',): 1 * 0.5 / 2,
        ('la',): 1 * 0.5 / 1,
        ('gata',): 1 * 0.5 / 1,
        ('salmón',): 1 * 0.5 / 1,
    }
    for tokens, a in alpha.items():
        # assertAlmostEqual: these are float values, exact equality is fragile
        self.assertAlmostEqual(model.alpha(tokens), a, msg=tokens)

    # normalization factor
    denom = {
        ('<s>',): 1.0 - model.cond_prob('el') - model.cond_prob('la'),
        ('el',): 1.0 - model.cond_prob('gato'),
        ('gato',): 1.0 - model.cond_prob('come'),
        ('come',): 1.0 - model.cond_prob('pescado') - model.cond_prob('salmón'),
        ('pescado',): 1.0 - model.cond_prob('.'),
        ('.',): 1.0 - model.cond_prob('</s>'),
        ('la',): 1.0 - model.cond_prob('gata'),
        ('gata',): 1.0 - model.cond_prob('come'),
        ('salmón',): 1.0 - model.cond_prob('.'),
    }
    for tokens, d in denom.items():
        self.assertAlmostEqual(model.denom(tokens), d, msg=tokens)
sent_tokenizer=sent_tokenizer) # sents will be a tokens' list of the corpus sents = corpus.sents() # train the model type_model = opts['-m'] n = int(opts['-n']) if type_model == 'ngram': model = NGram(n, sents) print(str(n) + '-gram will be ready') elif type_model == 'addone': model = AddOneNGram(n, sents) print(str(n) + '-addone will be ready') elif type_model == 'interpolated': model = InterpolatedNGram(n, sents) print(str(n) + '-interpolated will be ready') elif type_model == 'backoff': model = BackOffNGram(n, sents) print(str(n) + '-backoff will be ready') else: print('modelo erroneo') exit(0) # save it filename = opts['-o'] f = open(filename, 'wb') # to load a object pickle.load(file) # dump save the object in bytes pickle.dump(model, f) f.close()
import pickle

from docopt import docopt

from corpus.ancora import AncoraCorpusReader
from languagemodeling.ngram import BackOffNGram


if __name__ == '__main__':
    opts = docopt(__doc__)

    n = int(opts['-n'])
    # NOTE(review): hard-coded corpus location — consider taking it as an option
    path = '/home/alangb/Escritorio/ancora-3.0.1es/'
    corpus = AncoraCorpusReader(path)
    sents = list(corpus.sents())

    # split words joined with "_" (underscore); str.split('_') returns the
    # word unchanged (as a one-element list) when no underscore is present,
    # so no special case is needed
    checked_sents = [
        [part for word in sent for part in word.split('_')]
        for sent in sents
    ]

    # build model
    model = BackOffNGram(n, checked_sents)

    # save it; the context manager closes the file even if pickling fails
    filename = opts['-o']
    with open(filename, 'wb') as f:
        pickle.dump(model, f)