def test_generate_next_word_complex(self):
    """
    Check the word generated after the two-word context ('name', 'is').

    In the corpus 'name is' is followed by 'bruno' once and by 'rex'
    twice; the test expects the generator to return the id of 'rex'.
    """
    corpus = (
        'i', 'have', 'a', 'cat', '<END>',
        'his', 'name', 'is', 'bruno', '<END>',
        'i', 'have', 'a', 'dog', 'too', '<END>',
        'his', 'name', 'is', 'rex', '<END>',
        'her', 'name', 'is', 'rex', 'too', '<END>',
    )

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    # Tries are built and handed to the generator in this order: 3, 2, 4.
    tries = [NGramTrie(size, encoded_corpus) for size in (3, 2, 4)]

    rex_id = word_storage.get_id('rex')
    name_is = tuple(word_storage.get_id(word) for word in ('name', 'is'))

    back_off = BackOffGenerator(word_storage, *tries)
    predicted_id = back_off._generate_next_word(name_is)

    self.assertEqual(rex_id, predicted_id)
def main():
    """
    Run a small end-to-end demo of BackOffGenerator.

    Tokenizes and encodes a fixed text, builds 2-gram and 3-gram tries,
    generates the word following the context ('name', 'is'), prints the
    expected and actual words, saves the generator to 'model.txt' and
    loads it back (the loaded object is not used).

    Returns:
        True if the generated word equals 'rex', False otherwise.
    """
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')

    tokens = tokenize_by_sentence(text)
    word_storage = WordStorage()
    word_storage.update(tokens)
    encoded_tokens = encode_text(word_storage, tokens)

    bigram_trie = NGramTrie(2, encoded_tokens)
    trigram_trie = NGramTrie(3, encoded_tokens)

    name_is = tuple(word_storage.get_id(word) for word in ('name', 'is'))
    back_off = BackOffGenerator(word_storage, trigram_trie, bigram_trie)

    expected = 'rex'
    next_word_id = back_off._generate_next_word(name_is)
    actual = word_storage.get_word(next_word_id)

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    # Round-trip the model through disk; the reloaded model is discarded.
    model_path = 'model.txt'
    save_model(back_off, model_path)
    load_model(model_path)

    return actual == expected
def test_generate_next_word_short_context(self):
    """
    Check generation from a one-word context on a two-token corpus.

    The corpus is ('bye', '<END>') and the context is ('<END>',); the
    test expects the generator to return the id of 'bye'.
    """
    corpus = ('bye', '<END>')

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    # Built in the order 4, 3, 2; passed to the generator as 2, 4, 3.
    tries = {size: NGramTrie(size, encoded_corpus) for size in (4, 3, 2)}

    bye_id = word_storage.get_id('bye')
    end_only = (word_storage.get_id('<END>'),)

    back_off = BackOffGenerator(word_storage, tries[2], tries[4], tries[3])
    predicted_id = back_off._generate_next_word(end_only)

    self.assertEqual(bye_id, predicted_id)
def test_generate_next_word_no_context(self):
    """
    Check generation when the context never has a continuation.

    The context is ('bye',), and 'bye' is the final token of the corpus,
    so no token follows it anywhere in the corpus. The test expects the
    generator to return the id of '<END>'.
    NOTE(review): presumably the generator falls back to the most
    frequent token ('<END>' occurs four times here) - confirm against
    the BackOffGenerator implementation.
    """
    # 'watch' and 'with' are separate tokens. Without the comma between
    # them Python joins adjacent string literals into 'watchwith'.
    corpus = ('i', 'watch', 'a', 'horror', 'movie', '<END>',
              'would', 'you', 'like', 'to', 'watch', 'with', 'me', '<END>',
              'i', 'do', 'not', 'like', 'such', 'films', '<END>',
              'i', 'like', 'to', 'watch', 'drama', 'movies', '<END>',
              'bye')

    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)

    four = NGramTrie(4, encoded)
    trie = NGramTrie(3, encoded)
    two = NGramTrie(2, encoded)

    expected_word = storage.get_id('<END>')
    context = (storage.get_id('bye'),)

    generator = BackOffGenerator(storage, two, four, trie)
    actual = generator._generate_next_word(context)

    self.assertEqual(expected_word, actual)