def test_generate_next_word_context_incorrect(self):
    """The back-off generator should still predict 'rex' for a partly wrong context."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trie_four = NGramTrie(4, encoded_text)
    trie_three = NGramTrie(3, encoded_text)
    trie_two = NGramTrie(2, encoded_text)
    expected_word = word_storage.get_id('rex')
    context = (word_storage.get_id('name'),
               word_storage.get_id('is'),
               word_storage.get_id('cat'))
    generator = BackOffGenerator(word_storage, trie_four, trie_two, trie_three)
    self.assertEqual(expected_word, generator._generate_next_word(context))
def test_generate_text_large_context(self):
    """
    Three sentences requested from a small corpus with a 4-word context
    should produce exactly three '<END>' markers.
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    five_gram_trie = NGramTrie(5, encoded_text)
    generator = NGramTextGenerator(word_storage, five_gram_trie)
    context = (word_storage.get_id('i'),
               word_storage.get_id('have'),
               word_storage.get_id('a'),
               word_storage.get_id('bruno'))
    end_id = word_storage.get_id('<END>')
    generated = generator.generate_text(context, 3)
    self.assertEqual(generated.count(end_id), 3)
def test_decode_text_ideal(self):
    """
    decode_text should turn an encoded generation ending in <END> into
    capitalized, marker-free sentences.
    """
    # Fix: removed a leftover debug statement (print('Я ТЕСТ', context))
    # that polluted test output.
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(3, encoded)
    context = (
        storage.get_id('name'),
        storage.get_id('is'),
    )
    end = storage.get_id('<END>')
    generator = LikelihoodBasedTextGenerator(storage, trie)
    to_decode = generator.generate_text(context, 2)
    # The generated sequence must terminate with the sentence marker.
    self.assertEqual(to_decode[-1], end)
    expected = ('Name is rex', 'Her name is rex')
    actual = decode_text(storage, to_decode)
    self.assertEqual(expected, actual)
def test_text_generator_generate_sentence_proper_beginning(self):
    """
    Checks that class creates correct sentence from a context '<END>'
    without '<END>' in the beginning.
    """
    # Fix: the original corpus had "'music' '<END>'" (a missing comma), which
    # Python's implicit string-literal concatenation silently fused into the
    # single bogus token 'music<END>', corrupting the vocabulary.
    corpus = ('my', 'favourite', 'subject', 'is', 'maths', '<END>',
              'his', 'favourite', 'thing', 'is', 'music', '<END>',
              'i', 'have', 'a', 'favourite', 'film', '<END>',
              'my', 'family', 'likes', 'avatar', '<END>',
              'my', 'favourite', 'subject', 'is', 'music', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    context = (storage.get_id('<END>'), )
    first_generated = storage.get_id('my')
    last_generated = storage.get_id('<END>')
    generator = NGramTextGenerator(storage, trie)
    actual = generator._generate_sentence(context)
    # The sentence must not start with the end-of-sentence marker …
    self.assertNotEqual(storage.get_id('<END>'), actual[0])
    # … it must start with the most frequent sentence opener and end with <END>.
    self.assertEqual(first_generated, actual[0])
    self.assertEqual(last_generated, actual[-1])
def test_decode_text_incorrect_storage(self):
    """decode_text must raise ValueError whenever storage is not a WordStorage."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trigram_trie = NGramTrie(3, encoded_text)
    context = (word_storage.get_id('name'), word_storage.get_id('is'))
    generator = LikelihoodBasedTextGenerator(word_storage, trigram_trie)
    sequence = generator.generate_text(context, 2)
    for bad_storage in ((), [], 123, None, NGramTrie):
        self.assertRaises(ValueError, decode_text, bad_storage, sequence)
def test_end(self):
    """
    Checks that after decoding no end in result
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'cat', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trigram_trie = NGramTrie(3, encoded_text)
    context = (word_storage.get_id('a'), word_storage.get_id('cat'))
    generator = LikelihoodBasedTextGenerator(word_storage, trigram_trie)
    sequence = generator.generate_text(context, 1)
    # The decoded sentence is capitalized and carries no '<END>' marker.
    self.assertEqual(('A cat', ), decode_text(word_storage, sequence))
def test_most_freq_word_incorrect_context(self):
    """most_freq_word must reject contexts that are not valid id tuples."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trie_three = NGramTrie(3, encoded_text)
    trie_two = NGramTrie(2, encoded_text)
    trie_four = NGramTrie(4, encoded_text)
    generator = BackOffGenerator(word_storage, trie_three, trie_two, trie_four)
    # Wrong types plus a tuple of ids that do not exist in the storage.
    for bad_context in ([], {}, (2000, 1000,), None, 9, 9.34, True):
        self.assertRaises(ValueError, generator.most_freq_word, bad_context)
def test_decode_text_ideal_conditions(self):
    """Every decoded sentence is capitalized, alphabetic-final and <END>-free."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trigram_trie = NGramTrie(3, encoded_text)
    context = (word_storage.get_id('name'), word_storage.get_id('is'))
    generator = LikelihoodBasedTextGenerator(word_storage, trigram_trie)
    sequence = generator.generate_text(context, 2)
    for sentence in decode_text(word_storage, sequence):
        self.assertTrue('<END>' not in sentence)
        self.assertTrue(sentence[0].isupper())
        self.assertTrue(sentence[-1].isalpha())
def test_text_generator_no_context(self):
    """
    Checks that sentences are still generated when the given context
    never occurs in the corpus.
    """
    corpus = ('cat', 'has', 'paws', '<END>',
              'dogs', 'have', 'noses', '<END>',
              'cat', 'has', 'whiskers', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trie_three = NGramTrie(3, encoded_text)
    trie_two = NGramTrie(2, encoded_text)
    trie_four = NGramTrie(4, encoded_text)
    # 'cat' and 'dogs' never appear next to each other in the corpus.
    context = (word_storage.get_id('cat'), word_storage.get_id('dogs'))
    generator = BackOffGenerator(word_storage, trie_three, trie_two, trie_four)
    generated = generator.generate_text(context, 3)
    self.assertTrue(all(generated))
def main():
    """Demo: predict the word following 'name is' with a back-off generator."""
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')
    corpus = tokenize_by_sentence(text)
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    bigram_trie = NGramTrie(2, encoded_text)
    trigram_trie = NGramTrie(3, encoded_text)
    context = (word_storage.get_id('name'), word_storage.get_id('is'))
    generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)
    expected = 'rex'
    actual = word_storage.get_word(generator._generate_next_word(context))
    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')
    # Round-trip the model through disk as part of the demo.
    save_model(generator, 'model.txt')
    load_model('model.txt')
    return actual == expected
def test_load_model_ideal(self):
    """A model saved and re-loaded must be equivalent to the original one."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    bigram_trie = NGramTrie(2, encoded_text)
    generator = NGramTextGenerator(word_storage, bigram_trie)
    save_model(generator, 'my_awesome_model')
    loaded = load_model('my_awesome_model')
    # Trie contents must survive the round trip.
    self.assertEqual(generator._n_gram_trie.n_grams, loaded._n_gram_trie.n_grams)
    saved_freqs = generator._n_gram_trie.n_gram_frequencies
    loaded_freqs = loaded._n_gram_trie.n_gram_frequencies
    self.assertEqual(len(saved_freqs), len(loaded_freqs))
    for ngram, frequency in saved_freqs.items():
        self.assertTrue(ngram in loaded_freqs)
        self.assertEqual(frequency, loaded_freqs[ngram])
    # Word storage must survive the round trip too.
    saved_words = generator._word_storage.storage
    loaded_words = loaded._word_storage.storage
    self.assertEqual(len(saved_words), len(loaded_words))
    for word, word_id in saved_words.items():
        self.assertTrue(word in loaded_words)
        self.assertEqual(word_id, loaded_words[word])
def test_load_model_takes_less_time(self):
    """Generating with a re-loaded model must be faster than training from scratch."""
    with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
        big_data = big_file.read()
    tokenized = tokenize_by_sentence(big_data)
    word_storage = WordStorage()
    word_storage.update(tokenized)
    context = (word_storage.get_id('despite'), word_storage.get_id('the'))
    # Time the full pipeline: encode, build trie, generate.
    train_start = time()
    encoded_text = encode_text(word_storage, tokenized)
    trigram_trie = NGramTrie(3, encoded_text)
    generator = NGramTextGenerator(word_storage, trigram_trie)
    generated_text = generator.generate_text(context, 3)
    train_elapsed = time() - train_start
    save_model(generator, 'model_training')
    # Time only load-and-generate for the persisted model.
    load_start = time()
    loaded = load_model('model_training')
    loaded_result = loaded.generate_text(context, 3)
    load_elapsed = time() - load_start
    self.assertGreater(train_elapsed, load_elapsed)
    self.assertEqual(generated_text, loaded_result)
def test_most_freq_word_end(self):
    """most_freq_word should yield '<END>' after a complete-sentence context."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trie_five = NGramTrie(5, encoded_text)
    trie_three = NGramTrie(3, encoded_text)
    trie_four = NGramTrie(4, encoded_text)
    expected_word = word_storage.get_id('<END>')
    # The full sentence "his name is bruno" is followed only by '<END>'.
    context = (word_storage.get_id('his'),
               word_storage.get_id('name'),
               word_storage.get_id('is'),
               word_storage.get_id('bruno'))
    generator = BackOffGenerator(word_storage, trie_five, trie_three, trie_four)
    self.assertEqual(expected_word, generator.most_freq_word(context))
def test_word_storage_update_calls_required_function(self, mock):
    """
    ideal case: update() must delegate to the mocked _put_word method
    """
    word_storage = WordStorage()
    word_storage.update(('i', 'have', 'a', 'cat', '<END>'))
    self.assertTrue(mock.called)
def test_word_storage_update_empty(self):
    """
    ideal case: updating with an empty sequence leaves the storage empty
    """
    word_storage = WordStorage()
    word_storage.update(())
    self.assertEqual(word_storage.storage, {})
def test_word_storage_update_duplicates(self):
    """
    ideal case: duplicated words are stored only once
    """
    word_storage = WordStorage()
    # Five distinct tokens, each appearing twice.
    word_storage.update(('i', 'have', 'a', 'cat', '<END>',
                         'i', 'have', 'a', 'cat', '<END>'))
    self.assertEqual(len(word_storage.storage), 5)
def realize_n_gram_text_generator(text):
    """Build a trigram generator over *text* and decode three generated sentences."""
    storage = WordStorage()
    storage.update(text)
    context = (storage.get_id('my'), storage.get_id('dear'))
    encoded = encode_text(storage, text)
    trigram_trie = NGramTrie(3, encoded)
    generator = NGramTextGenerator(storage, trigram_trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_word_storage_update_ideal(self):
    """
    ideal case: all nine distinct tokens end up in the storage
    """
    word_storage = WordStorage()
    word_storage.update(('i', 'have', 'a', 'cat', '<END>',
                         'his', 'name', 'is', 'bruno', '<END>'))
    self.assertEqual(len(word_storage.storage), 9)
def test_empty_to_decode(self):
    """decode_text must raise ValueError for an empty encoded sequence."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    self.assertRaises(ValueError, decode_text, word_storage, ())
def realize_likelihood_generator(text):
    """Generate three sentences with a likelihood model loaded from disk."""
    storage = WordStorage()
    storage.update(text)
    context = (storage.get_id('i'), storage.get_id('shall'))
    # The generator is rebuilt from a persisted model rather than trained here.
    model = load_model('lab_4/likelihood_model.json')
    generator = LikelihoodBasedTextGenerator(model.word_storage, model.n_gram_trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_decode_text_empty_sentence(self):
    """
    Tests that decode_corpus function can handle empty sentence input
    """
    word_storage = WordStorage()
    empty_corpus = ()
    word_storage.update(empty_corpus)
    self.assertEqual((), decode_text(word_storage, empty_corpus))
def realize_backoff_generator(text):
    """Generate three sentences with a trigram/bigram back-off generator."""
    storage = WordStorage()
    storage.update(text)
    encoded = encode_text(storage, text)
    bigram_trie = NGramTrie(2, encoded)
    trigram_trie = NGramTrie(3, encoded)
    context = (storage.get_id('if'), storage.get_id('you'))
    generator = BackOffGenerator(storage, trigram_trie, bigram_trie)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
def test_decode_text_incorrect_sentences(self):
    """decode_text must raise ValueError for non-tuple encoded input."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    for bad_decode in ([], 123, None, NGramTrie):
        self.assertRaises(ValueError, decode_text, word_storage, bad_decode)
def test_ngram_text_generator_duplicates_words(self):
    """A looping bigram corpus must still yield the expected sentence length."""
    corpus = ('stop', 'it', 'stop', 'it', 'stop', 'it', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    bigram_trie = NGramTrie(2, encoded_text)
    context = (word_storage.get_id('stop'), )
    generator = NGramTextGenerator(word_storage, bigram_trie)
    sentence = generator._generate_sentence(context)
    # Expected length: context + 20 generated words + trailing '<END>'.
    self.assertEqual(20 + len(context) + 1, len(sentence))
def test_save_model_incorrect_path(self):
    """save_model must raise FileNotFoundError for a non-existent directory."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    bigram_trie = NGramTrie(2, encoded_text)
    generator = NGramTextGenerator(word_storage, bigram_trie)
    self.assertRaises(FileNotFoundError, save_model, generator,
                      r'some_folder/some_file')
def test_decode_text_incorrect_numbers_to_decode(self):
    """
    the program raises ValueError if numbers to decode are negative or float.
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    for bad_sequence in ((-1, 3), (3.3, 3, 4, 5)):
        self.assertRaises(ValueError, decode_text, word_storage, bad_sequence)
def test_context_end(self):
    """
    checks if <END> is in the context
    """
    # Raw word strings (not ids) with '<END>' form an invalid context.
    bad_context = ('cat', '<END>')
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trigram_trie = NGramTrie(3, encoded_text)
    generator = NGramTextGenerator(word_storage, trigram_trie)
    self.assertRaises(ValueError, generator._generate_sentence, bad_context)
def test_ngram_text_generator_identical_words(self):
    """A corpus of one repeated word must still yield the expected sentence length."""
    corpus = ('deadline', 'deadline', 'deadline', 'deadline', 'deadline', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trigram_trie = NGramTrie(3, encoded_text)
    context = (word_storage.get_id('deadline'), word_storage.get_id('deadline'))
    generator = NGramTextGenerator(word_storage, trigram_trie)
    sentence = generator._generate_sentence(context)
    # Expected length: context + 20 generated words + 1 for '<END>'.
    self.assertEqual(20 + len(context) + 1, len(sentence))
def test_encode_text_same_words_count(self):
    """
    Tests that encode_text function can assign correct id to the same words
    """
    word_storage = WordStorage()
    # The corpus is the same five-token sentence repeated twice.
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'i', 'have', 'a', 'cat', '<END>')
    word_storage.update(corpus)
    encoded = encode_text(word_storage, corpus)
    self.assertEqual(encoded[:5], encoded[5:])
def test_ngram_text_generator_generate_next_word(self):
    """
    Checks that next word generates properly
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'i', 'have', 'a', 'bruno', 'cat', '<END>')
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_text = encode_text(word_storage, corpus)
    trigram_trie = NGramTrie(3, encoded_text)
    generator = NGramTextGenerator(word_storage, trigram_trie)
    context = (word_storage.get_id('i'), word_storage.get_id('have'))
    # 'a' follows 'i have' in both sentences, so it must be predicted.
    expected = word_storage.get_id('a')
    self.assertEqual(expected, generator._generate_next_word(context))