def test_most_freq_word_end(self): corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) five = NGramTrie(5, encoded) trie = NGramTrie(3, encoded) four = NGramTrie(4, encoded) expected_word = storage.get_id('<END>') context = ( storage.get_id('his'), storage.get_id('name'), storage.get_id('is'), storage.get_id('bruno'), ) generator = BackOffGenerator(storage, five, trie, four) actual = generator.most_freq_word(context) self.assertEqual(expected_word, actual)
def test_load_model_takes_less_time(self): with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file: big_data = big_file.read() tokenized_data = tokenize_by_sentence(big_data) storage = WordStorage() storage.update(tokenized_data) context = ( storage.get_id('despite'), storage.get_id('the'), ) start_time_generate = time() encoded = encode_text(storage, tokenized_data) trie = NGramTrie(3, encoded) generator = NGramTextGenerator(storage, trie) generated_text = generator.generate_text(context, 3) end_time_generate = time() - start_time_generate save_model(generator, 'model_training') start_time_saved = time() loaded_model = load_model('model_training') new_result = loaded_model.generate_text(context, 3) end_time_saved = time() - start_time_saved self.assertGreater(end_time_generate, end_time_saved) self.assertEqual(generated_text, new_result)
def test_end(self): """ Checks that after decoding no end in result """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'cat', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = ( storage.get_id('a'), storage.get_id('cat'), ) generator = LikelihoodBasedTextGenerator(storage, trie) to_decode = generator.generate_text(context, 1) actual = decode_text(storage, to_decode) expected = ('A cat', ) self.assertEqual(expected, actual)
def test_decode_text_ideal(self): corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'), ) print('Я ТЕСТ', context) end = storage.get_id('<END>') generator = LikelihoodBasedTextGenerator(storage, trie) to_decode = generator.generate_text(context, 2) self.assertEqual(to_decode[-1], end) expected = ('Name is rex', 'Her name is rex') actual = decode_text(storage, to_decode) self.assertEqual(expected, actual)
def test_decode_text_incorrect_storage(self): corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'), ) generator = LikelihoodBasedTextGenerator(storage, trie) to_decode = generator.generate_text(context, 2) bad_inputs = [(), [], 123, None, NGramTrie] for bad_storage in bad_inputs: self.assertRaises(ValueError, decode_text, bad_storage, to_decode)
def test_generate_text_large_context(self): """ should generate simple case with three sentences out of small corpus """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(5, encoded) generator = NGramTextGenerator(storage, trie) context = ( storage.get_id('i'), storage.get_id('have'), storage.get_id('a'), storage.get_id('bruno'), ) end = storage.get_id('<END>') actual = generator.generate_text(context, 3) self.assertEqual(actual.count(end), 3)
def test_decode_text_ideal_conditions(self): corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'), ) generator = LikelihoodBasedTextGenerator(storage, trie) to_decode = generator.generate_text(context, 2) actual = decode_text(storage, to_decode) for sentence in actual: self.assertTrue('<END>' not in sentence) self.assertTrue(sentence[0].isupper()) self.assertTrue(sentence[-1].isalpha())
def test_generate_next_word_context_incorrect(self): corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) four = NGramTrie(4, encoded) trie = NGramTrie(3, encoded) two = NGramTrie(2, encoded) expected_word = storage.get_id('rex') context = (storage.get_id('name'), storage.get_id('is'), storage.get_id('cat')) generator = BackOffGenerator(storage, four, two, trie) actual = generator._generate_next_word(context) self.assertEqual(expected_word, actual)
def test_text_generator_generate_sentence_proper_beginning(self): """ Checks that class creates correct sentence from a context '<END>' without '<END>' in the beginning """ corpus = ('my', 'favourite', 'subject', 'is', 'maths', '<END>', 'his', 'favourite', 'thing', 'is', 'music' '<END>', 'i', 'have', 'a', 'favourite', 'film', '<END>', 'my', 'family', 'likes', 'avatar', '<END>', 'my', 'favourite', 'subject', 'is', 'music', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(2, encoded) context = (storage.get_id('<END>'), ) first_generated = storage.get_id('my') last_generated = storage.get_id('<END>') generator = NGramTextGenerator(storage, trie) actual = generator._generate_sentence(context) self.assertNotEqual(storage.get_id('<END>'), actual[0]) self.assertEqual(first_generated, actual[0]) self.assertEqual(last_generated, actual[-1])
def test_text_generator_no_context(self): """ checks if the program can generate sentences without given context """ corpus = ('cat', 'has', 'paws', '<END>', 'dogs', 'have', 'noses', '<END>', 'cat', 'has', 'whiskers', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) two = NGramTrie(2, encoded) four = NGramTrie(4, encoded) context = ( storage.get_id('cat'), storage.get_id('dogs'), ) generator = BackOffGenerator(storage, trie, two, four) actual = generator.generate_text(context, 3) self.assertTrue(all(actual))
def main(): text = ('I have a cat. His name is Bruno. ' 'I have a dog too. His name is Rex. ' 'Her name is Rex too.') corpus = tokenize_by_sentence(text) storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) two = NGramTrie(2, encoded) trie = NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'), ) generator = BackOffGenerator(storage, trie, two) expected = 'rex' actual = storage.get_word(generator._generate_next_word(context)) print(f'TEXT:\n{text}') print(f'\nEXPECTED WORD AFTER name is IS {expected}') print(f'ACTUAL WORD AFTER name is IS {actual}') save_model(generator, 'model.txt') load_model('model.txt') return actual == expected
def realize_n_gram_text_generator(text): n_gram_storage = WordStorage() n_gram_storage.update(text) n_gram_context = (n_gram_storage.get_id('my'), n_gram_storage.get_id('dear')) n_gram_encoded = encode_text(n_gram_storage, text) n_gram_trie = NGramTrie(3, n_gram_encoded) n_gram_generator = NGramTextGenerator(n_gram_storage, n_gram_trie) n_gram_text_generated = n_gram_generator.generate_text(n_gram_context, 3) return decode_text(n_gram_storage, n_gram_text_generated)
def realize_likelihood_generator(text): likelihood_storage = WordStorage() likelihood_storage.update(text) context = (likelihood_storage.get_id('i'), likelihood_storage.get_id('shall'),) model = load_model('lab_4/likelihood_model.json') generator = LikelihoodBasedTextGenerator(model.word_storage, model.n_gram_trie) likelihood_text_generated = generator.generate_text(context, 3) return decode_text(likelihood_storage, likelihood_text_generated)
def realize_backoff_generator(text): backoff_storage = WordStorage() backoff_storage.update(text) backoff_encoded = encode_text(backoff_storage, text) two = NGramTrie(2, backoff_encoded) trie = NGramTrie(3, backoff_encoded) backoff_context = (backoff_storage.get_id('if'), backoff_storage.get_id('you'),) backoff_generator = BackOffGenerator(backoff_storage, trie, two) backoff_text_generated = backoff_generator.generate_text(backoff_context, 3) return decode_text(backoff_storage, backoff_text_generated)
def test_ngram_text_generator_identical_words(self): corpus = ('deadline', 'deadline', 'deadline', 'deadline', 'deadline', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = (storage.get_id('deadline'), storage.get_id('deadline')) generator = NGramTextGenerator(storage, trie) actual = generator._generate_sentence(context) self.assertEqual(20 + len(context) + 1, len(actual)) # +1 it is for <END>
def test_ngram_text_generator_generate_next_word(self): """ Checks that next word generates properly """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno', 'cat', '<END>') word_storage = WordStorage() word_storage.update(corpus) encoded = encode_text(word_storage, corpus) ngram = NGramTrie(3, encoded) generator = NGramTextGenerator(word_storage, ngram) context = (word_storage.get_id('i'), word_storage.get_id('have')) expected = word_storage.get_id('a') actual = generator._generate_next_word(context) self.assertEqual(expected, actual)
def test_ngram_text_generator_generate_sentence_properly(self): """ generates correct output according to simple case """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>') word_storage = WordStorage() word_storage.update(corpus) encoded = encode_text(word_storage, corpus) trie = NGramTrie(2, encoded) context = (word_storage.get_id('i'), ) end = word_storage.get_id('<END>') generator = NGramTextGenerator(word_storage, trie) actual = generator._generate_sentence(context) self.assertEqual(actual[-1], end)
def test_get_most_frequent_gram_ideal(self): """ Checks that most frequent ngram gets properly """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'i', 'have', 'a', 'bruno', 'cat', '<END>') word_storage = WordStorage() word_storage.update(corpus) encoded = encode_text(word_storage, corpus) ngram = NGramTrie(3, encoded) generator = NGramTextGenerator(word_storage, ngram) context = (word_storage.get_id('i'), word_storage.get_id('have')) expected = (word_storage.get_id('i'), word_storage.get_id('have'), word_storage.get_id('a')) actual = generator.get_most_frequent_gram(context) self.assertEqual(expected, actual)
def test_word_storage_get_id_ideal(self): """ ideal case for get_id """ word_storage = WordStorage() word_storage.storage = {'word': 1} expected = 1 actual = word_storage.get_id('word') self.assertEqual(expected, actual)
def test_decode_text_upper_first_letter(self): ''' Tests that number all the letters except first one in a sentence are in a lower case ''' corpus = ('first', 'sentence', 'here', '<END>', 'second', 'sentence', 'here', '<END>', 'third', 'sentence', 'here', '<END>') storage = WordStorage() storage.update(corpus) encoded_text = encode_text(storage, corpus) trie = NGramTrie(3, encoded_text) context = (storage.get_id('first'), storage.get_id('sentence')) likelihood_generator = LikelihoodBasedTextGenerator(storage, trie) generated_encoded_text = likelihood_generator.generate_text(context, 1) decoded_text = decode_text(storage, generated_encoded_text) self.assertFalse(decoded_text[0][1:].isupper())
def test_generate_next_word_short_context(self): corpus = ('bye', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) four = NGramTrie(4, encoded) trie = NGramTrie(3, encoded) two = NGramTrie(2, encoded) expected_word = storage.get_id('bye') context = (storage.get_id('<END>'),) generator = BackOffGenerator(storage, two, four, trie) actual = generator._generate_next_word(context) self.assertEqual(expected_word, actual)
def test_get_most_frequent_gram_no_such_context(self): """ Checks that returns empty tuple with no context in the corpus """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>') word_storage = WordStorage() word_storage.update(corpus) encoded = encode_text(word_storage, corpus) ngram = NGramTrie(3, encoded) generator = NGramTextGenerator(word_storage, ngram) context = ( word_storage.get_id('i'), word_storage.get_id('name'), ) # there is no such context in ngrams expected = () actual = generator.get_most_frequent_gram(context) self.assertEqual(expected, actual)
def test_word_storage_put_word_ideal(self): """ word is added to storage """ word_storage = WordStorage() word = 'word' actual = word_storage._put_word(word) self.assertTrue(word in word_storage.storage) expected = word_storage.get_id(word) self.assertEqual(expected, actual)
def test_ngram_text_generator_generate_next_word_no_such_context(self): """ Checks that next word generates properly if no context found """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>') word_storage = WordStorage() word_storage.update(corpus) encoded = encode_text(word_storage, corpus) ngram = NGramTrie(3, encoded) generator = NGramTextGenerator(word_storage, ngram) context = ( word_storage.get_id('i'), word_storage.get_id('name'), ) # there is no such context in ngrams, so return most frequent option expected_top_freq = word_storage.get_id('<END>') # as it appears twice actual = generator._generate_next_word(context) self.assertEqual(expected_top_freq, actual)
def test_text_generator_generate_sentence_proper_number_of_end(self): """ Checks that class creates correct sentence with only one <END> """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'there', 'are', 'a', 'cat', 'outside', '<END>', 'here', 'is', 'a', 'cat', 'outside', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = (storage.get_id('a'), storage.get_id('is'), storage.get_id('<END>')) generator = NGramTextGenerator(storage, trie) actual = generator._generate_sentence(context) self.assertEqual(1, actual.count(storage.get_id('<END>')))
def test_ngram_text_generator_generate_sentence_ideal(self): """ first and last generated words as expected """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(2, encoded) context = (storage.get_id('i'), ) first_generated = storage.get_id('have') last_generated = storage.get_id('<END>') generator = NGramTextGenerator(storage, trie) actual = generator._generate_sentence(context) self.assertEqual(actual[1], first_generated) self.assertEqual(actual[-1], last_generated)
def test_float_result(self): """ Checks that returned result is float """ corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(2, encoded) context = (storage.get_id('i'), ) word = storage.get_id('have') generator = LikelihoodBasedTextGenerator(storage, trie) actual = generator._calculate_maximum_likelihood(word, context) self.assertEqual(float, type(actual))
def test_generate_next_word_larger_context(self): corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(4, encoded) expected_word = storage.get_id('bruno') context = (storage.get_id('his'), storage.get_id('name'), storage.get_id('is')) generator = LikelihoodBasedTextGenerator(storage, trie) actual = generator._generate_next_word(context) self.assertEqual(expected_word, actual)
def test_ngram_text_generator_duplicates_words(self): corpus = ('stop', 'it', 'stop', 'it', 'stop', 'it', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(2, encoded) context = (storage.get_id('stop'), ) generator = NGramTextGenerator(storage, trie) actual = generator._generate_sentence(context) self.assertEqual(20 + len(context) + 1, len(actual))
def test_ngram_text_generator_end_at_the_beginning(self): """" should generate a sentence without <END> in any other position except the end of the sentence """ corpus = ('i', 'like', 'to', 'read', '<END>', 'he', 'likes', 'to', 'read', 'too', 'i', 'like', 'a', 'book', 'called', '"Harry Potter"', '<END>', 'he', 'likes', 'another', 'book', '<END>', 'he', 'does', 'not', 'tell', 'me', 'name', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(2, encoded) context = (storage.get_id('<END>'), ) last_generated = storage.get_id('<END>') generator = NGramTextGenerator(storage, trie) actual = generator._generate_sentence(context) self.assertEqual(last_generated, actual[-1]) self.assertEqual(1, actual.count(storage.get_id('<END>')))