def main():
    """Demo: predict the word that follows 'name is' with a back-off generator.

    Builds a word storage and two n-gram tries (bigram and trigram) over a toy
    corpus, asks the generator for the next word after the context ('name', 'is'),
    round-trips the model through save/load, and reports whether the prediction
    matches the expected word.
    """
    text = ('I have a cat. His name is Bruno. I have a dog too. '
            'His name is Rex. Her name is Rex too.')
    corpus = tokenize_by_sentence(text)
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)

    # Back-off: try trigram statistics first, fall back to the bigram trie.
    bigram_trie = NGramTrie(2, encoded)
    trigram_trie = NGramTrie(3, encoded)
    generator = BackOffGenerator(storage, trigram_trie, bigram_trie)

    context = tuple(storage.get_id(word) for word in ('name', 'is'))
    expected = 'rex'
    actual = storage.get_word(generator._generate_next_word(context))

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    # Persist and re-load the model to exercise the serialization helpers.
    save_model(generator, 'model.txt')
    load_model('model.txt')
    return actual == expected
def test_load_model_takes_less_time(self):
    """
    Tests that loading a persisted model is faster than training from scratch
    and that the loaded model generates the same text for the same context.
    """
    with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
        big_data = big_file.read()
    corpus = tokenize_by_sentence(big_data)
    storage = WordStorage()
    storage.update(corpus)
    context = (storage.get_id('despite'), storage.get_id('the'))

    # Time the full pipeline: encoding, trie construction and generation.
    start_time_generate = time()
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(3, encoded)
    generator = NGramTextGenerator(storage, trie)
    generated_text = generator.generate_text(context, 3)
    end_time_generate = time() - start_time_generate

    save_model(generator, 'model_training')

    # Time only loading the saved model plus generation.
    start_time_saved = time()
    loaded_model = load_model('model_training')
    new_result = loaded_model.generate_text(context, 3)
    end_time_saved = time() - start_time_saved

    self.assertGreater(end_time_generate, end_time_saved)
    self.assertEqual(generated_text, new_result)
def test_tokenize_by_sentence_empty_sentence(self):
    """
    Tests that tokenize_by_sentence function can handle empty sentence input
    """
    # An empty string contains no sentences, so no tokens are produced.
    self.assertEqual((), tokenize_by_sentence(''))
def test_tokenize_by_sentence_inappropriate_sentence(self):
    """
    Tests that tokenize_by_sentence function can handle inappropriate sentence input
    """
    # Pure punctuation/symbol noise yields no tokens at all.
    self.assertEqual((), tokenize_by_sentence('$#&*@#$*#@)'))
def test_tokenize_by_sentence_ideal(self):
    """
    Tests that tokenize_by_sentence function can handle ideal two sentence input
    """
    actual = tokenize_by_sentence('I have a cat.\nHis name is Bruno')
    # Each sentence is lower-cased and terminated by the '<END>' marker.
    self.assertEqual(
        ('i', 'have', 'a', 'cat', '<END>',
         'his', 'name', 'is', 'bruno', '<END>'),
        actual)
def test_tokenize_by_sentence_complex(self):
    """
    Tests that tokenize_by_sentence function can handle complex split case
    """
    actual = tokenize_by_sentence('Mar#y wa$nted, to swim. However, she was afraid of sharks.')
    # Stray symbols inside words are stripped; commas are dropped.
    self.assertEqual(
        ('mary', 'wanted', 'to', 'swim', '<END>',
         'however', 'she', 'was', 'afraid', 'of', 'sharks', '<END>'),
        actual)
def test_tokenize_by_sentence_dirty_text(self):
    """
    Tests that tokenize_by_sentence function can handle text filled with inappropriate characters
    """
    actual = tokenize_by_sentence('The first% sentence><. The sec&*ond sent@ence #.')
    # Inappropriate characters are removed from inside and around words.
    self.assertEqual(
        ('the', 'first', 'sentence', '<END>',
         'the', 'second', 'sentence', '<END>'),
        actual)
def test_tokenize_by_sentence_punctuation_marks(self):
    """
    Tests that tokenize_by_sentence function can process and ignore different punctuation marks
    """
    actual = tokenize_by_sentence('The, first sentence - nice? The second sentence: bad!')
    # Commas, dashes and colons are ignored; '?' and '!' end sentences.
    self.assertEqual(
        ('the', 'first', 'sentence', 'nice', '<END>',
         'the', 'second', 'sentence', 'bad', '<END>'),
        actual)
def test_tokenize_by_sentence_adds_ends(self):
    """
    Tests that number of "<END>" corresponds the number of sentences
    """
    text = ('There are many big and small libraries everywhere in our country. '
            'They have millions of books in different languages. '
            'You can find there the oldest and the newest books.')
    # Three sentences in, three sentence markers out.
    self.assertEqual(3, tokenize_by_sentence(text).count('<END>'))
def test_tokenize_text_lower_case(self):
    """
    Tests that tokens in encoded text are all in a lower case (except "<END>")
    """
    text = ('There are many big and small libraries everywhere in our country. '
            'They have millions of books in different languages. '
            'You can find there the oldest and the newest books.')
    tokens = tokenize_by_sentence(text)
    # Every token except the sentence marker must not be fully upper-case.
    self.assertTrue(all(token == '<END>' or not token.isupper() for token in tokens))
""" Text generator implementation starter """ from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator from lab_4.main import tokenize_by_sentence, encode_text if __name__ == '__main__': # here goes your function calls first_text = open('lab_3/Frank_Baum.txt', encoding="utf-8") first_text_tokenized = tokenize_by_sentence(first_text.read()) word_storage = WordStorage() word_storage.update(first_text_tokenized) encoded = encode_text(word_storage, first_text_tokenized) n_gram_trie = NGramTrie(2, encoded) generator = NGramTextGenerator(word_storage, n_gram_trie) RESULT = generator.generate_text(encoded[16:17], 3) #print(RESULT) assert RESULT, "Not working"
""" Generator of the text starter """ from lab_4.main import tokenize_by_sentence from lab_4.main import WordStorage from lab_4.ngrams.ngram_trie import NGramTrie from lab_4.main import encode_text from lab_4.main import NGramTextGenerator if __name__ == '__main__': text = tokenize_by_sentence( """Hi everyone! Nice to meet you again. What are you doing in my laboratory work? You are very nice person, do you know it? To be honest, I can't stand programming. But it doesn't depend on you! It's my personal problem and I don't know how to solve it... It doesn't matter right now""") word_storage = WordStorage() word_storage.update(text) encoded_text = encode_text(word_storage, text) n_gram_trie = NGramTrie(3, encoded_text) generator_of_text = NGramTextGenerator(word_storage, n_gram_trie) context = word_storage.get_id('on'), word_storage.get_id('you') formed_ids = generator_of_text.generate_text(context, 1) formed_text = []
""" Text generator implementation starter """ from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator from lab_4.main import tokenize_by_sentence, encode_text if __name__ == '__main__': text = 'I have a cat. His name is Bruno. I have a dog. Her name is Rex. Her name is Rex too.' text_tokenized = tokenize_by_sentence(text) word_storage = WordStorage() word_storage.update(text_tokenized) encoded = encode_text(word_storage, text_tokenized) trie = NGramTrie(2, encoded) context = (word_storage.get_id('i'), ) generator = NGramTextGenerator(word_storage, trie) RESULT = generator.generate_text(context, 4) print(RESULT) assert RESULT, "Not working"
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = ('I like flowers.\nMy mom likes flowers too.\n'
            'Her favourite flower is rose.\nMy favourite flower is rose too.')

    # Tokenize, build the vocabulary and encode the corpus.
    corpus = tokenize_by_sentence(TEXT)
    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    # Bigram model; generate two sentences and count '<END>' markers.
    ngrams = NGramTrie(2, encoded_corpus)
    text_generator = NGramTextGenerator(word_storage, ngrams)
    gen_text = text_generator.generate_text((1,), 2)
    end = word_storage.get_id('<END>')
    actual = gen_text.count(end)
    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'
from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    text = "This is a dog. It likes running. This is a cat. It likes sleeping. Everyone likes sleeping too."

    # Tokenize, build the vocabulary and encode the corpus.
    text_in_tokens = tokenize_by_sentence(text)
    word_storage = WordStorage()
    word_storage.update(text_in_tokens)
    encoded_text = encode_text(word_storage, text_in_tokens)

    # Bigram model; generate four sentences from the context word 'likes'.
    n_gram_trie = NGramTrie(2, encoded_text)
    context = (word_storage.get_id('likes'),)
    text_generator = NGramTextGenerator(word_storage, n_gram_trie)
    RESULT = text_generator.generate_text(context, 4)
    print(RESULT)
    # Bug fix: corrected the misspelled assertion message ("Someting went worng..").
    assert RESULT, "Something went wrong.."
from lab_4 import main

# Tokenize a tiny corpus and check that exactly one sentence is generated.
text = main.tokenize_by_sentence('I have a dog. It is name is Bruno.')
storage = main.WordStorage()
# Bug fix: the storage must be populated before encoding — every sibling
# starter calls update() first; without it encode_text/get_id operate on an
# empty storage and the script cannot work.
storage.update(text)
text_encoded = main.encode_text(storage, text)
trie = main.NGramTrie(2, text_encoded)
text_generator = main.NGramTextGenerator(storage, trie)
new_text = text_generator.generate_text((1,), 1)
end = storage.get_id('<END>')
actual = new_text.count(end)
RESULT = 1
assert RESULT == actual, 'not generating'
""" Text generator """ from lab_4.ngrams.ngram_trie import NGramTrie from lab_4.main import tokenize_by_sentence from lab_4.main import WordStorage, LikelihoodBasedTextGenerator from lab_4.main import encode_text, decode_text from lab_4.main import NGramTextGenerator if __name__ == '__main__': TEXT = 'I have a cat. His name is Bruno. I have a dog too. His name is Rex. Her name is Rex too.' tokenized_text = tokenize_by_sentence(TEXT) word_storage = WordStorage() word_storage.update(tokenized_text) encoded = encode_text(word_storage, tokenized_text) trie = NGramTrie(3, encoded) context = ( word_storage.get_id('name'), word_storage.get_id('is'), ) generator = NGramTextGenerator(word_storage, trie) generated_text = generator.generate_text(context, 2) gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie) gen_text = gen_likelihood.generate_text(context, 2) decoded_text = decode_text(word_storage, gen_text)
""" Text generator implementation starter """ from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator from lab_4.main import tokenize_by_sentence, encode_text if __name__ == '__main__': TEXT = 'I have a cat. His name is Bruno. I have a dog. Her name is Rex. Her name is Rex too.' text_tokenized = tokenize_by_sentence(TEXT) word_storage = WordStorage() word_storage.update(text_tokenized) encoded = encode_text(word_storage, text_tokenized) trie = NGramTrie(2, encoded) context = (word_storage.get_id('i'), ) generator = NGramTextGenerator(word_storage, trie) RESULT = generator.generate_text(context, 4) print(RESULT) assert RESULT, "Not working"
""" Lab 4 implementation starter """ from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': text = 'I have a dog.\nHis name is Will' tokenize_text = tokenize_by_sentence(text) print(tokenize_text) storage = WordStorage() storage.update(tokenize_text) print(storage) encode = encode_text(storage, tokenize_text) print(encode) n_gram_trie = NGramTrie(2, encode) print(n_gram_trie) generator = NGramTextGenerator(storage, n_gram_trie) context = (storage.get_id('a'), ) print(context) RESULT = generator.generate_text(context, 3) print(RESULT) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT, 'Not working'
from lab_4 import main

if __name__ == '__main__':
    # Tokenize two toy sentences and print each pipeline stage.
    corpus = main.tokenize_by_sentence('I have a cat.\nHis name is Bruno')
    print(corpus)

    storage = main.WordStorage()
    storage.update(corpus)
    print(storage.storage)

    # Encoding assigns consecutive ids; '<END>' (id 5) repeats for both sentences.
    RESULT = main.encode_text(storage, corpus)
    print(RESULT)
    assert RESULT == (1, 2, 3, 4, 5, 6, 7, 8, 9, 5), 'Not working'
""" Lab 4 implementation starter """ from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence from lab_4.ngrams.ngram_trie import NGramTrie if __name__ == '__main__': with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_frank: corpus = tokenize_by_sentence(file_frank.read()) storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) four = NGramTrie(4, encoded) context = ( storage.get_id('when'), storage.get_id('the'), ) generator = BackOffGenerator(storage, four, trie) generated_text = generator.generate_text(context, 5) RESULT = decode_text(storage, generated_text) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT, 'Encoding not working'
""" Text generator """ from lab_4.ngrams.ngram_trie import NGramTrie from lab_4.main import tokenize_by_sentence from lab_4.main import WordStorage from lab_4.main import encode_text from lab_4.main import NGramTextGenerator if __name__ == '__main__': text = 'I have a cat. His name is Bruno. I have a dog too. ' \ 'His name is Rex. Her name is Rex too' corpus = tokenize_by_sentence(text) word_storage = WordStorage() word_storage.update(corpus) encoded_text = encode_text(word_storage, corpus) n_gram_trie = NGramTrie(2, encoded_text) n_gram_text_generator = NGramTextGenerator(word_storage, n_gram_trie) context = (word_storage.get_id('i'), word_storage.get_id('have')) text_generated = n_gram_text_generator.generate_text(context, 2) output_text = [] for word_id in text_generated: word = word_storage.get_word(word_id)