def load_model(path_to_saved_model: str) -> NGramTextGenerator:
    """
    Restore an NGramTextGenerator previously serialized to a JSON file.

    :param path_to_saved_model: path to the JSON file holding the saved model
    :return: a reconstructed NGramTextGenerator
    :raises ValueError: if the path is not a string
    :raises FileNotFoundError: if the file does not exist (propagated from open)
    """
    if not isinstance(path_to_saved_model, str):
        raise ValueError('path_to_saved_model must be a string')
    with open(path_to_saved_model, 'r', encoding='utf-8') as file:
        model = json.load(file)
    word_storage = WordStorage()
    word_storage.storage = model['word_storage']
    n_gram_size = int(model['n_gram_trie_size'])
    # NGramTrie needs a non-empty encoded text at construction time; the
    # placeholder is irrelevant because every attribute is overwritten below.
    trie = NGramTrie(n_gram_size=n_gram_size,
                     encoded_text=('he',) * n_gram_size)
    trie.n_grams = tuple(tuple(n_gram) for n_gram in model['n_grams'])
    # JSON object keys are strings such as "1, 2"; convert back to int tuples
    trie.n_gram_frequencies = {
        tuple(map(int, key.split(', '))): value
        for key, value in model['n_gram_trie_frequencies'].items()
    }
    trie.uni_grams = {(int(key),): value
                      for key, value in model['uni_grams'].items()}
    return NGramTextGenerator(word_storage, trie)
def test_generate_text_large_context(self):
    """
    should generate simple case with three sentences out of small corpus
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    encoded_corpus = encode_text(words, corpus)
    five_gram_trie = NGramTrie(5, encoded_corpus)
    text_generator = NGramTextGenerator(words, five_gram_trie)
    start_context = (words.get_id('i'),
                     words.get_id('have'),
                     words.get_id('a'),
                     words.get_id('bruno'))
    end_id = words.get_id('<END>')
    generated = text_generator.generate_text(start_context, 3)
    # three sentences -> exactly three sentence terminators
    self.assertEqual(generated.count(end_id), 3)
def test_decode_text_ideal(self):
    """Generated text decodes to the expected pair of sentences."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    seed = (words.get_id('name'), words.get_id('is'))
    end_id = words.get_id('<END>')
    text_generator = LikelihoodBasedTextGenerator(words, tri_gram_trie)
    to_decode = text_generator.generate_text(seed, 2)
    # sanity check: generated text must finish with the sentence terminator
    self.assertEqual(to_decode[-1], end_id)
    expected = ('Name is rex', 'Her name is rex')
    self.assertEqual(expected, decode_text(words, to_decode))
def test_decode_text_incorrect_storage(self):
    """decode_text must reject anything that is not a WordStorage."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    seed = (words.get_id('name'), words.get_id('is'))
    text_generator = LikelihoodBasedTextGenerator(words, tri_gram_trie)
    to_decode = text_generator.generate_text(seed, 2)
    for bad_storage in [(), [], 123, None, NGramTrie]:
        self.assertRaises(ValueError, decode_text, bad_storage, to_decode)
def test_decode_text_ideal_conditions(self):
    """Decoded sentences are capitalised, alphabetic-ending, and tag-free."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    seed = (words.get_id('name'), words.get_id('is'))
    text_generator = LikelihoodBasedTextGenerator(words, tri_gram_trie)
    decoded = decode_text(words, text_generator.generate_text(seed, 2))
    for sentence in decoded:
        # no raw end-of-sentence markers may leak into the output
        self.assertTrue('<END>' not in sentence)
        self.assertTrue(sentence[0].isupper())
        self.assertTrue(sentence[-1].isalpha())
def test_likelihood_generator_instance_creation(self):
    """
    Checks that class creates correct instance
    """
    words = WordStorage()
    empty_trie = NGramTrie(2, ())
    text_generator = LikelihoodBasedTextGenerator(words, empty_trie)
    # the constructor must store both collaborators untouched
    self.assertEqual(text_generator._word_storage, words)
    self.assertEqual(text_generator._n_gram_trie, empty_trie)
def test_ngram_text_generator_generate_next_word(self):
    """
    Checks that next word generates properly
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'i', 'have', 'a', 'bruno', 'cat', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, tri_gram_trie)
    seed = (words.get_id('i'), words.get_id('have'))
    # 'a' follows 'i have' in both sentences, so it must be predicted
    self.assertEqual(words.get_id('a'),
                     text_generator._generate_next_word(seed))
def test_text_generator_throws_errors(self):
    """
    throws errors with bad inputs
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, bi_gram_trie)
    for bad_context in [[], {}, None, 9, 9.34, True]:
        self.assertRaises(ValueError,
                          text_generator.generate_text, bad_context, 10)
def test_ngram_text_generator_generate_next_word_incorrect_context(self):
    """
    Checks that method throws error
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, tri_gram_trie)
    # (3, ) - it is incorrect sized ngram
    for bad_context in [[], {}, (3, ), None, 9, 9.34, True]:
        self.assertRaises(ValueError,
                          text_generator._generate_next_word, bad_context)
def test_ngram_text_generator_generate_sentence_properly(self):
    """
    generates correct output according to simple case
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, bi_gram_trie)
    sentence = text_generator._generate_sentence((words.get_id('i'), ))
    # every generated sentence has to close with the end marker
    self.assertEqual(sentence[-1], words.get_id('<END>'))
def test_length_of_sentence(self):
    """
    generates sentences with length less than 20
    """
    corpus = ('i', 'have', 'a', 'cat', 'his', 'name', 'is', 'bruno',
              'i', 'have', 'a', 'dog', 'too', 'his', 'name', 'is', 'rex',
              'her', 'name', 'is', 'rex', 'too', 'he', 'funny', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    seed = (words.get_id('cat'), )
    text_generator = NGramTextGenerator(words, bi_gram_trie)
    sentence_length = len(text_generator._generate_sentence(seed))
    # cause we generate not more than 20 words + end
    self.assertLessEqual(sentence_length, len(seed) + 21)
def test_ngram_text_generator_generate_sentence_no_end(self):
    """
    should generate '<END>' anyway
    """
    corpus = ('i', 'have', 'a', 'cat', 'his', 'name', 'is', 'bruno',
              'i', 'have', 'a', 'dog', 'too', 'his', 'name', 'is', 'rex',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, bi_gram_trie)
    sentence = text_generator._generate_sentence((words.get_id('cat'), ))
    # even with a corpus almost free of terminators, one must be appended
    self.assertEqual('<END>', words.get_word(sentence[-1]))
def test_generate_next_word_incorrect_context(self):
    """_generate_next_word must reject malformed contexts."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    text_generator = LikelihoodBasedTextGenerator(words, tri_gram_trie)
    for bad_context in [[], {}, (2000, 1000, ), None, 9, 9.34, True]:
        self.assertRaises(ValueError,
                          text_generator._generate_next_word, bad_context)
def test_generate_next_word_larger_context(self):
    """Prediction also works with a three-word (4-gram) context."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    four_gram_trie = NGramTrie(4, encode_text(words, corpus))
    seed = (words.get_id('his'), words.get_id('name'), words.get_id('is'))
    text_generator = LikelihoodBasedTextGenerator(words, four_gram_trie)
    self.assertEqual(words.get_id('bruno'),
                     text_generator._generate_next_word(seed))
def test_calculate_likelihood_incorrect_context(self):
    """_calculate_maximum_likelihood must reject malformed contexts."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    word_id = words.get_id('dog')
    text_generator = LikelihoodBasedTextGenerator(words, bi_gram_trie)
    # (2000, 1000, ) -> context for three gram
    for bad_context in [[], {}, (2000, 1000, ), None, 9, 9.34, True]:
        self.assertRaises(ValueError,
                          text_generator._calculate_maximum_likelihood,
                          word_id, bad_context)
def test_ngram_text_generator_generate_next_word_no_such_context(self):
    """
    Checks that next word generates properly if no context found
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, tri_gram_trie)
    # there is no such context in ngrams, so return most frequent option
    unseen_context = (words.get_id('i'), words.get_id('name'))
    # '<END>' appears twice, making it the top-frequency fallback
    self.assertEqual(words.get_id('<END>'),
                     text_generator._generate_next_word(unseen_context))
def test_ngram_text_generator_generate_sentence_ideal(self):
    """
    first and last generated words as expected
    """
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    text_generator = NGramTextGenerator(words, bi_gram_trie)
    sentence = text_generator._generate_sentence((words.get_id('i'), ))
    # 'have' always follows 'i' in the corpus; every sentence ends with <END>
    self.assertEqual(sentence[1], words.get_id('have'))
    self.assertEqual(sentence[-1], words.get_id('<END>'))
def test_calculate_likelihood_no_such_context(self):
    """Likelihood of a word after an unseen context is zero."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    tri_gram_trie = NGramTrie(3, encode_text(words, corpus))
    word_id = words.get_id('dog')
    # two consecutive terminators never occur in the corpus
    unseen_context = (words.get_id('<END>'), words.get_id('<END>'))
    text_generator = LikelihoodBasedTextGenerator(words, tri_gram_trie)
    likelihood = text_generator._calculate_maximum_likelihood(word_id,
                                                              unseen_context)
    self.assertEqual(0.0, likelihood)
def test_calculate_likelihood_incorrect_word(self):
    """_calculate_maximum_likelihood must reject malformed word ids."""
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    words = WordStorage()
    words.update(corpus)
    bi_gram_trie = NGramTrie(2, encode_text(words, corpus))
    seed = (words.get_id('have'), words.get_id('a'))
    text_generator = LikelihoodBasedTextGenerator(words, bi_gram_trie)
    for bad_word in [(), [], None, 123]:
        self.assertRaises(ValueError,
                          text_generator._calculate_maximum_likelihood,
                          bad_word, seed)
def load_model(path_to_saved_model: str) -> NGramTextGenerator:
    """
    Restore an NGramTextGenerator from '<path>.json'.

    :param path_to_saved_model: path to the saved model, without extension
    :return: a reconstructed NGramTextGenerator
    :raises ValueError: if the path is not a string
    :raises FileNotFoundError: if the JSON file does not exist
    """
    # literal_eval safely parses the repr'd tuple keys; eval on file
    # contents would execute arbitrary code from an untrusted model file
    from ast import literal_eval

    if not isinstance(path_to_saved_model, str):
        raise ValueError('path_to_saved_model must be a string')
    with open(path_to_saved_model + '.json', 'r', encoding='utf-8') as json_file:
        generator_json = json.load(json_file)
    words = WordStorage()
    words.storage = generator_json['_word_storage']['storage']
    trie_data = generator_json['_n_gram_trie']
    # placeholder encoded text; every attribute is restored explicitly below
    trie = NGramTrie(trie_data['size'], (0, 1))
    trie.encoded_text = trie_data['encoded_text']
    trie.n_grams = tuple(tuple(gram) for gram in trie_data['n_grams'])
    # keys were serialized as str(tuple); parse them back into tuples
    trie.n_gram_frequencies = {
        literal_eval(key): value
        for key, value in trie_data['n_gram_frequencies'].items()
    }
    trie.uni_grams = {
        literal_eval(key): value
        for key, value in trie_data['uni_grams'].items()
    }
    return NGramTextGenerator(words, trie)
"""
Lab 4
"""
from main import *
from ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    corpus = ('i', 'have', 'a', 'cat', '<END>',
              'his', 'name', 'is', 'bruno', '<END>',
              'i', 'have', 'a', 'dog', 'too', '<END>',
              'his', 'name', 'is', 'rex', '<END>',
              'her', 'name', 'is', 'rex', 'too', '<END>')
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)
    trie = NGramTrie(2, encoded)
    context = (storage.get_id('i'), )
    first_generated = storage.get_id('have')
    last_generated = storage.get_id('<END>')
    generator = NGramTextGenerator(storage, trie)
    actual = generator._generate_sentence(context)
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    RESULT = 0
    if actual[1] == first_generated:
        RESULT = 1
    assert RESULT == 1, ''
''' from ngrams.ngram_trie import NGramTrie from lab_4.main import encode_text, WordStorage, LikelihoodBasedTextGenerator, decode_text if __name__ == '__main__': corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = ( storage.get_id('name'), storage.get_id('is'), ) end = storage.get_id('<END>') generator = LikelihoodBasedTextGenerator(storage, trie) to_decode = generator.generate_text(context, 2) RESULT = decode_text(storage, to_decode) print(RESULT) assert RESULT == ('Name is rex', 'Her name is rex')
print("GENERATE WORD WITH LIKELIHOOD") corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is', 'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>', 'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is', 'rex', 'too', '<END>') storage = WordStorage() storage.update(corpus) encoded = encode_text(storage, corpus) trie = NGramTrie(3, encoded) context = (storage.get_id('have'), storage.get_id('a'),) generator = LikelihoodBasedTextGenerator(storage, trie) generated_word = generator._generate_next_word(context) print(f"generated word in context {generated_word}") print("-------------------------------------------") print("BACKOFF GENERATOR") two = NGramTrie(2, encoded) trie = NGramTrie(3, encoded) context = (storage.get_id('name'),