Пример #1
0
    def test_most_freq_word_end(self):
        """
        Checks that most_freq_word returns the id of '<END>' for the
        context 'his name is bruno' when tries of size 5, 3 and 4 are given
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)

        # the tries are built and passed in the order 5, 3, 4
        tries = [NGramTrie(size, encoded) for size in (5, 3, 4)]
        generator = BackOffGenerator(storage, *tries)

        context = tuple(storage.get_id(word)
                        for word in ('his', 'name', 'is', 'bruno'))

        self.assertEqual(storage.get_id('<END>'),
                         generator.most_freq_word(context))
    def test_load_model_takes_less_time(self):
        """
        Checks that loading a saved model and generating text with it takes
        less time than encoding the text, building the trie and generating,
        and that both runs produce the same text
        """
        with open('lab_1/data.txt', 'r', encoding='utf-8') as source:
            raw_text = source.read()
        sentences = tokenize_by_sentence(raw_text)
        storage = WordStorage()
        storage.update(sentences)
        context = tuple(storage.get_id(word) for word in ('despite', 'the'))

        # first timed section: encode, build the trie, generate
        build_started = time()
        encoded = encode_text(storage, sentences)
        trie = NGramTrie(3, encoded)
        generator = NGramTextGenerator(storage, trie)
        built_text = generator.generate_text(context, 3)
        build_elapsed = time() - build_started
        # saving happens between the two timed sections
        save_model(generator, 'model_training')

        # second timed section: load the model, generate
        load_started = time()
        loaded_model = load_model('model_training')
        restored_text = loaded_model.generate_text(context, 3)
        load_elapsed = time() - load_started

        self.assertGreater(build_elapsed, load_elapsed)
        self.assertEqual(built_text, restored_text)
    def test_end(self):
        """
        Checks that the decoded result contains no '<END>' marker
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'cat', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))

        generator = LikelihoodBasedTextGenerator(storage, trie)
        context = tuple(storage.get_id(word) for word in ('a', 'cat'))

        generated = generator.generate_text(context, 1)
        self.assertEqual(('A cat', ), decode_text(storage, generated))
Пример #4
0
    def test_decode_text_ideal(self):
        """
        Checks that two sentences generated from the context 'name is'
        end with '<END>' and are decoded into the expected sentences
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )
        end = storage.get_id('<END>')

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)
        # the encoded text has to finish with the end-of-sentence id
        self.assertEqual(to_decode[-1], end)

        expected = ('Name is rex', 'Her name is rex')
        actual = decode_text(storage, to_decode)
        self.assertEqual(expected, actual)
Пример #5
0
    def test_decode_text_incorrect_storage(self):
        """
        Checks that decode_text raises ValueError when the storage argument
        is an empty tuple, an empty list, a number, None or a class
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))

        context = tuple(storage.get_id(word) for word in ('name', 'is'))
        generator = LikelihoodBasedTextGenerator(storage, trie)
        to_decode = generator.generate_text(context, 2)

        for bad_storage in [(), [], 123, None, NGramTrie]:
            with self.assertRaises(ValueError):
                decode_text(bad_storage, to_decode)
Пример #6
0
    def test_generate_text_large_context(self):
        """
        Checks that generating three sentences with a trie of size 5 and
        a four-word context gives exactly three '<END>' ids
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)

        trie = NGramTrie(5, encode_text(storage, corpus))
        generator = NGramTextGenerator(storage, trie)

        context = tuple(storage.get_id(word)
                        for word in ('i', 'have', 'a', 'bruno'))
        end = storage.get_id('<END>')

        generated = generator.generate_text(context, 3)
        self.assertEqual(generated.count(end), 3)
Пример #7
0
    def test_decode_text_ideal_conditions(self):
        """
        Checks every decoded sentence: it has no '<END>' marker, starts
        with an upper-case character and ends with a letter
        """
        corpus = ('i', 'have', 'a', 'cat', '<END>', 'his', 'name', 'is',
                  'bruno', '<END>', 'i', 'have', 'a', 'dog', 'too', '<END>',
                  'his', 'name', 'is', 'rex', '<END>', 'her', 'name', 'is',
                  'rex', 'too', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded = encode_text(storage, corpus)

        trie = NGramTrie(3, encoded)

        context = (
            storage.get_id('name'),
            storage.get_id('is'),
        )

        generator = LikelihoodBasedTextGenerator(storage, trie)

        to_decode = generator.generate_text(context, 2)
        actual = decode_text(storage, to_decode)

        for sentence in actual:
            # assertNotIn reports both operands when the check fails
            self.assertNotIn('<END>', sentence)
            self.assertTrue(sentence[0].isupper())
            self.assertTrue(sentence[-1].isalpha())
Пример #8
0
    def test_generate_next_word_context_incorrect(self):
        """
        Checks that _generate_next_word returns the id of 'rex' for the
        context 'name is cat', a word sequence that is absent from the corpus
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)

        # built in the order 4, 3, 2 and passed in the order 4, 2, 3
        tries = {size: NGramTrie(size, encoded) for size in (4, 3, 2)}
        generator = BackOffGenerator(storage, tries[4], tries[2], tries[3])

        context = tuple(storage.get_id(word)
                        for word in ('name', 'is', 'cat'))

        self.assertEqual(storage.get_id('rex'),
                         generator._generate_next_word(context))
Пример #9
0
    def test_text_generator_generate_sentence_proper_beginning(self):
        """
        Checks that class creates correct sentence from a context '<END>' without '<END>' in the beginning
        """
        # every sentence is closed by a separate '<END>' token; a missing
        # comma after 'music' used to glue it into the word 'music<END>'
        corpus = ('my', 'favourite', 'subject', 'is', 'maths', '<END>', 'his',
                  'favourite', 'thing', 'is', 'music',
                  '<END>', 'i', 'have', 'a', 'favourite', 'film', '<END>',
                  'my', 'family', 'likes', 'avatar', '<END>', 'my',
                  'favourite', 'subject', 'is', 'music', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)
        trie = NGramTrie(2, encoded)
        context = (storage.get_id('<END>'), )

        first_generated = storage.get_id('my')
        last_generated = storage.get_id('<END>')

        generator = NGramTextGenerator(storage, trie)
        actual = generator._generate_sentence(context)

        # the sentence must not start with the end-of-sentence id
        self.assertNotEqual(storage.get_id('<END>'), actual[0])

        self.assertEqual(first_generated, actual[0])
        self.assertEqual(last_generated, actual[-1])
    def test_text_generator_no_context(self):
        """
        Checks text generation for the context ('cat', 'dogs'), two words
        that never stand next to each other in the corpus
        """
        corpus = (
            'cat', 'has', 'paws', '<END>',
            'dogs', 'have', 'noses', '<END>',
            'cat', 'has', 'whiskers', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)

        # the tries are built and passed in the order 3, 2, 4
        tries = [NGramTrie(size, encoded) for size in (3, 2, 4)]
        generator = BackOffGenerator(storage, *tries)

        context = tuple(storage.get_id(word) for word in ('cat', 'dogs'))

        generated = generator.generate_text(context, 3)
        self.assertTrue(all(generated))
Пример #11
0
def main():
    """
    Demo run: builds a back-off generator over a short text, prints the
    expected and the actual word following 'name is', then saves the
    generator to 'model.txt' and loads it back

    :return: True if the generated word equals 'rex', False otherwise
    """
    text = ('I have a cat. '
            'His name is Bruno. '
            'I have a dog too. '
            'His name is Rex. '
            'Her name is Rex too.')

    corpus = tokenize_by_sentence(text)
    storage = WordStorage()
    storage.update(corpus)
    encoded = encode_text(storage, corpus)

    bigrams = NGramTrie(2, encoded)
    trigrams = NGramTrie(3, encoded)
    generator = BackOffGenerator(storage, trigrams, bigrams)

    context = tuple(storage.get_id(word) for word in ('name', 'is'))
    next_word_id = generator._generate_next_word(context)
    actual = storage.get_word(next_word_id)
    expected = 'rex'

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    save_model(generator, 'model.txt')
    load_model('model.txt')  # the loaded model itself is not used further

    return actual == expected
Пример #12
0
def realize_n_gram_text_generator(text):
    """
    Generates three sentences with NGramTextGenerator over a trie of
    size 3, starting from the context 'my dear', and decodes them

    :param text: corpus passed to WordStorage.update and encode_text
    :return: result of decode_text for the generated sequence
    """
    storage = WordStorage()
    storage.update(text)
    context = tuple(storage.get_id(word) for word in ('my', 'dear'))

    trie = NGramTrie(3, encode_text(storage, text))
    generated = NGramTextGenerator(storage, trie).generate_text(context, 3)
    return decode_text(storage, generated)
Пример #13
0
def realize_likelihood_generator(text):
    """
    Generates three sentences with LikelihoodBasedTextGenerator using the
    model loaded from 'lab_4/likelihood_model.json' and decodes them

    :param text: corpus used to fill the storage that provides the context
        ids and is passed to decode_text
    :return: result of decode_text for the generated sequence
    """
    storage = WordStorage()
    storage.update(text)
    context = tuple(storage.get_id(word) for word in ('i', 'shall'))

    # NOTE(review): context ids and decoding rely on the storage built from
    # text, while generation runs on the loaded model's storage and trie -
    # presumably both assign identical ids; verify
    model = load_model('lab_4/likelihood_model.json')
    generator = LikelihoodBasedTextGenerator(model.word_storage,
                                             model.n_gram_trie)
    generated = generator.generate_text(context, 3)

    return decode_text(storage, generated)
Пример #14
0
def realize_backoff_generator(text):
    """
    Generates three sentences with BackOffGenerator over tries of size 3
    and 2, starting from the context 'if you', and decodes them

    :param text: corpus passed to WordStorage.update and encode_text
    :return: result of decode_text for the generated sequence
    """
    storage = WordStorage()
    storage.update(text)
    encoded = encode_text(storage, text)

    bigrams = NGramTrie(2, encoded)
    trigrams = NGramTrie(3, encoded)
    context = tuple(storage.get_id(word) for word in ('if', 'you'))

    generator = BackOffGenerator(storage, trigrams, bigrams)
    generated = generator.generate_text(context, 3)
    return decode_text(storage, generated)
Пример #15
0
    def test_ngram_text_generator_identical_words(self):
        """
        Checks the length of a sentence generated from a corpus made of one
        repeated word: 20 words plus the context plus '<END>'
        """
        corpus = ('deadline', ) * 5 + ('<END>', )

        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))

        deadline_id = storage.get_id('deadline')
        context = (deadline_id, storage.get_id('deadline'))

        sentence = NGramTextGenerator(storage, trie)._generate_sentence(context)

        # the trailing +1 stands for <END>
        expected_length = 20 + len(context) + 1
        self.assertEqual(expected_length, len(sentence))
    def test_ngram_text_generator_generate_next_word(self):
        """
        Checks that 'a' is generated as the next word after 'i have'
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'i', 'have', 'a', 'bruno', 'cat', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))
        generator = NGramTextGenerator(storage, trie)

        context = tuple(storage.get_id(word) for word in ('i', 'have'))

        self.assertEqual(storage.get_id('a'),
                         generator._generate_next_word(context))
    def test_ngram_text_generator_generate_sentence_properly(self):
        """
        Checks that a sentence generated from the context 'i' over a trie
        of size 2 ends with '<END>'
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        bigrams = NGramTrie(2, encode_text(storage, corpus))

        start = (storage.get_id('i'), )
        end_id = storage.get_id('<END>')

        sentence = NGramTextGenerator(storage, bigrams)._generate_sentence(start)
        self.assertEqual(sentence[-1], end_id)
Пример #18
0
    def test_get_most_frequent_gram_ideal(self):
        """
        Checks that get_most_frequent_gram returns the n-gram 'i have a'
        for the context 'i have'
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'i', 'have', 'a', 'bruno', 'cat', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))
        generator = NGramTextGenerator(storage, trie)

        context = tuple(storage.get_id(word) for word in ('i', 'have'))
        expected = tuple(storage.get_id(word) for word in ('i', 'have', 'a'))

        self.assertEqual(expected, generator.get_most_frequent_gram(context))
 def test_word_storage_get_id_ideal(self):
     """
     Checks that get_id returns the id kept in the storage for a word
     """
     filled_storage = WordStorage()
     filled_storage.storage = {'word': 1}
     self.assertEqual(1, filled_storage.get_id('word'))
Пример #20
0
    def test_decode_text_upper_first_letter(self):
        """
        Tests that all the letters except the first one
            in a decoded sentence are in a lower case
        """
        corpus = ('first', 'sentence', 'here', '<END>', 'second', 'sentence',
                  'here', '<END>', 'third', 'sentence', 'here', '<END>')

        storage = WordStorage()
        storage.update(corpus)

        encoded_text = encode_text(storage, corpus)
        trie = NGramTrie(3, encoded_text)
        context = (storage.get_id('first'), storage.get_id('sentence'))

        likelihood_generator = LikelihoodBasedTextGenerator(storage, trie)
        generated_encoded_text = likelihood_generator.generate_text(context, 1)
        decoded_text = decode_text(storage, generated_encoded_text)
        # "not isupper()" would also accept mixed case such as 'iRST';
        # islower() requires every cased character of the tail to be lower
        self.assertTrue(decoded_text[0][1:].islower())
Пример #21
0
    def test_generate_next_word_short_context(self):
        """
        Checks that _generate_next_word returns the id of 'bye' for the
        one-word context '<END>' over the corpus ('bye', '<END>')
        """
        corpus = ('bye', '<END>')

        storage = WordStorage()
        storage.update(corpus)
        encoded = encode_text(storage, corpus)

        # built in the order 4, 3, 2 and passed in the order 2, 4, 3
        tries = {size: NGramTrie(size, encoded) for size in (4, 3, 2)}
        generator = BackOffGenerator(storage, tries[2], tries[4], tries[3])

        context = (storage.get_id('<END>'),)

        self.assertEqual(storage.get_id('bye'),
                         generator._generate_next_word(context))
Пример #22
0
    def test_get_most_frequent_gram_no_such_context(self):
        """
        Checks that an empty tuple is returned for a context
        that is absent from the corpus
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))
        generator = NGramTextGenerator(storage, trie)

        # 'i name' is not a context of any n-gram built from the corpus
        missing_context = tuple(storage.get_id(word) for word in ('i', 'name'))

        self.assertEqual((), generator.get_most_frequent_gram(missing_context))
 def test_word_storage_put_word_ideal(self):
     """
     Checks that _put_word adds the word to the storage and returns
     the same id that get_id reports for it
     """
     word_storage = WordStorage()
     word = 'word'
     actual = word_storage._put_word(word)
     # assertIn shows the storage content when the word is missing
     self.assertIn(word, word_storage.storage)
     expected = word_storage.get_id(word)
     self.assertEqual(expected, actual)
    def test_ngram_text_generator_generate_next_word_no_such_context(self):
        """
        Checks next word generation when the context is not found
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))
        generator = NGramTextGenerator(storage, trie)

        # there is no such context in ngrams, so the most frequent
        # option is expected
        missing_context = tuple(storage.get_id(word) for word in ('i', 'name'))
        # '<END>' appears twice in the corpus
        expected_top_freq = storage.get_id('<END>')

        self.assertEqual(expected_top_freq,
                         generator._generate_next_word(missing_context))
Пример #25
0
    def test_text_generator_generate_sentence_proper_number_of_end(self):
        """
        Checks that a generated sentence holds exactly one '<END>'
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'there', 'are', 'a', 'cat', 'outside', '<END>',
            'here', 'is', 'a', 'cat', 'outside', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(3, encode_text(storage, corpus))

        # NOTE(review): the context holds three ids while the trie size
        # is 3 - confirm this is intended
        context = tuple(storage.get_id(word) for word in ('a', 'is', '<END>'))

        sentence = NGramTextGenerator(storage, trie)._generate_sentence(context)

        self.assertEqual(1, sentence.count(storage.get_id('<END>')))
    def test_ngram_text_generator_generate_sentence_ideal(self):
        """
        Checks that the sentence generated from the context 'i' has 'have'
        at index 1 and '<END>' as its last element
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        bigrams = NGramTrie(2, encode_text(storage, corpus))
        start = (storage.get_id('i'), )

        have_id = storage.get_id('have')
        end_id = storage.get_id('<END>')

        sentence = NGramTextGenerator(storage, bigrams)._generate_sentence(start)
        self.assertEqual(sentence[1], have_id)
        self.assertEqual(sentence[-1], end_id)
Пример #27
0
    def test_float_result(self):
        """
        Checks that _calculate_maximum_likelihood returns a float
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        bigrams = NGramTrie(2, encode_text(storage, corpus))

        context = (storage.get_id('i'), )
        word = storage.get_id('have')

        generator = LikelihoodBasedTextGenerator(storage, bigrams)
        likelihood = generator._calculate_maximum_likelihood(word, context)

        self.assertEqual(float, type(likelihood))
    def test_generate_next_word_larger_context(self):
        """
        Checks that 'bruno' is generated after the context 'his name is'
        over a trie of size 4
        """
        corpus = (
            'i', 'have', 'a', 'cat', '<END>',
            'his', 'name', 'is', 'bruno', '<END>',
            'i', 'have', 'a', 'dog', 'too', '<END>',
            'his', 'name', 'is', 'rex', '<END>',
            'her', 'name', 'is', 'rex', 'too', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        trie = NGramTrie(4, encode_text(storage, corpus))

        bruno_id = storage.get_id('bruno')
        context = tuple(storage.get_id(word)
                        for word in ('his', 'name', 'is'))

        generator = LikelihoodBasedTextGenerator(storage, trie)
        self.assertEqual(bruno_id, generator._generate_next_word(context))
Пример #29
0
    def test_ngram_text_generator_duplicates_words(self):
        """
        Checks the length of a sentence generated from a corpus that repeats
        the pair 'stop it': 20 words plus the context plus one more element
        """
        corpus = ('stop', 'it') * 3 + ('<END>', )

        storage = WordStorage()
        storage.update(corpus)
        bigrams = NGramTrie(2, encode_text(storage, corpus))
        start = (storage.get_id('stop'), )

        sentence = NGramTextGenerator(storage, bigrams)._generate_sentence(start)

        expected_length = 20 + len(start) + 1
        self.assertEqual(expected_length, len(sentence))
Пример #30
0
    def test_ngram_text_generator_end_at_the_beginning(self):
        """
        Checks that a sentence generated from the context '<END>' has
        '<END>' only as its last element
        """
        corpus = (
            'i', 'like', 'to', 'read', '<END>',
            # NOTE(review): there is no '<END>' between 'too' and 'i' -
            # confirm this is intended
            'he', 'likes', 'to', 'read', 'too',
            'i', 'like', 'a', 'book', 'called', '"Harry Potter"', '<END>',
            'he', 'likes', 'another', 'book', '<END>',
            'he', 'does', 'not', 'tell', 'me', 'name', '<END>',
        )
        storage = WordStorage()
        storage.update(corpus)
        bigrams = NGramTrie(2, encode_text(storage, corpus))

        end_id = storage.get_id('<END>')
        start = (end_id, )

        sentence = NGramTextGenerator(storage, bigrams)._generate_sentence(start)

        self.assertEqual(end_id, sentence[-1])
        self.assertEqual(1, sentence.count(end_id))