예제 #1
0
def main():
    """Demonstrate BackOffGenerator predicting the word that follows 'name is'.

    Returns True when the predicted word matches the expected one.
    """
    text = ('I have a cat. His name is Bruno. '
            'I have a dog too. His name is Rex. '
            'Her name is Rex too.')

    sentences = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(sentences)

    encoded_corpus = encode_text(word_storage, sentences)

    # Back-off order: try the trigram trie first, fall back to bigrams.
    bigram_trie = NGramTrie(2, encoded_corpus)
    trigram_trie = NGramTrie(3, encoded_corpus)

    context = (word_storage.get_id('name'), word_storage.get_id('is'))

    generator = BackOffGenerator(word_storage, trigram_trie, bigram_trie)

    expected = 'rex'
    actual = word_storage.get_word(generator._generate_next_word(context))

    print(f'TEXT:\n{text}')
    print(f'\nEXPECTED WORD AFTER name is IS {expected}')
    print(f'ACTUAL WORD AFTER name is IS {actual}')

    # Round-trip the model through disk to exercise (de)serialization.
    save_model(generator, 'model.txt')
    load_model('model.txt')

    return actual == expected
    def test_load_model_takes_less_time(self):
        """Reusing a persisted model must be faster than training from scratch."""
        with open('lab_1/data.txt', 'r', encoding='utf-8') as big_file:
            raw_text = big_file.read()
        sentences = tokenize_by_sentence(raw_text)
        storage = WordStorage()
        storage.update(sentences)
        context = (storage.get_id('despite'), storage.get_id('the'))

        # Time the full pipeline: encode, build the trie, generate.
        start_full = time()
        encoded = encode_text(storage, sentences)
        trie = NGramTrie(3, encoded)
        generator = NGramTextGenerator(storage, trie)
        generated_text = generator.generate_text(context, 3)
        elapsed_full = time() - start_full
        save_model(generator, 'model_training')

        # Time only loading the saved model plus generation.
        start_loaded = time()
        loaded_model = load_model('model_training')
        new_result = loaded_model.generate_text(context, 3)
        elapsed_loaded = time() - start_loaded

        self.assertGreater(elapsed_full, elapsed_loaded)
        self.assertEqual(generated_text, new_result)
 def test_tokenize_by_sentence_empty_sentence(self):
     """
     Tests that tokenize_by_sentence function
         returns an empty tuple for an empty string
     """
     empty_input = ''
     self.assertEqual((), tokenize_by_sentence(empty_input))
 def test_tokenize_by_sentence_inappropriate_sentence(self):
     """
     Tests that tokenize_by_sentence function
         returns an empty tuple for symbol-only input
     """
     symbols_only = '$#&*@#$*#@)'
     self.assertEqual((), tokenize_by_sentence(symbols_only))
 def test_tokenize_by_sentence_ideal(self):
     """
     Tests that tokenize_by_sentence function
         splits two clean sentences into lower-cased tokens
     """
     sample = 'I have a cat.\nHis name is Bruno'
     expected = ('i', 'have', 'a', 'cat', '<END>',
                 'his', 'name', 'is', 'bruno', '<END>')
     self.assertEqual(expected, tokenize_by_sentence(sample))
 def test_tokenize_by_sentence_complex(self):
     """
     Tests that tokenize_by_sentence function
         strips stray symbols inside words before splitting
     """
     sample = 'Mar#y wa$nted, to swim. However, she was afraid of sharks.'
     expected = ('mary', 'wanted', 'to', 'swim', '<END>',
                 'however', 'she', 'was', 'afraid', 'of', 'sharks', '<END>')
     self.assertEqual(expected, tokenize_by_sentence(sample))
 def test_tokenize_by_sentence_dirty_text(self):
     """
     Tests that tokenize_by_sentence function
         drops inappropriate characters scattered through the text
     """
     sample = 'The first% sentence><. The sec&*ond sent@ence #.'
     expected = ('the', 'first', 'sentence', '<END>',
                 'the', 'second', 'sentence', '<END>')
     self.assertEqual(expected, tokenize_by_sentence(sample))
 def test_tokenize_by_sentence_punctuation_marks(self):
     """
     Tests that tokenize_by_sentence function
         ignores commas, dashes, colons and other punctuation marks
     """
     sample = 'The, first sentence - nice? The second sentence: bad!'
     expected = ('the', 'first', 'sentence', 'nice', '<END>',
                 'the', 'second', 'sentence', 'bad', '<END>')
     self.assertEqual(expected, tokenize_by_sentence(sample))
 def test_tokenize_by_sentence_adds_ends(self):
     '''
     Tests that number of "<END>" corresponds
         the number of sentences
     '''
     # Three sentences in the fixture -> exactly three '<END>' markers.
     text = '''There are many big and small libraries everywhere in our country. 
               They have millions of books in different languages. 
               You can find there the oldest and the newest books.'''
     expected_end_num = 3
     actual_end_num = tokenize_by_sentence(text).count('<END>')
     self.assertEqual(expected_end_num, actual_end_num)
    def test_tokenize_text_lower_case(self):
        """
        Tests that tokens in the tokenized text
            are all in a lower case (except "<END>")
        """
        text = '''There are many big and small libraries everywhere in our country. 
                  They have millions of books in different languages. 
                  You can find there the oldest and the newest books.'''
        tokenized_text = tokenize_by_sentence(text)

        # Bug fix: the original used token.isupper(), which is True only for
        # ALL-CAPS tokens, so a mixed-case token like 'Title' slipped through.
        # Comparing against the lowercased form enforces the documented intent.
        actual = True
        for token in tokenized_text:
            if token != '<END>' and token != token.lower():
                actual = False
        self.assertTrue(actual)
예제 #11
0
"""
Text generator implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # Bug fix: the original left the file handle open (open() with no
    # close()); 'with' guarantees it is released. Also removed the
    # commented-out print call.
    with open('lab_3/Frank_Baum.txt', encoding="utf-8") as first_text:
        first_text_tokenized = tokenize_by_sentence(first_text.read())

    word_storage = WordStorage()
    word_storage.update(first_text_tokenized)

    encoded = encode_text(word_storage, first_text_tokenized)

    n_gram_trie = NGramTrie(2, encoded)
    generator = NGramTextGenerator(word_storage, n_gram_trie)

    # Seed the generator with a one-word context taken from the corpus.
    RESULT = generator.generate_text(encoded[16:17], 3)

    assert RESULT, "Not working"
예제 #12
0
"""
Generator of the text starter
"""

from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':

    # Tokenize a hard-coded paragraph into sentence-delimited tokens.
    text = tokenize_by_sentence(
        """Hi everyone! Nice to meet you again. What are you doing in my laboratory work?
                                    You are very nice person, do you know it? To be honest, I can't stand programming.
                                    But it doesn't depend on you! It's my personal problem and I don't know how to
                                    solve it... It doesn't matter right now""")

    # Register every token so each word receives a numeric id.
    word_storage = WordStorage()
    word_storage.update(text)

    # Replace words with their ids for trie construction.
    encoded_text = encode_text(word_storage, text)

    n_gram_trie = NGramTrie(3, encoded_text)

    generator_of_text = NGramTextGenerator(word_storage, n_gram_trie)
    context = word_storage.get_id('on'), word_storage.get_id('you')

    formed_ids = generator_of_text.generate_text(context, 1)
    # NOTE(review): formed_text is initialized but never filled or printed in
    # this fragment — presumably a decoding loop followed in the original
    # starter; confirm against the full file before relying on it.
    formed_text = []
예제 #13
0
"""
Text generator implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # Build a tiny corpus and generate four words after the context 'i'.
    text = 'I have a cat. His name is Bruno. I have a dog. Her name is Rex. Her name is Rex too.'
    corpus = tokenize_by_sentence(text)

    storage = WordStorage()
    storage.update(corpus)

    encoded_corpus = encode_text(storage, corpus)

    bigram_trie = NGramTrie(2, encoded_corpus)
    generator = NGramTextGenerator(storage, bigram_trie)
    context = (storage.get_id('i'),)

    RESULT = generator.generate_text(context, 4)
    print(RESULT)

    assert RESULT, "Not working"
예제 #14
0
from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    TEXT = 'I like flowers.\nMy mom likes flowers too.\nHer favourite flower is rose.\nMy favourite flower is rose too.'
    sentences = tokenize_by_sentence(TEXT)

    storage = WordStorage()
    storage.update(sentences)

    encoded = encode_text(storage, sentences)
    bigram_trie = NGramTrie(2, encoded)

    generator = NGramTextGenerator(storage, bigram_trie)
    generated = generator.generate_text((1, ), 2)

    # Two generated sentences -> the '<END>' marker must occur twice.
    end_id = storage.get_id('<END>')
    actual = generated.count(end_id)
    RESULT = 2
    print(actual)
    assert RESULT == actual, 'not working'
예제 #15
0
from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    # Generate four words after the context 'likes' from a toy corpus.
    text = "This is a dog. It likes running. This is a cat. It likes sleeping. Everyone likes sleeping too."
    text_in_tokens = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(text_in_tokens)

    encoded_text = encode_text(word_storage, text_in_tokens)

    n_gram_trie = NGramTrie(2, encoded_text)
    context = (word_storage.get_id('likes'),)
    text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    RESULT = text_generator.generate_text(context, 4)

    print(RESULT)

    # Fixed the garbled assertion message ("Someting went worng..").
    assert RESULT, "Something went wrong..."
예제 #16
0
from lab_4 import main

# Encode a toy corpus and check the generator emits exactly one sentence.
text = main.tokenize_by_sentence('I have a dog. It is name is Bruno.')
storage = main.WordStorage()
# Bug fix: the storage must be populated before encoding — every other
# starter calls update() first; without it encode_text has no word ids.
storage.update(text)
text_encoded = main.encode_text(storage, text)
trie = main.NGramTrie(2, text_encoded)
text_generator = main.NGramTextGenerator(storage, trie)
new_text = text_generator.generate_text((1,), 1)

# One generated sentence -> exactly one '<END>' marker.
end = storage.get_id('<END>')
actual = new_text.count(end)
RESULT = 1

assert RESULT == actual, 'not generating'
예제 #17
0
"""
Text generator
"""

from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage, LikelihoodBasedTextGenerator
from lab_4.main import encode_text, decode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    TEXT = 'I have a cat. His name is Bruno. I have a dog too. His name is Rex. Her name is Rex too.'
    sentences = tokenize_by_sentence(TEXT)

    word_storage = WordStorage()
    word_storage.update(sentences)

    encoded = encode_text(word_storage, sentences)

    trie = NGramTrie(3, encoded)
    context = (word_storage.get_id('name'), word_storage.get_id('is'))

    # Plain n-gram generation first, then the likelihood-based variant.
    generator = NGramTextGenerator(word_storage, trie)
    generated_text = generator.generate_text(context, 2)

    gen_likelihood = LikelihoodBasedTextGenerator(word_storage, trie)
    gen_text = gen_likelihood.generate_text(context, 2)
    decoded_text = decode_text(word_storage, gen_text)
예제 #18
0
"""
Text generator implementation starter
"""

from lab_4.main import WordStorage, NGramTrie, NGramTextGenerator
from lab_4.main import tokenize_by_sentence, encode_text

if __name__ == '__main__':
    # Tokenize, encode, and generate four words after the context 'i'.
    TEXT = 'I have a cat. His name is Bruno. I have a dog. Her name is Rex. Her name is Rex too.'
    tokens = tokenize_by_sentence(TEXT)

    storage = WordStorage()
    storage.update(tokens)

    encoded_tokens = encode_text(storage, tokens)

    bigram_trie = NGramTrie(2, encoded_tokens)
    generator = NGramTextGenerator(storage, bigram_trie)
    context = (storage.get_id('i'),)

    RESULT = generator.generate_text(context, 4)
    print(RESULT)

    assert RESULT, "Not working"
예제 #19
0
"""
Lab 4 implementation starter
"""

from lab_4.main import tokenize_by_sentence, WordStorage, encode_text, NGramTextGenerator
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Walk the whole pipeline, printing each intermediate stage.
    text = 'I have a dog.\nHis name is Will'
    tokens = tokenize_by_sentence(text)
    print(tokens)

    storage = WordStorage()
    storage.update(tokens)
    print(storage)

    encoded = encode_text(storage, tokens)
    print(encoded)

    bigram_trie = NGramTrie(2, encoded)
    print(bigram_trie)
    generator = NGramTextGenerator(storage, bigram_trie)
    context = (storage.get_id('a'), )
    print(context)

    RESULT = generator.generate_text(context, 3)
    print(RESULT)
    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Not working'
예제 #20
0
from lab_4 import main

if __name__ == '__main__':
    # Two sentences -> word ids 1..9 with '<END>' (id 5) closing each one.
    text = 'I have a cat.\nHis name is Bruno'
    tokens = main.tokenize_by_sentence(text)
    print(tokens)

    word_storage = main.WordStorage()
    word_storage.update(tokens)
    print(word_storage.storage)

    RESULT = main.encode_text(word_storage, tokens)
    print(RESULT)

    assert RESULT == (1, 2, 3, 4, 5, 6, 7, 8, 9, 5), 'Not working'


예제 #21
0
"""
Lab 4 implementation starter
"""

from lab_4.main import BackOffGenerator, encode_text, WordStorage, decode_text, tokenize_by_sentence
from lab_4.ngrams.ngram_trie import NGramTrie

if __name__ == '__main__':
    # Train on the Frank Baum corpus; back off from 4-grams to 3-grams.
    with open('lab_3/Frank_Baum.txt', 'r', encoding='utf-8') as file_frank:
        corpus = tokenize_by_sentence(file_frank.read())

    word_storage = WordStorage()
    word_storage.update(corpus)
    encoded_corpus = encode_text(word_storage, corpus)

    trigram_trie = NGramTrie(3, encoded_corpus)
    fourgram_trie = NGramTrie(4, encoded_corpus)

    context = (word_storage.get_id('when'), word_storage.get_id('the'))

    generator = BackOffGenerator(word_storage, fourgram_trie, trigram_trie)
    generated_text = generator.generate_text(context, 5)
    RESULT = decode_text(word_storage, generated_text)

    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT, 'Encoding not working'
예제 #22
0
"""
Text generator
"""

from lab_4.ngrams.ngram_trie import NGramTrie
from lab_4.main import tokenize_by_sentence
from lab_4.main import WordStorage
from lab_4.main import encode_text
from lab_4.main import NGramTextGenerator

if __name__ == '__main__':
    text = 'I have a cat. His name is Bruno. I have a dog too. ' \
           'His name is Rex. Her name is Rex too'
    corpus = tokenize_by_sentence(text)

    word_storage = WordStorage()
    word_storage.update(corpus)

    encoded_text = encode_text(word_storage, corpus)

    n_gram_trie = NGramTrie(2, encoded_text)

    n_gram_text_generator = NGramTextGenerator(word_storage, n_gram_trie)

    context = (word_storage.get_id('i'), word_storage.get_id('have'))

    text_generated = n_gram_text_generator.generate_text(context, 2)
    output_text = []

    for word_id in text_generated:
        word = word_storage.get_word(word_id)