예제 #1
0
    def test_calculate_log_probabilities_empty_frequencies(self):
        """With empty frequencies the call returns 1 and stores no log-probabilities."""
        trie = NGramTrie(2)
        trie.n_gram_frequencies = dict()
        return_code = trie.calculate_log_probabilities()

        self.assertEqual(trie.n_gram_log_probabilities, {})
        self.assertEqual(1, return_code)
예제 #2
0
    def test_calculate_log_probabilities_one_bi_gram(self):
        """A single bi-gram yields return value 0 and a log-probability of 0.0."""
        trie = NGramTrie(2)
        trie.n_gram_frequencies = {(1, 2): 10}
        return_code = trie.calculate_log_probabilities()

        self.assertEqual(trie.n_gram_log_probabilities[(1, 2)], 0.0)
        self.assertEqual(0, return_code)
예제 #3
0
 def test_fill_n_grams_not_tuple(self):
     """A list passed instead of a tuple: expects return value 1 and empty n_grams."""
     trie = NGramTrie(2)
     return_code = trie.fill_n_grams([(1, 2, 3, 4, 5)])
     self.assertEqual(1, return_code)
     self.assertEqual(trie.n_grams, ())
예제 #4
0
 def test_top_n_grams_more(self):
     """Requesting more n-grams than stored returns all of them, most frequent first."""
     trie = NGramTrie(2)
     trie.n_gram_frequencies = {(1, 2): 100, (2, 3): 123, (3, 4): 12345}
     ranked = trie.top_n_grams(2000000)
     self.assertEqual(((3, 4), (2, 3), (1, 2)), ranked)
예제 #5
0
 def test_top_n_grams_inappropriate(self):
     """Every invalid argument to top_n_grams is expected to produce an empty tuple."""
     ngram = NGramTrie(2)
     bad_inputs = [[], (), {}, None, True, '', -1, 0, 9.22]
     expected = ()
     for bad_input in bad_inputs:
         # subTest names the offending input in the report and lets the
         # remaining inputs run even after one of them fails.
         with self.subTest(bad_input=bad_input):
             actual = ngram.top_n_grams(bad_input)
             self.assertEqual(expected, actual)
예제 #6
0
    def test_probability_language_detector_calls_required_method(self, mock):
        """
        Check that detect_language results in a call to the mocked object.

        `mock` is injected from outside this method (presumably by a patch
        decorator - confirm against the decorator applied to this test).
        """
        # Context managers close each file even when reading or tokenizing
        # raises, which the manual open()/close() pairs did not guarantee.
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
            text_unk = tokenize_by_sentence(unknown_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            text_ger = tokenize_by_sentence(german_file.read())
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            text_eng = tokenize_by_sentence(english_file.read())

        # One shared storage so all three corpora are encoded consistently.
        letter_storage = LetterStorage()
        letter_storage.update(text_eng)
        letter_storage.update(text_ger)
        letter_storage.update(text_unk)

        eng_encoded = encode_corpus(letter_storage, text_eng)
        unk_encoded = encode_corpus(letter_storage, text_unk)
        ger_encoded = encode_corpus(letter_storage, text_ger)

        language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
        language_detector.new_language(eng_encoded, 'english')
        language_detector.new_language(ger_encoded, 'german')

        ngram_unknown = NGramTrie(4)
        ngram_unknown.fill_n_grams(unk_encoded)

        language_detector.detect_language(ngram_unknown.n_grams)
        self.assertTrue(mock.called)
예제 #7
0
 def test_fill_from_sentence_none(self):
     """Passing None as the sentence must leave gram_frequencies empty."""
     ngram = NGramTrie(2)
     # The return value is not part of this check; the former debug print
     # of it only added noise to the test output.
     ngram.fill_from_sentence(None)
     self.assertEqual(ngram.gram_frequencies, {})
예제 #8
0
 def test_fill_n_grams_none(self):
     """None instead of a corpus: expects return value 1 and empty n_grams."""
     trie = NGramTrie(2)
     return_code = trie.fill_n_grams(None)
     self.assertEqual(1, return_code)
     self.assertEqual(trie.n_grams, ())
예제 #9
0
 def test_fill_n_grams_empty(self):
     """An empty tuple is accepted: expects return value 0 and empty n_grams."""
     trie = NGramTrie(2)
     return_code = trie.fill_n_grams(())
     self.assertEqual(0, return_code)
     self.assertEqual(trie.n_grams, ())
예제 #10
0
 def test_calculate_log_probabilities_ideal(self):
     """
     Check the log-probabilities stored for the bi-grams (1, 2) and (1, 3).

     The expected values are log(10 / 12) and log(2 / 12).
     """
     ngram = NGramTrie(2)
     ngram.gram_frequencies = {(1, 2): 10, (1, 3): 2, (2, 5): 5}
     first_prob = math.log(10 / 12)
     second_prob = math.log(2 / 12)
     ngram.calculate_log_probabilities()
     # Floating-point results are compared with a tolerance: an equivalent
     # formula (e.g. log(a) - log(b)) may differ in the last bits.
     self.assertAlmostEqual(ngram.gram_log_probabilities[(1, 2)], first_prob)
     self.assertAlmostEqual(ngram.gram_log_probabilities[(1, 3)], second_prob)
예제 #11
0
 def test_fill_n_grams_duplcicates_ideal(self):
     """Repeated bi-grams are all kept, in order, for each sentence."""
     trie = NGramTrie(2)
     corpus = (((1, 2, 1, 2, 1, 2), (10, 11, 12)), )
     first_sentence = ((1, 2), (2, 1), (1, 2), (2, 1), (1, 2))
     second_sentence = ((10, 11), (11, 12))
     return_code = trie.fill_n_grams(corpus)
     self.assertEqual(0, return_code)
     self.assertEqual(trie.n_grams, ((first_sentence, second_sentence), ))
예제 #12
0
 def test_fill_n_grams_ideal(self):
     """Each sentence of a valid corpus is split into consecutive bi-grams."""
     trie = NGramTrie(2)
     corpus = (((1, 2, 3, 4, 5), (2, 3, 4, 5)), )
     first_sentence = ((1, 2), (2, 3), (3, 4), (4, 5))
     second_sentence = ((2, 3), (3, 4), (4, 5))
     return_code = trie.fill_n_grams(corpus)
     self.assertEqual(0, return_code)
     self.assertEqual(trie.n_grams, ((first_sentence, second_sentence), ))
예제 #13
0
    def test_get_ngrams_frequencies_from_sentence_none(self):
        """After filling from None, frequency calculation returns 1 and stores nothing."""
        trie = NGramTrie(2)
        trie.fill_n_grams(None)
        return_code = trie.calculate_n_grams_frequencies()

        self.assertEqual({}, trie.n_gram_frequencies)
        self.assertEqual(1, return_code)
예제 #14
0
    def test_calculate_n_grams_frequencies_empty(self):
        """After filling from an empty tuple, frequency calculation returns 1 and stores nothing."""
        trie = NGramTrie(2)
        trie.fill_n_grams(())
        return_code = trie.calculate_n_grams_frequencies()

        self.assertEqual({}, trie.n_gram_frequencies)
        self.assertEqual(1, return_code)
예제 #15
0
    def test_calculate_n_grams_frequencies_duplcicates_ideal(self):
        """Repeated bi-grams are counted across all sentences of the corpus."""
        trie = NGramTrie(2)
        trie.fill_n_grams((((1, 2, 1, 2, 1, 2), (1, 2)), ))
        return_code = trie.calculate_n_grams_frequencies()

        self.assertEqual({(1, 2): 4, (2, 1): 2}, trie.n_gram_frequencies)
        self.assertEqual(0, return_code)
예제 #16
0
    def test_calculate_n_grams_frequencies_ideal(self):
        """Every distinct bi-gram of a single sentence is counted once."""
        trie = NGramTrie(2)
        trie.fill_n_grams((((1, 2, 3, 4, 5), ), ))
        return_code = trie.calculate_n_grams_frequencies()

        counts = {(1, 2): 1, (2, 3): 1, (3, 4): 1, (4, 5): 1}
        self.assertEqual(counts, trie.n_gram_frequencies)
        self.assertEqual(0, return_code)
예제 #17
0
 def test_predict_next_sentence_more_words_ideal(self):
     """Starting from (1, ), the prediction is expected to be [1, 2, 3, 4]."""
     trie = NGramTrie(2)
     trie.gram_log_probabilities = {
         (1, 2): -0.18, (1, 3): -1.79,
         (2, 3): -1, (2, 4): -2,
         (3, 4): -0.1, (3, 5): -1.8,
     }
     predicted = trie.predict_next_sentence((1, ))
     self.assertEqual(predicted, [1, 2, 3, 4])
예제 #18
0
    def test_probability_language_detector_calculate_probability_incorrect_text(self):
        """Every invalid text passed to _calculate_sentence_probability must yield -1.0."""
        language_detector = ProbabilityLanguageDetector((2, 3), 10)
        bad_inputs = [[], {}, '', None, True, set()]
        ngram_trie = NGramTrie(5)

        expected = -1.0
        for bad_input in bad_inputs:
            # subTest names the offending input in the report and lets the
            # remaining inputs run even after one of them fails.
            with self.subTest(bad_input=bad_input):
                actual = language_detector._calculate_sentence_probability(ngram_trie,
                                                                           bad_input)
                self.assertEqual(expected, actual)
예제 #19
0
    def test_probability_language_detector_calculate_probability_ideal(self):
        """
        Compare tri-gram sentence probabilities of the unknown text.

        The probability computed against the English trie must be greater
        than the one computed against the German trie.
        """
        # Context managers close each file even when reading or tokenizing
        # raises, which the manual open()/close() pairs did not guarantee.
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            english_text = tokenize_by_sentence(english_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            german_text = tokenize_by_sentence(german_file.read())
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
            unknown_text = tokenize_by_sentence(unknown_file.read())

        # One shared storage so all three corpora are encoded consistently.
        letter_storage = LetterStorage()
        letter_storage.update(english_text)
        letter_storage.update(german_text)
        letter_storage.update(unknown_text)

        english_encoded = encode_corpus(letter_storage, english_text)
        german_encoded = encode_corpus(letter_storage, german_text)
        unknown_encoded = encode_corpus(letter_storage, unknown_text)

        language_detector = ProbabilityLanguageDetector((3,), 1000)
        language_detector.new_language(english_encoded, 'english')
        language_detector.new_language(german_encoded, 'german')

        n3_gram_trie_english = language_detector.n_gram_storages['english'][3]
        n3_gram_trie_german = language_detector.n_gram_storages['german'][3]

        n3_gram_unknown = NGramTrie(3)
        n3_gram_unknown.fill_n_grams(unknown_encoded)

        english_prob = language_detector._calculate_sentence_probability(n3_gram_trie_english,
                                                                         n3_gram_unknown.n_grams)
        german_prob = language_detector._calculate_sentence_probability(n3_gram_trie_german,
                                                                        n3_gram_unknown.n_grams)
        # assertGreater reports both values on failure, so the former debug
        # prints of the probabilities are no longer needed.
        self.assertGreater(english_prob, german_prob)
예제 #20
0
    def test_probability_language_detector_several_ngrams_case(self):
        """
        Compare 5-gram sentence probabilities for a detector built with sizes (3, 5).

        The probability computed against the German 5-gram trie must be
        greater than the one computed against the English 5-gram trie.
        """
        language_detector = ProbabilityLanguageDetector((3, 5), 1000)

        # Context managers close each file even when reading or tokenizing
        # raises, which the manual open()/close() pairs did not guarantee.
        with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
            eng_text = tokenize_by_sentence(english_file.read())
        with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
            ger_text = tokenize_by_sentence(german_file.read())
        with open('lab_3/unknown_Arthur_Conan_Doyle.txt',
                  encoding='utf-8') as unknown_file:
            unk_text = tokenize_by_sentence(unknown_file.read())

        # One shared storage so all three corpora are encoded consistently.
        letter_storage = LetterStorage()
        letter_storage.update(eng_text)
        letter_storage.update(ger_text)
        letter_storage.update(unk_text)

        english_encoded = encode_corpus(letter_storage, eng_text)
        german_encoded = encode_corpus(letter_storage, ger_text)
        unknown_encoded = encode_corpus(letter_storage, unk_text)

        language_detector.new_language(english_encoded, 'english')
        language_detector.new_language(german_encoded, 'german')

        # These are the stored 5-gram tries, not probabilities.
        eng_trie = language_detector.n_gram_storages['english'][5]
        ger_trie = language_detector.n_gram_storages['german'][5]

        ngram_trie = NGramTrie(5)
        ngram_trie.fill_n_grams(unknown_encoded)

        eng = language_detector._calculate_sentence_probability(
            eng_trie, ngram_trie.n_grams)
        ger = language_detector._calculate_sentence_probability(
            ger_trie, ngram_trie.n_grams)
        # assertGreater reports both compared values when the check fails.
        self.assertGreater(ger, eng)
예제 #21
0
 def test_fill_from_sentence_ideal(self):
     """Each consecutive bi-gram of the sentence is counted once."""
     trie = NGramTrie(2)
     trie.fill_from_sentence((1, 2, 3, 4, 5))
     counts = {(1, 2): 1, (2, 3): 1, (3, 4): 1, (4, 5): 1}
     self.assertEqual(trie.gram_frequencies, counts)
예제 #22
0
 def test_predict_next_sentence_no_match(self):
     """With no bi-gram starting at the prefix, only the prefix is returned."""
     trie = NGramTrie(2)
     trie.gram_log_probabilities = {(4, 2): -0.18, (4, 3): -1.79}
     predicted = trie.predict_next_sentence((1, ))
     self.assertEqual(predicted, [1])
예제 #23
0
    unknown_file.close()

    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    language_detector = LanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    ngram_unknown = NGramTrie(4)
    ngram_unknown.fill_n_grams(unk_encoded)

    language_log_probability_dict = language_detector.detect_language(ngram_unknown.n_grams)

    if language_log_probability_dict['german'] >\
language_log_probability_dict['english']:
        RESULT = 'english'
    else:
        RESULT = 'german'

    print('this is a {} text.'.format(RESULT))

    # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST
    assert RESULT == 'german', 'Not working'
예제 #24
0
 def test_ngram_trie_check_creation(self):
     """A new trie stores its size and starts with empty frequency and probability dicts."""
     trie = NGramTrie(2)
     self.assertEqual(trie.size, 2)
     self.assertEqual(trie.gram_frequencies, dict())
     self.assertEqual(trie.gram_log_probabilities, dict())
예제 #25
0
 def test_predict_next_sentence_wrong_size(self):
     """A two-element prefix given to a bi-gram trie is expected to produce an empty list."""
     trie = NGramTrie(2)
     predicted = trie.predict_next_sentence((1, 2))
     self.assertEqual(predicted, [])
예제 #26
0
 def test_predict_next_sentence_simple_ideal(self):
     """From the prefix (1, ), the prediction is expected to be [1, 2]."""
     trie = NGramTrie(2)
     trie.gram_log_probabilities = {(1, 2): -0.18, (1, 3): -1.79}
     predicted = trie.predict_next_sentence((1, ))
     self.assertEqual(predicted, [1, 2])
예제 #27
0
 def test_calculate_log_probabilities_empty(self):
     """Empty frequencies must leave gram_log_probabilities empty."""
     trie = NGramTrie(2)
     trie.gram_frequencies = dict()
     trie.calculate_log_probabilities()
     self.assertEqual(trie.gram_log_probabilities, dict())
예제 #28
0
 def test_fill_from_sentence_duplcicates_ideal(self):
     """Repeated bi-grams within one sentence accumulate their counts."""
     trie = NGramTrie(2)
     trie.fill_from_sentence((1, 2, 1, 2, 1, 2))
     counts = {(1, 2): 3, (2, 1): 2}
     self.assertEqual(trie.gram_frequencies, counts)
예제 #29
0
 def test_fill_from_sentence_not_tuple(self):
     """A list passed instead of a tuple must leave gram_frequencies empty."""
     trie = NGramTrie(2)
     trie.fill_from_sentence([1, 2, 3, 4, 5])
     self.assertEqual(trie.gram_frequencies, dict())
예제 #30
0
 def test_fill_from_sentence_empty(self):
     """An empty sentence must leave gram_frequencies empty."""
     trie = NGramTrie(2)
     trie.fill_from_sentence(())
     self.assertEqual(trie.gram_frequencies, dict())