def test_calculate_log_probabilities_empty_frequencies(self):
    """With no n-gram frequencies, no log-probabilities are expected and the call returns 1."""
    trie = NGramTrie(2)
    trie.n_gram_frequencies = {}

    return_code = trie.calculate_log_probabilities()

    self.assertEqual(trie.n_gram_log_probabilities, {})
    self.assertEqual(1, return_code)
def test_calculate_log_probabilities_one_bi_gram(self):
    """A single bi-gram is expected to get log-probability 0.0, with return code 0."""
    trie = NGramTrie(2)
    trie.n_gram_frequencies = {(1, 2): 10}

    return_code = trie.calculate_log_probabilities()

    self.assertEqual(trie.n_gram_log_probabilities[(1, 2)], 0.0)
    self.assertEqual(0, return_code)
def test_fill_n_grams_not_tuple(self):
    """Passing a list instead of a tuple is expected to return 1 with n_grams equal to ()."""
    trie = NGramTrie(2)

    return_code = trie.fill_n_grams([(1, 2, 3, 4, 5)])

    self.assertEqual(1, return_code)
    self.assertEqual(trie.n_grams, ())
def test_top_n_grams_more(self):
    """Asking for far more n-grams than are stored returns all three, highest frequency first."""
    trie = NGramTrie(2)
    trie.n_gram_frequencies = {(1, 2): 100, (2, 3): 123, (3, 4): 12345}
    by_descending_frequency = ((3, 4), (2, 3), (1, 2))

    top = trie.top_n_grams(2000000)

    self.assertEqual(by_descending_frequency, top)
def test_top_n_grams_inappropriate(self):
    """Every invalid ``top_n`` argument is expected to produce an empty tuple.

    The invalid values cover wrong types, non-positive integers and a float.
    """
    ngram = NGramTrie(2)
    bad_inputs = [[], (), {}, None, True, '', -1, 0, 9.22]
    expected = ()
    for bad_input in bad_inputs:
        # subTest reports each failing input separately instead of
        # aborting the loop on the first mismatch.
        with self.subTest(bad_input=bad_input):
            actual = ngram.top_n_grams(bad_input)
            self.assertEqual(expected, actual)
def test_probability_language_detector_calls_required_method(self, mock):
    """Run the detection flow end to end and check that ``mock`` was called.

    ``mock`` is presumably injected by a patch decorator that is not visible
    here - TODO confirm which callable it replaces.
    """
    # Context managers guarantee the corpus files are closed even if
    # reading or tokenizing raises.
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        text_unk = tokenize_by_sentence(unknown_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        text_ger = tokenize_by_sentence(german_file.read())
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        text_eng = tokenize_by_sentence(english_file.read())

    # One shared storage so all three corpora are encoded consistently.
    letter_storage = LetterStorage()
    letter_storage.update(text_eng)
    letter_storage.update(text_ger)
    letter_storage.update(text_unk)

    eng_encoded = encode_corpus(letter_storage, text_eng)
    unk_encoded = encode_corpus(letter_storage, text_unk)
    ger_encoded = encode_corpus(letter_storage, text_ger)

    language_detector = ProbabilityLanguageDetector((3, 4, 5), 1000)
    language_detector.new_language(eng_encoded, 'english')
    language_detector.new_language(ger_encoded, 'german')

    ngram_unknown = NGramTrie(4)
    ngram_unknown.fill_n_grams(unk_encoded)

    language_detector.detect_language(ngram_unknown.n_grams)

    self.assertTrue(mock.called)
def test_fill_from_sentence_none(self):
    """Calling ``fill_from_sentence(None)`` is expected to leave ``gram_frequencies`` empty."""
    ngram = NGramTrie(2)
    sentence = None
    # The return value is not part of this check; the former debug print
    # of it only added noise to the test output.
    ngram.fill_from_sentence(sentence)
    expected_res = {}
    self.assertEqual(ngram.gram_frequencies, expected_res)
def test_fill_n_grams_none(self):
    """Passing None as the sentences is expected to return 1 with n_grams equal to ()."""
    trie = NGramTrie(2)

    return_code = trie.fill_n_grams(None)

    self.assertEqual(1, return_code)
    self.assertEqual(trie.n_grams, ())
def test_fill_n_grams_empty(self):
    """An empty tuple of sentences is expected to return 0 with n_grams equal to ()."""
    trie = NGramTrie(2)

    return_code = trie.fill_n_grams(())

    self.assertEqual(0, return_code)
    self.assertEqual(trie.n_grams, ())
def test_calculate_log_probabilities_ideal(self):
    """Check the log-probabilities of two bi-grams that share the first element.

    The expected values are ``log(10 / 12)`` for ``(1, 2)`` and
    ``log(2 / 12)`` for ``(1, 3)``, where 12 is the summed frequency of
    the two bi-grams starting with 1.
    """
    ngram = NGramTrie(2)
    ngram.gram_frequencies = {(1, 2): 10, (1, 3): 2, (2, 5): 5}
    first_prob = math.log(10 / 12)
    second_prob = math.log(2 / 12)
    ngram.calculate_log_probabilities()
    # Floats are compared approximately: an algebraically equivalent
    # computation may differ from the expected value in the last bits.
    self.assertAlmostEqual(ngram.gram_log_probabilities[(1, 2)], first_prob)
    self.assertAlmostEqual(ngram.gram_log_probabilities[(1, 3)], second_prob)
def test_fill_n_grams_duplcicates_ideal(self):
    """Repeated bi-grams are expected to be kept in order, one entry per occurrence."""
    trie = NGramTrie(2)
    encoded_text = (((1, 2, 1, 2, 1, 2), (10, 11, 12)), )
    expected_n_grams = (
        (
            ((1, 2), (2, 1), (1, 2), (2, 1), (1, 2)),
            ((10, 11), (11, 12)),
        ),
    )

    return_code = trie.fill_n_grams(encoded_text)

    self.assertEqual(0, return_code)
    self.assertEqual(trie.n_grams, expected_n_grams)
def test_fill_n_grams_ideal(self):
    """Each inner tuple is expected to be split into consecutive bi-grams, with return code 0."""
    trie = NGramTrie(2)
    encoded_text = (((1, 2, 3, 4, 5), (2, 3, 4, 5)), )
    expected_n_grams = (
        (
            ((1, 2), (2, 3), (3, 4), (4, 5)),
            ((2, 3), (3, 4), (4, 5)),
        ),
    )

    return_code = trie.fill_n_grams(encoded_text)

    self.assertEqual(0, return_code)
    self.assertEqual(trie.n_grams, expected_n_grams)
def test_get_ngrams_frequencies_from_sentence_none(self):
    """After filling with None, frequency calculation is expected to return 1 and store nothing."""
    trie = NGramTrie(2)
    trie.fill_n_grams(None)

    return_code = trie.calculate_n_grams_frequencies()

    self.assertEqual({}, trie.n_gram_frequencies)
    self.assertEqual(1, return_code)
def test_calculate_n_grams_frequencies_empty(self):
    """After filling with an empty tuple, frequency calculation is expected to return 1."""
    trie = NGramTrie(2)
    trie.fill_n_grams(())

    return_code = trie.calculate_n_grams_frequencies()

    self.assertEqual({}, trie.n_gram_frequencies)
    self.assertEqual(1, return_code)
def test_calculate_n_grams_frequencies_duplcicates_ideal(self):
    """Repeated bi-grams across sentences are expected to be counted together."""
    trie = NGramTrie(2)
    trie.fill_n_grams((((1, 2, 1, 2, 1, 2), (1, 2)), ))
    expected_counts = {(1, 2): 4, (2, 1): 2}

    return_code = trie.calculate_n_grams_frequencies()

    self.assertEqual(expected_counts, trie.n_gram_frequencies)
    self.assertEqual(0, return_code)
def test_calculate_n_grams_frequencies_ideal(self):
    """Four distinct bi-grams from one sentence are each expected to have frequency 1."""
    trie = NGramTrie(2)
    trie.fill_n_grams((((1, 2, 3, 4, 5), ), ))
    expected_counts = {(1, 2): 1, (2, 3): 1, (3, 4): 1, (4, 5): 1}

    return_code = trie.calculate_n_grams_frequencies()

    self.assertEqual(expected_counts, trie.n_gram_frequencies)
    self.assertEqual(0, return_code)
def test_predict_next_sentence_more_words_ideal(self):
    """Prediction from prefix (1,) is expected to yield [1, 2, 3, 4].

    The expected chain takes the highest log-probability bi-gram for
    each successive prefix in the table below: 1 -> 2 -> 3 -> 4.
    """
    trie = NGramTrie(2)
    trie.gram_log_probabilities = {
        (1, 2): -0.18,
        (1, 3): -1.79,
        (2, 3): -1,
        (2, 4): -2,
        (3, 4): -0.1,
        (3, 5): -1.8
    }

    predicted = trie.predict_next_sentence((1, ))

    self.assertEqual(predicted, [1, 2, 3, 4])
def test_probability_language_detector_calculate_probability_incorrect_text(self):
    """Every invalid text argument is expected to give a sentence probability of -1.0."""
    language_detector = ProbabilityLanguageDetector((2, 3), 10)
    bad_inputs = [[], {}, '', None, True, set()]
    ngram_trie = NGramTrie(5)
    expected = -1.0
    for bad_input in bad_inputs:
        # subTest reports each failing input separately instead of
        # aborting the loop on the first mismatch.
        with self.subTest(bad_input=bad_input):
            actual = language_detector._calculate_sentence_probability(ngram_trie, bad_input)
            self.assertEqual(expected, actual)
def test_probability_language_detector_calculate_probability_ideal(self):
    """The unknown text is expected to score higher on the English tri-gram model.

    Both language models are built from the corpus files, then the
    unknown text's tri-grams are scored against each model.
    """
    print('launching test')

    # Context managers guarantee the corpus files are closed even if
    # reading or tokenizing raises.
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        english_text = tokenize_by_sentence(english_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        german_text = tokenize_by_sentence(german_file.read())
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        unknown_text = tokenize_by_sentence(unknown_file.read())

    # One shared storage so all three corpora are encoded consistently.
    letter_storage = LetterStorage()
    letter_storage.update(english_text)
    letter_storage.update(german_text)
    letter_storage.update(unknown_text)

    english_encoded = encode_corpus(letter_storage, english_text)
    german_encoded = encode_corpus(letter_storage, german_text)
    unknown_encoded = encode_corpus(letter_storage, unknown_text)

    language_detector = ProbabilityLanguageDetector((3,), 1000)
    language_detector.new_language(english_encoded, 'english')
    language_detector.new_language(german_encoded, 'german')

    n3_gram_trie_english = language_detector.n_gram_storages['english'][3]
    n3_gram_trie_german = language_detector.n_gram_storages['german'][3]

    n3_gram_unknown = NGramTrie(3)
    n3_gram_unknown.fill_n_grams(unknown_encoded)

    english_prob = language_detector._calculate_sentence_probability(
        n3_gram_trie_english, n3_gram_unknown.n_grams)
    german_prob = language_detector._calculate_sentence_probability(
        n3_gram_trie_german, n3_gram_unknown.n_grams)

    print(f'English_sentence_prob: {english_prob}')
    print(f'Deutsch_sentence_prob: {german_prob}')

    # assertGreater shows both values on failure, unlike assertTrue(a > b).
    self.assertGreater(english_prob, german_prob)
def test_probability_language_detector_several_ngrams_case(self):
    """Score the unknown text against the 5-gram models of a detector built for sizes (3, 5).

    The German 5-gram score is expected to exceed the English one.
    """
    language_detector = ProbabilityLanguageDetector((3, 5), 1000)

    # Context managers guarantee the corpus files are closed even if
    # reading or tokenizing raises.
    with open('lab_3/Frank_Baum.txt', encoding='utf-8') as english_file:
        eng_text = tokenize_by_sentence(english_file.read())
    with open('lab_3/Thomas_Mann.txt', encoding='utf-8') as german_file:
        ger_text = tokenize_by_sentence(german_file.read())
    with open('lab_3/unknown_Arthur_Conan_Doyle.txt', encoding='utf-8') as unknown_file:
        unk_text = tokenize_by_sentence(unknown_file.read())

    # One shared storage so all three corpora are encoded consistently.
    letter_storage = LetterStorage()
    letter_storage.update(eng_text)
    letter_storage.update(ger_text)
    letter_storage.update(unk_text)

    english_encoded = encode_corpus(letter_storage, eng_text)
    german_encoded = encode_corpus(letter_storage, ger_text)
    unknown_encoded = encode_corpus(letter_storage, unk_text)

    language_detector.new_language(english_encoded, 'english')
    language_detector.new_language(german_encoded, 'german')

    # These are the stored 5-gram objects per language, not probabilities.
    eng_trie = language_detector.n_gram_storages['english'][5]
    ger_trie = language_detector.n_gram_storages['german'][5]

    ngram_trie = NGramTrie(5)
    ngram_trie.fill_n_grams(unknown_encoded)

    eng = language_detector._calculate_sentence_probability(
        eng_trie, ngram_trie.n_grams)
    ger = language_detector._calculate_sentence_probability(
        ger_trie, ngram_trie.n_grams)

    # assertGreater shows both values on failure, unlike assertTrue(a > b).
    self.assertGreater(ger, eng)
def test_fill_from_sentence_ideal(self):
    """A five-element sentence is expected to produce four bi-grams, each with frequency 1."""
    trie = NGramTrie(2)

    trie.fill_from_sentence((1, 2, 3, 4, 5))

    expected_counts = {(1, 2): 1, (2, 3): 1, (3, 4): 1, (4, 5): 1}
    self.assertEqual(trie.gram_frequencies, expected_counts)
def test_predict_next_sentence_no_match(self):
    """When no stored bi-gram starts with the prefix, only the prefix itself is expected back."""
    trie = NGramTrie(2)
    trie.gram_log_probabilities = {(4, 2): -0.18, (4, 3): -1.79}

    predicted = trie.predict_next_sentence((1, ))

    self.assertEqual(predicted, [1])
unknown_file.close() letter_storage = LetterStorage() letter_storage.update(text_eng) letter_storage.update(text_ger) letter_storage.update(text_unk) eng_encoded = encode_corpus(letter_storage, text_eng) unk_encoded = encode_corpus(letter_storage, text_unk) ger_encoded = encode_corpus(letter_storage, text_ger) language_detector = LanguageDetector((3, 4, 5), 1000) language_detector.new_language(eng_encoded, 'english') language_detector.new_language(ger_encoded, 'german') ngram_unknown = NGramTrie(4) ngram_unknown.fill_n_grams(unk_encoded) language_log_probability_dict = language_detector.detect_language(ngram_unknown.n_grams) if language_log_probability_dict['german'] >\ language_log_probability_dict['english']: RESULT = 'english' else: RESULT = 'german' print('this is a {} text.'.format(RESULT)) # DO NOT REMOVE NEXT LINE - KEEP IT INTENTIONALLY LAST assert RESULT == 'german', 'Not working'
def test_ngram_trie_check_creation(self):
    """A new trie is expected to store its size and start with empty frequency and log-probability dicts."""
    trie = NGramTrie(2)

    self.assertEqual(trie.size, 2)
    self.assertEqual(trie.gram_frequencies, {})
    self.assertEqual(trie.gram_log_probabilities, {})
def test_predict_next_sentence_wrong_size(self):
    """A two-element prefix given to a bi-gram trie is expected to yield an empty list."""
    trie = NGramTrie(2)

    predicted = trie.predict_next_sentence((1, 2))

    self.assertEqual(predicted, [])
def test_predict_next_sentence_simple_ideal(self):
    """Of two candidates for prefix (1,), the higher log-probability bi-gram (1, 2) is expected."""
    trie = NGramTrie(2)
    trie.gram_log_probabilities = {(1, 2): -0.18, (1, 3): -1.79}

    predicted = trie.predict_next_sentence((1, ))

    self.assertEqual(predicted, [1, 2])
def test_calculate_log_probabilities_empty(self):
    """With empty gram_frequencies, gram_log_probabilities is expected to be empty too."""
    trie = NGramTrie(2)
    trie.gram_frequencies = {}

    trie.calculate_log_probabilities()

    self.assertEqual(trie.gram_log_probabilities, {})
def test_fill_from_sentence_duplcicates_ideal(self):
    """Repeated bi-grams within one sentence are expected to accumulate their counts."""
    trie = NGramTrie(2)

    trie.fill_from_sentence((1, 2, 1, 2, 1, 2))

    expected_counts = {(1, 2): 3, (2, 1): 2}
    self.assertEqual(trie.gram_frequencies, expected_counts)
def test_fill_from_sentence_not_tuple(self):
    """A list sentence (not a tuple) is expected to leave gram_frequencies empty."""
    trie = NGramTrie(2)

    trie.fill_from_sentence([1, 2, 3, 4, 5])

    self.assertEqual(trie.gram_frequencies, {})
def test_fill_from_sentence_empty(self):
    """An empty tuple sentence is expected to leave gram_frequencies empty."""
    trie = NGramTrie(2)

    trie.fill_from_sentence(())

    self.assertEqual(trie.gram_frequencies, {})