Пример #1
0
 def test_get_char_frequencies_simple_text(self):
     # Hand-tallied table the test expects for QUOTE_FROM_TOLSTOY.  Upper-
     # and lower-case letters are separate keys, and the space, comma and
     # full stop are counted like any other character.
     expected_counts = {
         u'Н': 1, u'е': 3, u' ': 11, u'с': 2, u'л': 1, u'у': 3, u'ш': 2,
         u'а': 2, u'й': 1, u'т': 4, u'х': 3, u',': 1, u'к': 1, u'о': 9,
         u'г': 2, u'в': 2, u'р': 4, u'и': 3, u'д': 2, u'н': 1, u'.': 1,
     }
     actual_counts = textstatistics.get_char_frequencies(QUOTE_FROM_TOLSTOY)
     self.assertDictEqual(actual_counts, expected_counts)
Пример #2
0
 def test_get_char_frequencies_simple_text(self):
     # One entry per distinct character of the sentence below; whitespace
     # and punctuation are part of the expected tally.
     expected_counts = {
         u'Н': 1, u'е': 3, u' ': 11,
         u'с': 2, u'л': 1, u'у': 3,
         u'ш': 2, u'а': 2, u'й': 1,
         u'т': 4, u'х': 3, u',': 1,
         u'к': 1, u'о': 9, u'г': 2,
         u'в': 2, u'р': 4, u'и': 3,
         u'д': 2, u'н': 1, u'.': 1,
     }
     sentence = u'Не слушайте тех, кто говорит дурно о других и хорошо о вас.'
     self.assertDictEqual(
         textstatistics.get_char_frequencies(sentence), expected_counts)
Пример #3
0
 def test_get_char_frequencies_simple_text(self):
     # Checks the counts reported for QUOTE_FROM_TOLSTOY against a table
     # tallied by hand (21 distinct characters, punctuation included).
     self.assertDictEqual(
         textstatistics.get_char_frequencies(QUOTE_FROM_TOLSTOY),
         {
             u'Н': 1, u'е': 3, u' ': 11, u'с': 2, u'л': 1,
             u'у': 3, u'ш': 2, u'а': 2, u'й': 1, u'т': 4,
             u'х': 3, u',': 1, u'к': 1, u'о': 9, u'г': 2,
             u'в': 2, u'р': 4, u'и': 3, u'д': 2, u'н': 1,
             u'.': 1,
         },
     )
Пример #4
0
    def test_evalutate_decoding_complete_eng(self):
        # The language model is built from the very text that is scored, so
        # the test expects the score to be exactly 1.0.
        sample = 'This is a sample text.'
        language = textstatistics.Languauge(
            textstatistics.get_char_frequencies(sample),
            textstatistics.get_word_frequencies(sample),
        )

        score = decode.evalutate_decoding(sample, language)
        self.assertEqual(score, 1.0)
Пример #5
0
    def test_evalutate_decoding_complete_eng(self):
        # Scoring a text against a model derived from that same text is
        # expected to yield 1.0.
        base_text = 'This is a sample text.'
        decoded_text = base_text

        char_stats = textstatistics.get_char_frequencies(base_text)
        word_stats = textstatistics.get_word_frequencies(base_text)
        model = textstatistics.Languauge(char_stats, word_stats)

        self.assertEqual(decode.evalutate_decoding(decoded_text, model), 1.0)
Пример #6
0
    def test_evalutate_decoding_almost_eng(self):
        # A decoding with a few letters mixed up must score strictly between
        # 0.5 and 1.0 against the model of the correct text.
        base_text = 'This is a sample text.'
        language = textstatistics.Languauge(
            textstatistics.get_char_frequencies(base_text),
            textstatistics.get_word_frequencies(base_text),
        )

        score = decode.evalutate_decoding('Thas as i simple text.', language)
        self.assertTrue(0.5 < score < 1.0)
Пример #7
0
    def test_evalutate_decoding_almost_eng(self):
        # The near-miss decoding is expected to score below a perfect 1.0
        # but still above 0.5.
        reference_text = 'This is a sample text.'
        near_miss_text = 'Thas as i simple text.'

        char_stats = textstatistics.get_char_frequencies(reference_text)
        word_stats = textstatistics.get_word_frequencies(reference_text)
        model = textstatistics.Languauge(char_stats, word_stats)

        score = decode.evalutate_decoding(near_miss_text, model)
        below_perfect = score < 1.0
        above_half = score > 0.5
        self.assertTrue(below_perfect and above_half)
Пример #8
0
    def test_evalutate_decoding_subset_eng(self):
        # A decoding made only of the first half of the base text's words is
        # still expected to score 1.0 against that text's own model.
        base_text = 'This is a sample text.'
        words = textstatistics.split_to_words(base_text)
        half = len(words) // 2
        decoded_text = ' '.join(words[:half])

        language = textstatistics.Languauge(
            textstatistics.get_char_frequencies(base_text),
            textstatistics.get_word_frequencies(base_text),
        )

        score = decode.evalutate_decoding(decoded_text, language)
        self.assertEqual(score, 1.0)
Пример #9
0
    def test_evalutate_decoding_subset_eng(self):
        # A decoding made only of the first half of the base text's words is
        # still expected to score 1.0 against that text's own model.
        base_text = 'This is a sample text.'
        base_text_words = textstatistics.split_to_words(base_text)
        # Floor division: a slice bound must be an integer, and `/` yields a
        # float on Python 3 (TypeError in the slice).
        half = len(base_text_words) // 2
        decoded_text = ' '.join(base_text_words[:half])

        alphabet = textstatistics.get_char_frequencies(base_text)
        dictionary = textstatistics.get_word_frequencies(base_text)
        language = textstatistics.Languauge(alphabet, dictionary)

        result = decode.evalutate_decoding(decoded_text, language)
        self.assertEqual(result, 1.0)
Пример #10
0
 def test_my_get_char(self):
     # 'l' and '!' occur three times, 'o' twice, every other character once.
     greeting = u'hello world!!!'
     expected_counts = {
         u'h': 1, u'e': 1, u'l': 3, u'o': 2, u' ': 1,
         u'w': 1, u'r': 1, u'd': 1, u'!': 3,
     }
     self.assertDictEqual(
         textstatistics.get_char_frequencies(greeting), expected_counts)
Пример #11
0
    def setUp(self):
        # Fixture: a language model built from the Ilf and Petrov quote, plus
        # a reproducible substitution table over the letters found in it.
        original_alphabet = set(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

        self.original_text = data.QUOTE_FROM_ILF_AND_PETROV
        char_frequencies = textstatistics.get_char_frequencies(
            self.original_text)
        # Keep only the lowercase Russian letters.  .items() works on both
        # Python 2 and 3, whereas .iteritems() is gone from Python 3 dicts.
        alphabet = {char: frequency
                    for (char, frequency) in char_frequencies.items()
                    if char in original_alphabet}
        dictionary = textstatistics.get_word_frequencies(self.original_text)
        self.language = textstatistics.Languauge(alphabet, dictionary)

        # Materialise the keys once so both sides of the zip below see the
        # same, fixed ordering.
        actual_original_alphabet = list(alphabet)
        shuffled_alphabet = list(actual_original_alphabet)
        # Fixed seed: the shuffled cipher is identical on every run.
        random.seed(1001)
        random.shuffle(shuffled_alphabet)
        self.code = dict(zip(actual_original_alphabet, shuffled_alphabet))
Пример #12
0
    def setUp(self):
        # Fixture: language model of the Ilf and Petrov quote and a seeded
        # random substitution table over the letters that occur in it.
        self.original_text = data.QUOTE_FROM_ILF_AND_PETROV
        original_alphabet = list(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')

        # Drop every counted character that is not a lowercase Russian letter.
        frequencies = textstatistics.get_char_frequencies(self.original_text)
        alphabet = {}
        for char, frequency in frequencies.items():
            if char in original_alphabet:
                alphabet[char] = frequency

        self.language = textstatistics.Languauge(
            alphabet,
            textstatistics.get_word_frequencies(self.original_text),
        )

        # Map each letter to its partner in a shuffled copy of the same
        # letters; the fixed seed makes the mapping repeatable.
        source_letters = alphabet.keys()
        target_letters = list(source_letters)
        random.seed(1001)
        random.shuffle(target_letters)
        self.code = dict(zip(source_letters, target_letters))
Пример #13
0
 def test_get_char_frequencies_uniform(self):
     # Every character of the input is expected to be reported with a
     # count of one.
     text = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
     expected = dict.fromkeys(text, 1)
     self.assertDictEqual(textstatistics.get_char_frequencies(text), expected)
Пример #14
0
def decode_text(text, language):
    '''
    Decodes the text encoded with a substitution cipher

    The first guess pairs the characters of the language alphabet with the
    characters counted in the encoded text by frequency rank.  The guess is
    then refined by hill climbing: two entries of the ranked language
    alphabet are swapped, and the swap is kept only when
    evalutate_decoding() scores the resulting text strictly higher.

    text -- the encoded text; only lowercase Russian letters are
        substituted, every other character is copied through unchanged
    language -- object whose get_alphabet() returns a mapping of character
        to frequency; it is also handed to evalutate_decoding()

    Returns the decoded text as soon as its score reaches 1.0, or the best
    decoding found once a full pass over all swaps brings no improvement.
    '''
    original_alphabet = list(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
    cipher_chars = set(original_alphabet)
    # Largest distance between two swapped ranks that is tried.
    max_swap_distance = len(original_alphabet) - 1

    alphabet = dict(language.get_alphabet().items())
    alphabet_encode_text = {
        char: frequency
        for (char, frequency)
        in textstatistics.get_char_frequencies(text).items()
        if char in cipher_chars
    }
    # Characters the language knows but the text never uses still need a
    # rank, so they enter the table with a frequency of zero.
    for char in alphabet:
        alphabet_encode_text.setdefault(char, 0)

    # Both tables ordered from the rarest to the most frequent character.
    list_alphabet = sorted(alphabet.items(), key=lambda item: item[1])
    list_alphabet_encode_text = sorted(alphabet_encode_text.items(),
                                       key=lambda item: item[1])

    def decode_with(ranked_alphabet):
        # Pair the tables rank by rank and apply the substitution to text.
        # NOTE: a cipher letter left without a partner (tables of unequal
        # length) raises KeyError here.
        char_to_char = {
            encoded_item[0]: plain_item[0]
            for (plain_item, encoded_item)
            in zip(ranked_alphabet, list_alphabet_encode_text)
        }
        return ''.join(
            char_to_char[char] if char in cipher_chars else char
            for char in text)

    decoded_text = decode_with(list_alphabet)
    word_fitness = evalutate_decoding(decoded_text, language)

    swap_limit = min(len(list_alphabet), len(list_alphabet_encode_text))
    improved = True
    # A pass that accepts no swap leaves the state untouched, so repeating it
    # could never help; stopping there prevents an endless loop on texts
    # whose score cannot reach 1.0.
    while word_fitness != 1.0 and improved:
        improved = False
        for n in range(1, max_swap_distance + 1):
            for i in range(swap_limit - n):
                candidate = list(list_alphabet)
                candidate[i], candidate[i + n] = (candidate[i + n],
                                                  candidate[i])

                candidate_text = decode_with(candidate)
                new_word_fitness = evalutate_decoding(candidate_text,
                                                      language)

                # Strict improvement only; otherwise the swap is discarded.
                if new_word_fitness > word_fitness:
                    list_alphabet = candidate
                    decoded_text = candidate_text
                    word_fitness = new_word_fitness
                    improved = True
                    if word_fitness == 1.0:
                        return decoded_text

    return decoded_text
Пример #15
0
 def test_get_char_frequencies_empty(self):
     # An empty string is expected to produce an empty frequency table.
     self.assertDictEqual(textstatistics.get_char_frequencies(u''), {})
Пример #16
0
 def test_get_char_frequencies_empty(self):
     # No characters in, no counts out.
     empty_text = u''
     no_counts = {}
     actual_counts = textstatistics.get_char_frequencies(empty_text)
     self.assertDictEqual(actual_counts, no_counts)
Пример #17
0
 def test_get_char_frequencies_uniform(self):
     # The lowercase Russian alphabet plus a space, each character appearing
     # once, so every expected count is 1.
     alphabet_and_space = u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя '
     actual_counts = textstatistics.get_char_frequencies(alphabet_and_space)
     self.assertDictEqual(actual_counts,
                          dict.fromkeys(alphabet_and_space, 1))