示例#1
0
 def test_split_to_words_russian(self):
     '''
     split_to_words applied to QUOTE_FROM_TOLSTOY must return exactly the
     twelve Russian words listed below, in this order.
     '''
     words = textstatistics.split_to_words(QUOTE_FROM_TOLSTOY)
     self.assertEqual(words, [
         u'Не',
         u'слушайте',
         u'тех',
         u'кто',
         u'говорит',
         u'дурно',
         u'о',
         u'других',
         u'и',
         u'хорошо',
         u'о',
         u'вас',
     ])
示例#2
0
    def test_evalutate_decoding_subset_eng(self):
        '''
        A text made of the first half of the base text's words must be
        evaluated as 1.0 against a language built from the whole base text.
        '''
        source = 'This is a sample text.'
        source_words = textstatistics.split_to_words(source)
        # Keep only the first half of the words (floor division).
        half = len(source_words) // 2
        subset_text = ' '.join(source_words[:half])

        # The language model is built from the complete base text.
        language = textstatistics.Languauge(
            textstatistics.get_char_frequencies(source),
            textstatistics.get_word_frequencies(source))

        fitness = decode.evalutate_decoding(subset_text, language)
        self.assertEqual(fitness, 1.0)
示例#3
0
    def test_evalutate_decoding_subset_eng(self):
        '''
        A text made of the first half of the base text's words must be
        evaluated as 1.0 against a language built from the whole base text.
        '''
        base_text = 'This is a sample text.'
        base_text_words = textstatistics.split_to_words(base_text)
        # Floor division keeps the slice bound an integer: with true division
        # (Python 3, or "from __future__ import division") "/" yields a float
        # and slicing raises TypeError. On Python 2 ints "//" equals "/".
        half = len(base_text_words) // 2
        decoded_text = ' '.join(base_text_words[:half])

        # The language model is built from the complete base text.
        alphabet = textstatistics.get_char_frequencies(base_text)
        dictionary = textstatistics.get_word_frequencies(base_text)
        language = textstatistics.Languauge(alphabet, dictionary)

        result = decode.evalutate_decoding(decoded_text, language)
        self.assertEqual(result, 1.0)
示例#4
0
def evalutate_decoding(text, language):
    '''
    Evaluates how the decoded text corresponds to the language.

    The text is split with textstatistics.split_to_words and the result is
    the arithmetic mean of language.word_fitness(word) over those words.

    Returns estimated fitness as a float value from the range [0; 1], where
    0 means doesn't correspond at all,
    1 means all words are correct.
    (The range assumes word_fitness itself returns values in [0; 1].)

    A text that contains no words yields 0.0 instead of raising
    ZeroDivisionError.
    '''
    words = textstatistics.split_to_words(text)
    if not words:
        # Nothing to score: treat it as "doesn't correspond at all" rather
        # than dividing by zero below.
        return 0.0

    # Start from a float so the final division is never integer division.
    fitness_sum = 0.0
    for word in words:
        fitness_sum += language.word_fitness(word)
    return fitness_sum / len(words)
示例#5
0
def evalutate_decoding(text, language):
    '''
    Evaluates how the decoded text corresponds to the language.

    Splits the text into words (textstatistics.split_to_words), scores each
    word with language.word_fitness and averages the scores.

    Returns estimated fitness as a float value from the range [0; 1], where
    0 means doesn't correspond at all,
    1 means all words are correct.
    The range holds provided word_fitness returns values in [0; 1].

    When the text has no words at all, 0.0 is returned; previously this
    case raised ZeroDivisionError.
    '''
    words = textstatistics.split_to_words(text)
    word_count = len(words)
    if word_count == 0:
        # Guard the division below: an empty word list has no fitness.
        return 0.0

    # Float accumulator keeps the mean a float even for integer scores.
    total = 0.0
    for word in words:
        total += language.word_fitness(word)
    return total / word_count
示例#6
0
 def test_split_to_words_russian(self):
     '''
     Checks that split_to_words returns the expected list of Russian words
     for QUOTE_FROM_TOLSTOY.
     '''
     expected_words = [
         u'Не', u'слушайте', u'тех', u'кто',
         u'говорит', u'дурно', u'о', u'других',
         u'и', u'хорошо', u'о', u'вас',
     ]
     actual_words = textstatistics.split_to_words(QUOTE_FROM_TOLSTOY)
     self.assertEqual(actual_words, expected_words)
示例#7
0
 def test_split_to_words_english(self):
     '''
     split_to_words applied to QUOTE_FROM_SHAKESPEARE must return the six
     English words below, in order and with their original letter case.
     '''
     words = textstatistics.split_to_words(QUOTE_FROM_SHAKESPEARE)
     self.assertEqual(words, ['To', 'be', 'or', 'not', 'to', 'be'])
示例#8
0
 def test_split_to_words_english(self):
     '''
     Checks that split_to_words returns the expected English word list for
     QUOTE_FROM_SHAKESPEARE.
     '''
     expected_words = ['To', 'be', 'or', 'not', 'to', 'be']
     self.assertEqual(
         textstatistics.split_to_words(QUOTE_FROM_SHAKESPEARE),
         expected_words,
     )