def test_split_to_words_russian(self):
    '''
    split_to_words applied to QUOTE_FROM_TOLSTOY must return exactly the
    list of Russian words below, in this order.
    '''
    expected_words = [
        u'Не',
        u'слушайте',
        u'тех',
        u'кто',
        u'говорит',
        u'дурно',
        u'о',
        u'других',
        u'и',
        u'хорошо',
        u'о',
        u'вас',
    ]
    actual_words = textstatistics.split_to_words(QUOTE_FROM_TOLSTOY)
    self.assertEqual(actual_words, expected_words)
def test_evalutate_decoding_subset_eng(self):
    '''
    A decoding made of the first half of the source words, evaluated
    against a language built from that same source text, must score 1.0.
    '''
    source = 'This is a sample text.'
    words = textstatistics.split_to_words(source)

    # Keep only the leading half of the words as the "decoded" text.
    half = len(words) // 2
    partial_decoding = ' '.join(words[:half])

    # The language is described by the character and word frequencies
    # of the full source text.
    language = textstatistics.Languauge(
        textstatistics.get_char_frequencies(source),
        textstatistics.get_word_frequencies(source),
    )

    fitness = decode.evalutate_decoding(partial_decoding, language)
    self.assertEqual(fitness, 1.0)
def test_evalutate_decoding_subset_eng(self):
    '''
    Evaluating a text that consists of the first half of the base text's
    words, against a language built from the base text itself, must give
    a fitness of exactly 1.0.
    '''
    base_text = 'This is a sample text.'
    base_text_words = textstatistics.split_to_words(base_text)

    # Floor division keeps the slice bound an integer. True division
    # returns a float on Python 3, which is rejected as a slice index.
    half_count = len(base_text_words) // 2
    decoded_text = ' '.join(base_text_words[:half_count])

    # Language built from the character and word frequencies of the
    # complete base text.
    alphabet = textstatistics.get_char_frequencies(base_text)
    dictionary = textstatistics.get_word_frequencies(base_text)
    language = textstatistics.Languauge(alphabet, dictionary)

    result = decode.evalutate_decoding(decoded_text, language)
    self.assertEqual(result, 1.0)
def evalutate_decoding(text, language):
    '''
    Evaluate how well the decoded text corresponds to the language.

    The text is split into words with textstatistics.split_to_words and
    the result is the mean of language.word_fitness(word) over them.

    Returns the estimated fitness as a float in the range [0; 1], where
    0 means the text does not correspond to the language at all and 1
    means all words are correct. A text that contains no words yields
    0.0.
    '''
    words = textstatistics.split_to_words(text)

    # Without this guard the average below would divide by zero for a
    # text with no words.
    if not words:
        return 0.0

    # Start from a float so the final division is never an integer one.
    fitness_sum = 0.0
    for word in words:
        fitness_sum += language.word_fitness(word)

    return fitness_sum / len(words)
def test_split_to_words_english(self):
    '''
    split_to_words applied to QUOTE_FROM_SHAKESPEARE must return exactly
    the list of English words below, in this order.
    '''
    expected_words = ['To', 'be', 'or', 'not', 'to', 'be']
    actual_words = textstatistics.split_to_words(QUOTE_FROM_SHAKESPEARE)
    self.assertEqual(actual_words, expected_words)