示例#1
0
    def test_replace_not_allowed(self):
        """Characters outside the alphabet are replaced (space by default,
        or an explicit replacement character)."""
        en_allowed = get_alphabet('en') + ' 0123456789'
        de_allowed = get_alphabet('de') + ' 0123456789'

        # umlauts are not in the English alphabet -> replaced
        assert_that(replace_not_allowed('färöer strasse', en_allowed),
                    is_('f r er strasse'))
        assert_that(replace_not_allowed('färöer strasse', en_allowed, '#'),
                    is_('f#r#er strasse'))
        # umlauts are in the German alphabet -> text stays unchanged
        assert_that(replace_not_allowed('färöer strasse', de_allowed),
                    is_('färöer strasse'))
        assert_that(replace_not_allowed('färöer strasse', de_allowed, '#'),
                    is_('färöer strasse'))
示例#2
0
 def test_unidecode_with_alphabet(self):
     """Transliteration only rewrites characters missing from the alphabet."""
     plain_ascii = string.ascii_lowercase
     ascii_plus_umlauts = string.ascii_lowercase + 'ÄäÖöÜü'
     # no umlauts allowed -> every special character is transliterated
     assert_that(unidecode_with_alphabet('Färöer Straße', plain_ascii),
                 is_('Faroer Strasse'))
     # umlauts allowed -> ä/ö are kept, ß is still transliterated
     assert_that(unidecode_with_alphabet('Färöer Straße', ascii_plus_umlauts),
                 is_('Färöer Strasse'))
     # apostrophes survive for both the English and the German alphabet
     for language in ('en', 'de'):
         assert_that(
             unidecode_with_alphabet("won't doesn't Straße",
                                     get_alphabet(language)),
             is_("won't doesn't Strasse"))
示例#3
0
def process_sentence(sent, language, min_words=0):
    """Tokenize *sent* and normalize each word against the language's alphabet.

    :param sent: sentence to process
    :param language: language used for tokenization and alphabet lookup
    :param min_words: minimum token count; shorter sentences yield ''
    :return: normalized sentence, or the empty string if below *min_words*
    """
    alphabet = get_alphabet(language)
    tokens = nltk.word_tokenize(sent, language=language)
    normalized = [normalize_word(token, alphabet) for token in tokens]
    if len(normalized) < min_words:
        return ''
    # drop words that normalized to '' to prevent multiple spaces
    return ' '.join(token for token in normalized if token).strip()
示例#4
0
def correction(sentence, language, lm=None, vocab=None):
    """
    Get most probable spelling correction for a given sentence.

    Beam search: each word is expanded into its spelling candidates, partial
    sentences are scored with the LM, and only the best `beam_width`
    hypotheses are kept per step.

    :param sentence: the sentence to correct
    :param language: the language of the sentence ('en', 'de', 'fr', 'it' or 'es')
    :param lm: n-gram LM to use to score sentence
    :param vocab: vocabulary of LM to use for spell checking
    :return: the corrected sentence (unchanged if no LM or vocabulary is given)
    """
    # NOTE: message previously only mentioned 'en'/'de' although five
    # languages are accepted
    assert language in ['en', 'de', 'fr', 'it', 'es'], \
        'language must be one of [\'en\', \'de\', \'fr\', \'it\', \'es\']'
    if not lm or not vocab:
        # nothing to score or spell-check against: return input unchanged
        return sentence
    alphabet = get_alphabet(language)
    beam_width = 1024
    layer = [(0, [])]  # list of (negated LM score, word-sequence)-pairs
    for word in sentence.split():
        # expand every hypothesis with every candidate spelling of `word`;
        # scores are negated so the best hypothesis has the smallest value
        layer = [(-score(node + [word_c], lm), node + [word_c])
                 for word_c in candidate_words(word, vocab, alphabet)
                 for _, node in layer]
        # keep only the `beam_width` best hypotheses. The previous
        # heapify-then-slice was wrong: the first k slots of a binary heap
        # are NOT the k smallest elements, so good hypotheses were dropped.
        layer.sort(key=lambda item: item[0])
        layer = layer[:beam_width]
    return ' '.join(layer[0][1])
示例#5
0
 def test_edits_1(self):
     """All single-edit variants of 'abc': deletes, swaps, replaces, inserts."""
     en_alphabet = get_alphabet('en')
     edits = lm_util.edits_1('abc', en_alphabet)
     # 3 deletes + 2 swaps + 3*27 replaces + 4*27 inserts (alphabet has 27 chars)
     assert_that(len(list(edits)), is_(3 + 2 + 81 + 108))
示例#6
0
 def test_inserts(self):
     """4 insert positions x 27 alphabet chars (a..z + apostrophe) = 108."""
     alphabet = get_alphabet('en')
     result = lm_util.inserts('abc', alphabet)
     # 4*27 = 108 inserts (the en alphabet has 27 characters, not 26):
     # aabc, babc, ..., abbc, acbc, ..., abac, abbc, abcc, ..., abca, abcb, abcc, ...
     assert_that(len(list(result)), is_(108))
示例#7
0
 def test_replaces(self):
     """3 positions x 27 alphabet chars (a..z + apostrophe) = 81 replaces."""
     alphabet = get_alphabet('en')
     result = lm_util.replaces('abc', alphabet)
     # 3*27 = 81 replaces (the en alphabet has 27 characters, not 26):
     # bbc, cbc, dbc, ..., aac, acc, adc, ..., aba, abb, abd, ...
     assert_that(len(list(result)), is_(81))
示例#8
0
 def test_get_alphabet(self):
     """Alphabet sizes per language: English is 26 letters + apostrophe."""
     assert_that(len(get_alphabet('en')), is_(27))  # a..z plus apostrophe
     # 29 chars; presumably a..z plus ä,ö,ü — exact composition to be
     # confirmed against get_alphabet's implementation
     assert_that(len(get_alphabet('de')), is_(29))
示例#9
0
def normalize(text, language):
    """Lowercase *text*, transliterate it against the language's alphabet,
    replace disallowed characters and collapse repeated spaces.

    :param text: raw input text
    :param language: language whose alphabet defines the allowed characters
    :return: normalized text
    """
    alphabet = get_alphabet(language)
    lowered = text.strip().lower()
    transliterated = unidecode_with_alphabet(lowered, alphabet)
    # spaces and digits are allowed in addition to the alphabet itself
    cleaned = replace_not_allowed(transliterated, alphabet + ' 0123456789')
    return remove_multi_spaces(cleaned)
示例#10
0
def normalize_sentence(sentence, language):
    """Normalize a sentence: collapse numbers to single digits, strip
    punctuation, transliterate against the language's alphabet and
    collapse repeated spaces.

    :param sentence: raw input sentence
    :param language: language whose alphabet drives transliteration
    :return: normalized sentence
    """
    digits_collapsed = replace_numeric(sentence, by_single_digit=True)
    no_punctuation = remove_punctuation(digits_collapsed)
    transliterated = unidecode_with_alphabet(no_punctuation,
                                             get_alphabet(language))
    return remove_multi_spaces(transliterated)