def test_replace_not_allowed(self):
    alphabet_en_space_nums = get_alphabet('en') + ' 0123456789'
    alphabet_de_space_nums = get_alphabet('de') + ' 0123456789'
    assert_that(
        replace_not_allowed('färöer strasse', alphabet_en_space_nums),
        is_('f r er strasse'))
    assert_that(
        replace_not_allowed('färöer strasse', alphabet_en_space_nums, '#'),
        is_('f#r#er strasse'))
    assert_that(
        replace_not_allowed('färöer strasse', alphabet_de_space_nums),
        is_('färöer strasse'))
    assert_that(
        replace_not_allowed('färöer strasse', alphabet_de_space_nums, '#'),
        is_('färöer strasse'))
def test_unidecode_with_alphabet(self):
    ascii_chars = string.ascii_lowercase
    ascii_chars_with_umlauts = string.ascii_lowercase + 'ÄäÖöÜü'
    assert_that(unidecode_with_alphabet('Färöer Straße', ascii_chars),
                is_('Faroer Strasse'))
    assert_that(
        unidecode_with_alphabet('Färöer Straße', ascii_chars_with_umlauts),
        is_('Färöer Strasse'))
    assert_that(
        unidecode_with_alphabet("won't doesn't Straße", get_alphabet('en')),
        is_("won't doesn't Strasse"))
    assert_that(
        unidecode_with_alphabet("won't doesn't Straße", get_alphabet('de')),
        is_("won't doesn't Strasse"))
def process_sentence(sent, language, min_words=0):
    alphabet = get_alphabet(language)
    words = [
        normalize_word(word, alphabet)
        for word in nltk.word_tokenize(sent, language=language)
    ]
    if len(words) >= min_words:
        return ' '.join(w for w in words if w).strip()  # prevent multiple spaces
    return ''
def correction(sentence, language, lm=None, vocab=None):
    """
    Get the most probable spelling correction for a given sentence.

    :param sentence: the sentence to correct
    :param language: the language of the sentence
    :param lm: n-gram LM used to score candidate sentences
    :param vocab: vocabulary of the LM to use for spell checking
    :return: the corrected sentence (unchanged if no LM or vocabulary is supplied)
    """
    assert language in ['en', 'de', 'fr', 'it', 'es'], \
        "language must be one of ['en', 'de', 'fr', 'it', 'es']"
    if not lm or not vocab:
        return sentence

    alphabet = get_alphabet(language)
    beam_width = 1024
    layer = [(0, [])]  # list of (cost, word-sequence) pairs; cost = -LM score
    for word in sentence.split():
        layer = [(-score(node + [word_c], lm), node + [word_c])
                 for word_c in candidate_words(word, vocab, alphabet)
                 for sc, node in layer]
        # keep only the beam_width lowest-cost hypotheses (requires heapq.nsmallest)
        layer = nsmallest(beam_width, layer)
    return ' '.join(layer[0][1])
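# For reference, a minimal self-contained sketch (not part of the project code)
# of the pruning step above: nsmallest keeps the beam_width lowest-cost
# hypotheses, i.e. the ones with the highest LM score.
from heapq import nsmallest

def prune_beam(layer, beam_width):
    """Keep the beam_width (cost, word-sequence) pairs with the lowest cost."""
    return nsmallest(beam_width, layer)

# Example: with beam_width=2 only the two cheapest hypotheses survive.
example_layer = [(3.2, ['the', 'cat']), (1.1, ['the', 'hat']), (2.5, ['the', 'rat'])]
assert prune_beam(example_layer, 2) == [(1.1, ['the', 'hat']), (2.5, ['the', 'rat'])]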
def test_edits_1(self):
    alphabet = get_alphabet('en')
    result = lm_util.edits_1('abc', alphabet)
    # 3 deletes + 2 swaps + 3*27 replaces + 4*27 inserts
    assert_that(len(list(result)), is_(3 + 2 + 81 + 108))
def test_inserts(self):
    alphabet = get_alphabet('en')
    result = lm_util.inserts('abc', alphabet)
    # 4*27 = 108 inserts: aabc, babc, ..., abac, abbc, ..., abca, abcb, abcc, ...
    assert_that(len(list(result)), is_(108))
def test_replaces(self):
    alphabet = get_alphabet('en')
    result = lm_util.replaces('abc', alphabet)
    # 3*27 = 81 replaces: bbc, cbc, dbc, ..., aac, acc, adc, ..., aba, abb, abd, ...
    assert_that(len(list(result)), is_(81))
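# The counts asserted above (3*27 replaces and 4*27 inserts for 'abc' with a
# 27-character alphabet) match Norvig-style edit generators. A plausible sketch
# consistent with those counts -- not necessarily lm_util's actual code:
def sketch_splits(word):
    return [(word[:i], word[i:]) for i in range(len(word) + 1)]

def sketch_inserts(word, alphabet):
    # one candidate per position (len+1) and per alphabet character
    return [left + c + right for left, right in sketch_splits(word) for c in alphabet]

def sketch_replaces(word, alphabet):
    # one candidate per character of the word and per alphabet character
    return [left + c + right[1:] for left, right in sketch_splits(word) if right for c in alphabet]

assert len(sketch_inserts('abc', "abcdefghijklmnopqrstuvwxyz'")) == 108
assert len(sketch_replaces('abc', "abcdefghijklmnopqrstuvwxyz'")) == 81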
def test_get_alphabet(self):
    assert_that(len(get_alphabet('en')), is_(27))  # a..z plus apostrophe
    assert_that(len(get_alphabet('de')), is_(29))  # a..z plus ä, ö, ü
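# A hedged stand-in consistent with the lengths asserted above (27 for 'en',
# 29 for 'de'); the project's get_alphabet may be implemented differently.
import string

def sketch_get_alphabet(language):
    if language == 'en':
        return string.ascii_lowercase + "'"    # a..z plus apostrophe -> 27 characters
    if language == 'de':
        return string.ascii_lowercase + 'äöü'  # a..z plus umlauts -> 29 characters
    raise ValueError(f'unsupported language: {language}')

assert len(sketch_get_alphabet('en')) == 27
assert len(sketch_get_alphabet('de')) == 29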
def normalize(text, language):
    text = text.strip().lower()
    alphabet = get_alphabet(language)
    text = unidecode_with_alphabet(text, alphabet)
    text = replace_not_allowed(text, alphabet + ' 0123456789')
    return remove_multi_spaces(text)
def normalize_sentence(sentence, language):
    return remove_multi_spaces(
        unidecode_with_alphabet(
            remove_punctuation(replace_numeric(sentence, by_single_digit=True)),
            get_alphabet(language)))
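# Illustrative stand-ins (assumptions, not the project's helpers) showing the
# same pipeline shape with stdlib/regex only: digits collapsed to a single
# placeholder digit, punctuation stripped, multiple spaces collapsed. The
# unidecode step is omitted here since it needs the third-party unidecode package.
import re

def sketch_replace_numeric(text, by_single_digit=True):
    return re.sub(r'\d+', '1', text) if by_single_digit else text

def sketch_remove_punctuation(text):
    return re.sub(r"[^\w\s']", ' ', text)

def sketch_remove_multi_spaces(text):
    return re.sub(r'\s+', ' ', text).strip()

def sketch_normalize_sentence(sentence):
    return sketch_remove_multi_spaces(
        sketch_remove_punctuation(sketch_replace_numeric(sentence)))

print(sketch_normalize_sentence('Bus 23,  Färöer-Str.'))  # -> 'Bus 1 Färöer Str'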