def spell_check_split_words(self, indices, doc, language, register=-1): """ Split misspelled words based on spell-checker suggestions. Using this is usually not a good idea unless you have an insane dictionary that contains all possible compound words in `language`. Raise :exc:`enchant.Error` if dictionary instatiation fails. """ new_indices = [] new_texts = [] re_multispace = re.compile(r" +") checker = self._get_enchant_checker(language) indices = indices or self.get_all_indices() for index in indices: subtitle = self.subtitles[index] text = subtitle.get_text(doc) text = re_multispace.sub(" ", text) checker.set_text(text) while True: try: next(checker) except StopIteration: break if checker.word.capitalize() == checker.word: # Skip capitalized words, which are usually names # and thus not always found in dictionaries. continue suggestions = [] for i, suggestion in enumerate(checker.suggest()): if suggestion.find(" ") > 0: if suggestion.replace(" ", "") == checker.word: suggestions.append(suggestion) # Split word only if only one two-word suggestion found that # has all the same characters as the original unsplit word. if len(suggestions) != 1: continue text = checker.get_text() a = checker.wordpos z = checker.wordpos + len(checker.word) checker.set_text(text[:a] + suggestions[0] + text[z:]) new_text = checker.get_text() if new_text != text: new_indices.append(index) new_texts.append(new_text) if not new_indices: return self.replace_texts(new_indices, doc, new_texts, register=register) description = _("Splitting words by spell-check suggestions") self.set_action_description(register, description)
def spell_check(pkg, str, fmt, lang, ignored): dict_found = True warned = set() if enchant: if lang == 'C': lang = 'en_US' checker = _enchant_checkers.get(lang) if not checker and lang not in _enchant_checkers: try: checker = enchant.checker.SpellChecker( lang, filters=[enchant.tokenize.EmailFilter, enchant.tokenize.URLFilter, enchant.tokenize.WikiWordFilter]) except enchant.DictNotFoundError: printInfo(pkg, 'enchant-dictionary-not-found', lang) pass _enchant_checkers[lang] = checker if checker: # squeeze whitespace to ease leading context check checker.set_text(re.sub(r'\s+', ' ', str)) if use_utf8: uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper() else: uppername = pkg.name.upper() upperparts = uppername.split('-') if lang.startswith('en'): ups = [x + "'S" for x in upperparts] upperparts.extend(ups) for err in checker: # Skip already warned and ignored words if err.word in warned or err.word in ignored: continue # Skip all capitalized words that do not start a sentence if err.word[0].isupper() and not \ sentence_break_regex.search(checker.leading_context(3)): continue upperword = err.word.upper() # Skip all uppercase words if err.word == upperword: continue # Skip errors containing package name or equal to a # "component" of it, case insensitively if uppername in upperword or upperword in upperparts: continue # Work around enchant's digit tokenizing behavior: # http://github.com/rfk/pyenchant/issues/issue/3 if checker.leading_context(1).isdigit() or \ checker.trailing_context(1).isdigit(): continue # Warn and suggest sug = ', '.join(checker.suggest()[:3]) if sug: sug = '-> %s' % sug printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug) warned.add(err.word) else: dict_found = False if not enchant or not dict_found: for seq in str.split(): for word in re.split(r'[^a-z]+', seq.lower()): if len(word) == 0: continue correct = BAD_WORDS.get(word) if not correct: continue if word[0] == '\'': word = word[1:] if word[-1] == '\'': word = word[:-1] if word in warned or word in ignored: continue printWarning(pkg, 'spelling-error', fmt % lang, word, '->', correct) warned.add(word)
def spell_check(pkg, str, fmt, lang, ignored): dict_found = True warned = set() if enchant: if lang == 'C': lang = 'en_US' checker = _enchant_checkers.get(lang) if not checker and lang not in _enchant_checkers: try: checker = enchant.checker.SpellChecker( lang, filters=[enchant.tokenize.EmailFilter, enchant.tokenize.URLFilter, enchant.tokenize.WikiWordFilter]) except enchant.DictNotFoundError: printInfo(pkg, 'enchant-dictionary-not-found', lang) pass _enchant_checkers[lang] = checker if checker: # squeeze whitespace to ease leading context check checker.set_text(re.sub(r'\s+', ' ', str)) if use_utf8: uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper() else: uppername = pkg.name.upper() upperparts = uppername.split('-') if lang.startswith('en'): ups = [x + "'S" for x in upperparts] upperparts.extend(ups) for err in checker: # Skip already warned and ignored words if err.word in warned or err.word in ignored: continue # Skip all capitalized words that do not start a sentence if err.word[0].isupper() and not \ sentence_break_regex.search(checker.leading_context(3)): continue upperword = err.word.upper() # Skip all uppercase words if err.word == upperword: continue # Skip errors containing package name or equal to a # "component" of it, case insensitively if uppername in upperword or upperword in upperparts: continue # Work around enchant's digit tokenizing behavior: # http://github.com/rfk/pyenchant/issues/issue/3 if checker.leading_context(1).isdigit() or \ checker.trailing_context(1).isdigit(): continue # Warn and suggest sug = ', '.join(checker.suggest()[:3]) if sug: sug = '-> %s' % sug printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug) warned.add(err.word) else: dict_found = False if not enchant or not dict_found: for seq in str.split(): for word in re.split('[^a-z]+', seq.lower()): if len(word) == 0: continue correct = BAD_WORDS.get(word) if not correct: continue if word[0] == '\'': word = word[1:] if word[-1] == '\'': word = word[:-1] if word in warned or word in ignored: continue printWarning(pkg, 'spelling-error', fmt % lang, word, '->', correct) warned.add(word)