def initialize():
    """Interactively spell-check one column of a CSV file and save the result.

    Prompts for the CSV file to read, the column (field) to check and the
    name of the output file.  Every value of the chosen column is run through
    an ``en_US`` enchant ``SpellChecker``; for each misspelled word the user
    picks a suggestion by index, types a replacement (``e``) or ignores the
    word (``i``).  The corrected texts are handed to ``write_fixed_file``.

    An answer that is neither ``i``, ``e`` nor a valid index into the
    suggestion list no longer aborts the session; the prompt is repeated.
    """
    chk_file = raw_input("What is the file to spellcheck? ")
    field = raw_input("What FIELD do you want to spellcheck? ")
    s_file = raw_input("What is name of final file? ")
    checker = enchant.checker.SpellChecker("en_US")
    file_data = pd.read_csv(chk_file)
    # Render every value of the chosen column as text, row by row.
    fields = list(file_data.apply(lambda x: '%s' % (x[field]), axis=1))
    # maybe i don't even need this...
    # fields = strip_html(fields)
    corrected_text = []
    for data_field in fields:
        checker.set_text(str(data_field))
        for err in checker:
            suggestions = err.suggest()
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(err.word)
            print(suggestions)
            while True:
                correct = raw_input("provide 0-index int of correct word or i to ignore, e to edit ")
                if correct == 'i':
                    break
                if correct == 'e':
                    err.replace(raw_input(""))
                    break
                try:
                    suggest = suggestions[int(correct)]
                except (ValueError, IndexError):
                    # Not a number, or no suggestion at that index: ask again
                    # instead of losing all corrections made so far.
                    continue
                err.replace(suggest)
                break
        corrected_text.append(checker.get_text())
    write_fixed_file(corrected_text, s_file)
def do_check(checker, to_check):
    """Run an interactive command-line spell check over each text, in place.

    :param checker: spell checker object; it is loaded with each text via
        ``set_text`` and read back via ``get_text``.
    :param to_check: mutable sequence of texts; every element is replaced
        with the checker's text after the interactive session for it ends.

    Elements are written back by position.  Looking the slot up again with
    ``to_check.index(text)`` would hit the first equal element, which is the
    wrong one when the list holds duplicates or when an earlier correction
    happens to equal a later entry.
    """
    for position, text in enumerate(to_check):
        checker.set_text(text)
        cmdline_checker = CmdLineChecker()
        cmdline_checker.set_checker(checker)
        cmdline_checker.run()
        to_check[position] = checker.get_text()
def spellcheck(self, text, tld=''):
    """Detect the language of ``text`` and collect its misspelled words.

    The language is detected with ``cld.detect`` (``tld`` is passed as the
    top-level-domain hint), then an enchant ``SpellChecker`` for the detected
    language code is run over the text with e-mail and URL filters.

    Misspelled words shorter than 128 characters are stored as ``BadWord``
    rows and then passed through ``BadWord.filter_bad_words``.

    :returns: ``(lang_name, set_of_errors)``; ``(None, set())`` when the
        language is unknown or not reliably detected, or when enchant has no
        dictionary for the detected language code.
    """
    from scanner.models import BadWord

    # guess language code
    self.log.debug(' * guessing language...')
    detection = cld.detect(text.encode('utf-8'), hintTopLevelDomain=tld)
    lang_name, lang_code, reliable, bytes_found, details = detection
    self.log.debug(' -> detected lang: %s (%s)' % (lang_name, lang_code))

    undetected = 'UNKNOWN' in (lang_code.upper(), lang_name.upper())
    if undetected or not reliable:
        self.log.warning(
            ' -> Cannot detect language of page - end : %s' % details)
        return None, set()

    self.log.debug(' * searching for dictionary')
    token_filters = [
        EmailFilter,
        URLFilter,
    ]
    try:
        checker = enchant.checker.SpellChecker(lang_code, filters=token_filters)
    except enchant.DictNotFoundError:
        # Languages listed in not_supported_lang are only logged at debug level.
        if lang_code not in self.not_supported_lang:
            self.log.error(
                " -> Cannot find language for spellchecker for %s - end"
                % lang_code)
        else:
            self.log.debug(
                " -> Cannot find language for spellchecker for %s - end (blacklisted)"
                % lang_code)
        return None, set()

    # checking page for bad words
    self.log.debug(' * check spelling...')
    checker.set_text(text)
    self.log.debug(' -> ok')

    self.log.debug(' * get errors...')
    errors = []
    for mistake in checker:
        if len(mistake.word) < 128:
            errors.append(mistake.word)
    self.log.debug(' -> ok')

    self.log.debug(' * found %d bad words and adding them to DB' % len(errors))
    records = [BadWord(word=found.strip().lower()) for found in errors]
    BadWord.objects.bulk_create(records)
    self.log.debug(' -> ok')

    self.log.debug(' * call filtering bad words')
    errors = BadWord.filter_bad_words(errors)
    self.log.debug(' -> ok')

    self.log.debug(' * after filtering out there is %d errors (%s)' %
                   (len(errors), errors))
    return lang_name, set(errors)
def spell_check_split_words(self, indices, doc, language, register=-1):
    """
    Split misspelled words where the spell-checker suggests a split form.

    A word is split only if exactly one suggestion contains a space and is
    equal to the word once its spaces are removed. Capitalized words are
    skipped. Using this is usually not a good idea unless the dictionary
    for `language` contains practically all possible compound words.

    Raise :exc:`enchant.Error` if dictionary instantiation fails.
    """
    changed_indices = []
    changed_texts = []
    multispace = re.compile(r" +")
    checker = self._get_enchant_checker(language)
    if not indices:
        indices = self.get_all_indices()
    for index in indices:
        text = self.subtitles[index].get_text(doc)
        text = multispace.sub(" ", text)
        checker.set_text(text)
        while True:
            try:
                next(checker)
            except StopIteration:
                break
            word = checker.word
            if word == word.capitalize():
                # Capitalized words are usually names, which
                # dictionaries often do not contain; leave them alone.
                continue
            candidates = [
                option for option in checker.suggest()
                if option.find(" ") > 0 and option.replace(" ", "") == word
            ]
            # Only an unambiguous split is applied: exactly one spaced
            # suggestion made up of the same characters as the word.
            if len(candidates) != 1:
                continue
            text = checker.get_text()
            start = checker.wordpos
            end = start + len(word)
            checker.set_text(text[:start] + candidates[0] + text[end:])
        result = checker.get_text()
        if result != text:
            changed_indices.append(index)
            changed_texts.append(result)
    if not changed_indices:
        return
    self.replace_texts(changed_indices, doc, changed_texts, register=register)
    description = _("Splitting words by spell-check suggestions")
    self.set_action_description(register, description)
def spell_check_join_words(self, indices, doc, language, register=-1):
    """
    Join misspelled words based on spell-checker suggestions.

    For every misspelled word, try removing the space before it and the
    space after it. If exactly one of the two joins yields text in which the
    joined word is no longer reported as misspelled, apply that join.
    `indices` can be empty or ``None`` to process all subtitles.

    Raise :exc:`enchant.Error` if dictionary instantiation fails.
    """
    new_indices = []
    new_texts = []
    re_multispace = re.compile(r" +")
    checker = self._get_enchant_checker(language)
    seeker = self._get_enchant_checker(language)
    for index in indices or self.get_all_indices():
        subtitle = self.subtitles[index]
        # Keep the normalized starting text separately: the loop below works
        # on the checker's current text, which changes after every join.
        original = re_multispace.sub(" ", subtitle.get_text(doc))
        checker.set_text(original)
        while True:
            try:
                next(checker)
            except StopIteration:
                break
            text = checker.get_text()
            a = checker.wordpos
            z = checker.wordpos + len(checker.word)
            ok_with_prev = ok_with_next = False
            if checker.leading_context(1) == " ":
                seeker.set_text(text[:a - 1] + text[a:])
                poss = self._get_misspelled_indices(seeker)
                ok_with_prev = (a - 1) not in poss
            if checker.trailing_context(1) == " ":
                seeker.set_text(text[:z] + text[z + 1:])
                poss = self._get_misspelled_indices(seeker)
                ok_with_next = a not in poss
            # Join backwards or forwards if only one direction,
            # but not both, produce a correctly spelled result.
            if ok_with_prev and not ok_with_next:
                checker.set_text(text[:a - 1] + text[a:])
            if ok_with_next and not ok_with_prev:
                checker.set_text(text[:z] + text[z + 1:])
        new_text = checker.get_text()
        # Compare against the starting text, not the loop's working copy;
        # otherwise a join followed by another misspelled word that is left
        # untouched would compare equal and the join would be dropped.
        if new_text != original:
            new_indices.append(index)
            new_texts.append(new_text)
    if not new_indices:
        return
    self.replace_texts(new_indices, doc, new_texts, register=register)
    description = _("Joining words by spell-check suggestions")
    self.set_action_description(register, description)
def spell_check(pkg, str, fmt, lang, ignored):
    """Report spelling errors found in the text ``str`` of package ``pkg``.

    When the ``enchant`` module is available and a dictionary for ``lang``
    exists (``'C'`` is treated as ``'en_US'``), the text is checked with a
    per-language ``SpellChecker`` cached in ``_enchant_checkers``.  Otherwise
    the words are looked up in the ``BAD_WORDS`` table.  Each finding is
    reported once through ``printWarning`` as ``spelling-error``, labelled
    with ``fmt % lang``; words listed in ``ignored`` are never reported.
    """
    have_dict = True
    reported = set()

    if enchant:
        if lang == 'C':
            lang = 'en_US'

        checker = _enchant_checkers.get(lang)
        if not checker and lang not in _enchant_checkers:
            try:
                checker = enchant.checker.SpellChecker(
                    lang, filters=[enchant.tokenize.EmailFilter,
                                   enchant.tokenize.URLFilter,
                                   enchant.tokenize.WikiWordFilter])
            except enchant.DictNotFoundError:
                printInfo(pkg, 'enchant-dictionary-not-found', lang)
            # A failed lookup is cached too (as None) so it is not retried.
            _enchant_checkers[lang] = checker

        if not checker:
            have_dict = False
        else:
            # squeeze whitespace to ease leading context check
            checker.set_text(re.sub(r'\s+', ' ', str))

            if use_utf8:
                uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper()
            else:
                uppername = pkg.name.upper()
            upperparts = uppername.split('-')
            if lang.startswith('en'):
                # Also accept the possessive form of each name component.
                upperparts += [part + "'S" for part in upperparts]

            for err in checker:
                misspelled = err.word

                # Skip already warned and ignored words
                if misspelled in reported or misspelled in ignored:
                    continue

                # Skip all capitalized words that do not start a sentence
                if misspelled[0].isupper() and not \
                        sentence_break_regex.search(checker.leading_context(3)):
                    continue

                upperword = misspelled.upper()

                # Skip all uppercase words
                if misspelled == upperword:
                    continue

                # Skip errors containing package name or equal to a
                # "component" of it, case insensitively
                if uppername in upperword or upperword in upperparts:
                    continue

                # Work around enchant's digit tokenizing behavior:
                # http://github.com/rfk/pyenchant/issues/issue/3
                if checker.leading_context(1).isdigit() or \
                        checker.trailing_context(1).isdigit():
                    continue

                # Warn and suggest
                hint = ', '.join(checker.suggest()[:3])
                if hint:
                    hint = '-> %s' % hint
                printWarning(pkg, 'spelling-error', fmt % lang, misspelled, hint)
                reported.add(misspelled)

    if not (enchant and have_dict):
        # Fallback: look the lowercased alphabetic pieces up in BAD_WORDS.
        for chunk in str.split():
            for word in re.split(r'[^a-z]+', chunk.lower()):
                if not word:
                    continue
                correct = BAD_WORDS.get(word)
                if not correct:
                    continue
                if word[0] == '\'':
                    word = word[1:]
                if word[-1] == '\'':
                    word = word[:-1]
                if word in reported or word in ignored:
                    continue
                printWarning(pkg, 'spelling-error', fmt % lang, word, '->',
                             correct)
                reported.add(word)
def spell_check(pkg, str, fmt, lang, ignored):
    """Report spelling errors found in the text ``str`` of package ``pkg``.

    :param pkg: package being checked; passed to ``printWarning`` and
        ``printInfo`` and used to obtain the package name.
    :param str: the text to check (the name shadows the builtin but is part
        of the existing signature).
    :param fmt: format string; ``fmt % lang`` labels each reported error.
    :param lang: language code of the text; ``'C'`` is checked as ``'en_US'``
        when enchant is used.
    :param ignored: container of words that must never be reported.

    When the ``enchant`` module is available and has a dictionary for the
    language, the text is checked with a ``SpellChecker`` cached per language
    in ``_enchant_checkers``.  Otherwise the words are looked up in the
    ``BAD_WORDS`` table.  Every word is reported at most once.
    """
    dict_found = True
    warned = set()

    if enchant:
        if lang == 'C':
            lang = 'en_US'

        checker = _enchant_checkers.get(lang)
        if not checker and lang not in _enchant_checkers:
            try:
                checker = enchant.checker.SpellChecker(
                    lang, filters=[enchant.tokenize.EmailFilter,
                                   enchant.tokenize.URLFilter,
                                   enchant.tokenize.WikiWordFilter])
            except enchant.DictNotFoundError:
                printInfo(pkg, 'enchant-dictionary-not-found', lang)

            # Cache the outcome, including a failed lookup (None), so the
            # dictionary search is not repeated for this language.
            _enchant_checkers[lang] = checker

        if checker:
            # squeeze whitespace to ease leading context check
            checker.set_text(re.sub(r'\s+', ' ', str))
            if use_utf8:
                uppername = Pkg.to_unicode(pkg.header[rpm.RPMTAG_NAME]).upper()
            else:
                uppername = pkg.name.upper()
            upperparts = uppername.split('-')
            if lang.startswith('en'):
                # Also accept the possessive form of each name component.
                ups = [x + "'S" for x in upperparts]
                upperparts.extend(ups)
            for err in checker:

                # Skip already warned and ignored words
                if err.word in warned or err.word in ignored:
                    continue

                # Skip all capitalized words that do not start a sentence
                if err.word[0].isupper() and not \
                        sentence_break_regex.search(checker.leading_context(3)):
                    continue

                upperword = err.word.upper()

                # Skip all uppercase words
                if err.word == upperword:
                    continue

                # Skip errors containing package name or equal to a
                # "component" of it, case insensitively
                if uppername in upperword or upperword in upperparts:
                    continue

                # Work around enchant's digit tokenizing behavior:
                # http://github.com/rfk/pyenchant/issues/issue/3
                if checker.leading_context(1).isdigit() or \
                        checker.trailing_context(1).isdigit():
                    continue

                # Warn and suggest
                sug = ', '.join(checker.suggest()[:3])
                if sug:
                    sug = '-> %s' % sug
                printWarning(pkg, 'spelling-error', fmt % lang, err.word, sug)
                warned.add(err.word)

        else:
            dict_found = False

    if not enchant or not dict_found:
        for seq in str.split():
            # The pieces consist solely of the letters a-z, so they can never
            # start or end with an apostrophe; the former quote-stripping
            # branches were unreachable and have been dropped.
            for word in re.split(r'[^a-z]+', seq.lower()):
                if not word:
                    continue
                correct = BAD_WORDS.get(word)
                if not correct:
                    continue
                if word in warned or word in ignored:
                    continue
                printWarning(pkg, 'spelling-error', fmt % lang, word, '->',
                             correct)
                warned.add(word)
def check_lang(t):
    """Guess the language of ``t`` and list the words enchant flags in it.

    The language code comes from ``guess_language.guessLanguageInfo``; the
    text is then checked with an enchant ``SpellChecker`` for that code,
    using the e-mail and URL filters.

    :returns: ``(language_code, misspelled_words)`` where the second item is
        a list of the flagged words in the order the checker yields them.
    """
    # Only the language code is needed; number and name are discarded.
    code, _number, _name = guess_language.guessLanguageInfo(t)
    token_filters = [EmailFilter, URLFilter]
    checker = enchant.checker.SpellChecker(code, filters=token_filters)
    checker.set_text(t)
    misspelled = []
    for error in checker:
        misspelled.append(error.word)
    return code, misspelled