import codecs
import os
import re
from collections import Counter

import spell_checker


def aspell_clean():
    starts_with_cap = re.compile('^[A-Z]')
    lang = get_lang()
    checker = spell_checker.AspellSpellChecker(lang)
    bad_word_ctr = Counter()
    # Count every word that aspell flags across all cleaned text files.
    for fn in os.listdir('text/clean'):
        if fn.endswith('.txt'):
            for bad_word in checker.check_document('text/clean/{}'.format(fn)):
                bad_word_ctr[bad_word] += 1
    # Write capitalized candidates (likely proper nouns) first, then the rest,
    # each group ordered by frequency.
    with codecs.open('working/maybe_ok.txt', mode='wb', encoding='utf-8') as f:
        for bad_word, count in bad_word_ctr.most_common():
            if starts_with_cap.match(bad_word):
                f.write(u'{} ({})\n'.format(spell_checker._decode(bad_word), count))
        for bad_word, count in bad_word_ctr.most_common():
            if not starts_with_cap.match(bad_word):
                f.write(u'{} ({})\n'.format(spell_checker._decode(bad_word), count))
def make_word_fix_doc(self, dir_):
    checker = self.spell_checker
    bad_words = set()
    bad_bad_map = {}
    word_set = set()
    # Collect the unique words from every text file in the directory.
    for fn in os.listdir(dir_):
        if fn.endswith('.txt'):
            with codecs.open('{}/{}'.format(dir_, fn), mode='rb', encoding='utf-8') as f:
                for l in f:
                    word_set.update(l.split())
    print '{} words'.format(len(word_set))
    words = list(word_set)
    # failed_words() returns a per-word marker, positionally aligned with words.
    for idx, bad_word_key in enumerate(checker.failed_words(words)):
        if bad_word_key:
            bad_word = words[idx]
            bad_words.add(spell_checker._decode(bad_word))
            bad_bad_map[words[idx]] = bad_word
    print '{} bad words'.format(len(bad_words))
    fixes = self.fixed_words(bad_words)
    print '{} fixes'.format(len(fixes))
    still_bad = Counter()
    solos = []
    multis = []
    for bad_version, bad_word in sorted(bad_bad_map.items(), key=lambda x: x[0]):
        try:
            good_versions = fixes[bad_word]
            # Drop hyphenated suggestions if unhyphenated ones are present
            # and the original word had no hyphen.
            if '-' not in bad_word:
                good_unhyphened = [word for word in good_versions if '-' not in word]
                if good_unhyphened:
                    good_versions = good_unhyphened
            fixed_good_versions = []
            for version in good_versions:
                fixed_good_versions.append(bad_version.replace(bad_word, version))
            # Words with several candidate fixes go first so they are reviewed first.
            if len(fixed_good_versions) > 1:
                multis.append((bad_version, fixed_good_versions))
            elif fixed_good_versions:
                solos.append((bad_version, fixed_good_versions))
        except KeyError:
            still_bad[bad_word] += 1
    with codecs.open('{}/word_fixes.txt'.format(self.output_dir),
                     mode='wb', encoding='utf-8') as f:
        for bad_version, fixed_good_versions in multis:
            f.write(u'{}|{}\n'.format(bad_version, self.delimiter.join(fixed_good_versions)))
        for bad_version, fixed_good_versions in solos:
            f.write(u'{}|{}\n'.format(bad_version, self.delimiter.join(fixed_good_versions)))
    with codecs.open('{}/bad_words.txt'.format(self.output_dir),
                     mode='wb', encoding='utf-8') as f:
        for bad_word, cnt in still_bad.most_common():
            f.write(u'{:>20}: {:>3}\n'.format(bad_word, cnt))
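# The two routines above rely on a spell_checker helper module that is not
# shown in this section. The sketch below is only an assumed, minimal version
# of that interface, inferred from the call sites (check_document,
# failed_words, _decode); the real module may well differ. It shells out to
# the aspell command-line tool's "list" mode, which echoes back the words it
# does not recognize.
import subprocess


def _decode(word):
    # Assumed helper: normalize a word to unicode before writing it out.
    if isinstance(word, bytes):
        return word.decode('utf-8')
    return word


class AspellSpellChecker(object):

    def __init__(self, lang):
        self.lang = lang

    def check_document(self, path):
        # Run `aspell list` over one file; aspell prints each misspelled
        # word on its own line.
        with open(path, 'rb') as f:
            out = subprocess.check_output(
                ['aspell', 'list', '--lang={}'.format(self.lang),
                 '--encoding=utf-8'],
                stdin=f)
        for word in out.decode('utf-8').split():
            yield word

    def failed_words(self, words):
        # Feed the word list to aspell on stdin and return one marker per
        # input word, positionally aligned with `words` (None means the
        # word passed the check).
        proc = subprocess.Popen(
            ['aspell', 'list', '--lang={}'.format(self.lang),
             '--encoding=utf-8'],
            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        out, _ = proc.communicate(u'\n'.join(words).encode('utf-8'))
        bad = set(out.decode('utf-8').split())
        return [word if word in bad else None for word in words]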