def load_relationship_correction(csv_data, edit_distance=5, prefix_length=7):
    """Build a SymSpell corrector from a newline-delimited relationship-term file.

    Args:
        csv_data: Path to a UTF-8 file with one relationship term per line.
        edit_distance: Maximum dictionary edit distance for SymSpell.
        prefix_length: Prefix length for the SymSpell index.

    Returns:
        A SymSpell instance loaded with a unigram frequency dictionary.
    """
    with open(csv_data, 'rb') as f:
        data = f.readlines()
    # BUGFIX: the original replace('\r\n', '') only removed CRLF endings, so
    # lines from an LF-only file kept a trailing '\n' inside the term.
    # rstrip('\r\n') handles both conventions and is identical for CRLF input.
    # Spaces become underscores so multi-word terms are single SymSpell tokens.
    data = [
        line.decode("utf-8").rstrip('\r\n').lower().replace(' ', '_')
        for line in data
    ]
    # Normalise Vietnamese composed/decomposed Unicode forms
    # (decompound_unicode is a project helper defined elsewhere).
    data = [decompound_unicode(term) for term in data]
    # NOTE(review): export_freq_dic is called with [data] here but with a
    # token list directly in load_cpn_corection — confirm the expected shape.
    uni = export_freq_dic([data])
    sym_spell = SymSpell(max_dictionary_edit_distance=edit_distance,
                         prefix_length=prefix_length)
    sym_spell.load_dictionary_from_list(uni, term_index=0, count_index=1)
    return sym_spell
def load_date_correction(csv_data=None, edit_distance=2, prefix_length=4):
    """Build a SymSpell corrector for date strings from an in-memory sequence.

    Args:
        csv_data: Iterable of date values (converted with ``str``); ``None``
            is treated as empty. (The original raised ``TypeError`` when the
            default was used, since it iterated over ``None``.)
        edit_distance: Maximum dictionary edit distance for SymSpell.
        prefix_length: Prefix length for the SymSpell index.

    Returns:
        A SymSpell instance loaded with a unigram frequency dictionary.
    """
    # BUGFIX: guard against the documented default — iterating None crashed.
    terms = [str(item) for item in (csv_data if csv_data is not None else [])]
    # ' - ' is kept in the dictionary so date-range separators survive
    # correction (preserved from the original implementation).
    terms.append(' - ')
    # Counter(terms) is equivalent to the original per-item c.update({i}) loop.
    counts = collections.Counter(terms)
    unigram = [[term, str(count)] for term, count in counts.items()]
    sym_spell = SymSpell(max_dictionary_edit_distance=edit_distance,
                         prefix_length=prefix_length)
    sym_spell.load_dictionary_from_list(unigram, term_index=0, count_index=1)
    return sym_spell
def load_cpn_corection(companies_list, debug=False, edit_distance=5, prefix_length=7):
    """Build a SymSpell corrector (unigram + bigram) from a company-name file.

    NOTE(review): the name keeps the original "corection" typo because
    renaming would break existing callers.

    Args:
        companies_list: Path to a UTF-8 file with one company name per line.
        debug: When True, print the generated unigram and bigram tables.
        edit_distance: Maximum dictionary edit distance (default matches the
            previously hard-coded value of 5).
        prefix_length: SymSpell index prefix length (previously hard-coded 7).

    Returns:
        A SymSpell instance loaded with unigram and bigram dictionaries.
    """
    with open(companies_list, 'r', encoding='utf-8') as f:
        text = f.read()
    # split('\n') (not splitlines) is kept deliberately: a trailing newline
    # yields a final empty token list, matching the original behaviour.
    lines = text.lower().split('\n')
    tokenized = [line.split() for line in lines]
    bi = export_freq_bigram(tokenized)
    uni = export_freq_dic(tokenized)
    if debug:
        print(uni)
        print(bi)
    # IMPROVEMENT: edit_distance/prefix_length were hard-coded here while the
    # sibling loaders expose them as parameters; defaults preserve behaviour.
    sym_spell = SymSpell(max_dictionary_edit_distance=edit_distance,
                         prefix_length=prefix_length)
    sym_spell.load_dictionary_from_list(uni, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary_from_list(bi, term_index=0, count_index=2)
    return sym_spell
def load_country_correction(csv_data, edit_distance=5, prefix_length=7,
                            boost_terms=None):
    """Build a SymSpell corrector from a newline-delimited country-name file.

    Args:
        csv_data: Path to a UTF-8 file with one country name per line.
        edit_distance: Maximum dictionary edit distance for SymSpell.
        prefix_length: Prefix length for the SymSpell index.
        boost_terms: Optional mapping of normalised term -> forced frequency.
            Defaults to {'việt_nam': 1000}, preserving the original
            hard-coded bias toward the domestic country name. A boost is
            applied only when the term actually occurs in the file, exactly
            as the original loop did.

    Returns:
        A SymSpell instance loaded with a unigram frequency dictionary.
    """
    if boost_terms is None:
        boost_terms = {'việt_nam': 1000}
    with open(csv_data, 'rb') as f:
        raw_lines = f.readlines()
    # BUGFIX: the original replace('\r\n', '') only stripped CRLF endings, so
    # LF-only files left a trailing '\n' inside each term; rstrip('\r\n')
    # handles both and is identical for CRLF input.
    terms = [
        line.decode("utf-8").rstrip('\r\n').lower().replace(' ', '_')
        for line in raw_lines
    ]
    counts = collections.Counter(terms)
    for term, weight in boost_terms.items():
        if term in counts:
            counts[term] = weight
    unigram = [[term, str(count)] for term, count in counts.items()]
    sym_spell = SymSpell(max_dictionary_edit_distance=edit_distance,
                         prefix_length=prefix_length)
    sym_spell.load_dictionary_from_list(unigram, term_index=0, count_index=1)
    return sym_spell