def hangle_test():
    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    # normalize strips non-Korean characters, replacing runs with a space
    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose splits a syllable into a (chosung, jungsung, jongsung) triple;
    # a lone jamo fills the unused slots with spaces
    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))
    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))
    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    # compose rebuilds a syllable from its jamo
    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(compose('ㄱ', 'ㅏ', 'ㅁ')))

    # character-class predicates
    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(character_is_korean('감')))
    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(character_is_korean('a')))
    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(character_is_jaum('ㅋ')))
    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(character_is_jaum('a')))
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(character_is_moum('ㅗ')))
    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(character_is_moum('a')))

    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    # character-level edit distance; a dict of custom substitution costs is supported
    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(levenshtein('가나', '가남')))
    if 0.1 != levenshtein('가나', '가남', {('나', '남'): 0.1}):
        raise ValueError("levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(
            levenshtein('가나', '가남', {('나', '남'): 0.1})))

    # jamo-level edit distance: 가나/가남 differ in one of three jamo
    if 1 / 3 != jamo_levenshtein('가나', '가남'):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(
            jamo_levenshtein('가나', '가남')))

    print('all hangle tests passed\n\n')
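# A minimal illustrative sketch (not soynlp's implementation) of the
# per-syllable substitution cost behind jamo_levenshtein: a syllable pair
# costs the fraction of its three jamo slots that differ. It only assumes
# decompose returns a 3-tuple of jamo, as tested above.
from soynlp.hangle import decompose

def syllable_substitution_cost(c1, c2):
    d1, d2 = decompose(c1), decompose(c2)
    return sum(a != b for a, b in zip(d1, d2)) / 3

# '나' and '남' share chosung and jungsung, so only 1 of 3 jamo differs;
# this is why jamo_levenshtein('가나', '가남') == 1/3 in the test above.
print(syllable_substitution_cost('나', '남'))  # 0.333...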
def get_correction(test_word):
    # word_count_ed and word_count are assumed to be counters built in earlier cells.
    # Candidate set: the word itself plus all of its edit-distance-1 variants.
    words_ed1 = get_ed1_words(test_word)
    test_words = words_ed1 + [test_word]

    # candidates with zero corpus count are assumed to be misspelled
    wrong_words = []
    for test_ in test_words:
        if word_count_ed[0][test_] == 0:
            wrong_words.append(test_)
    print(wrong_words)  # debug

    if not wrong_words:
        return test_word

    # The typo character is the one shared by every wrong candidate: a variant
    # that replaced the typo would have a nonzero count and so not appear here.
    longest_word = sorted(wrong_words, key=len, reverse=True)[0]
    typo = None
    for char in longest_word:
        have_char = [char in x for x in wrong_words]
        if sum(have_char) == len(wrong_words):  # char occurs in every wrong word
            typo = char
            break
    if typo is None:
        return test_word

    # drop the typo character and look for a known product one edit away
    typo_idx = test_word.index(typo)
    exception_word = test_word[:typo_idx] + test_word[typo_idx + 1:]
    for product in word_count[0].keys():
        if levenshtein(exception_word, product) == 1:
            return product
    return test_word
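# get_ed1_words is called above but not defined in this snippet. A minimal
# Norvig-style sketch of one plausible implementation; the alphabet parameter
# and its default (syllables observed in the word_count[0] lexicon above) are
# assumptions, since enumerating all 11,172 Hangul syllables would be wasteful.
def get_ed1_words(word, alphabet=None):
    if alphabet is None:
        alphabet = {c for w in word_count[0] for c in w}  # hypothetical default
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [l + r[1:] for l, r in splits if r]
    replaces = [l + c + r[1:] for l, r in splits if r for c in alphabet]
    inserts = [l + c + r for l, r in splits for c in alphabet]
    return sorted(set(deletes + replaces + inserts) - {word})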
import numpy as np
import soynlp.hangle as soyh  # soyh is assumed to alias soynlp.hangle

def search_book(self, search_text, n=10):
    # rank titles by the mean of jamo-level and character-level edit distances
    search_table = self.book_table.copy()
    search_table["JamoEditDis"] = [soyh.jamo_levenshtein(t, search_text)
                                   for t in search_table["BookTitle"].tolist()]
    search_table["levEditDis"] = [soyh.levenshtein(t, search_text)
                                  for t in search_table["BookTitle"].tolist()]
    search_table["EditDis"] = np.mean(search_table[["JamoEditDis", "levEditDis"]], axis=1)
    search_table = search_table.sort_values("EditDis")
    return search_table.head(n)
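# A minimal usage sketch for search_book. BookFinder is hypothetical; the
# method only assumes self.book_table is a pandas DataFrame with a
# "BookTitle" column.
import pandas as pd

class BookFinder:
    def __init__(self, titles):
        self.book_table = pd.DataFrame({"BookTitle": titles})

    search_book = search_book  # reuse the function above as a method

finder = BookFinder(['파이썬 머신러닝', '파이선 머신러닝', '한국어 임베딩'])
print(finder.search_book('파이썬 머신러닝', n=2))  # the exact title ranks first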
from soynlp.hangle import levenshtein

def get_editdistance1(combination, bigram_lexicon):
    # exact match short-circuits; otherwise collect nearby lexicon words
    # (note: despite the name, the threshold below admits distance <= 2)
    ed1_word = []
    for lexicon_word in bigram_lexicon:
        ed = levenshtein(combination, lexicon_word)
        if ed == 0:
            return [combination]
        elif ed <= 2:
            ed1_word.append(lexicon_word)
    return ed1_word
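# Usage sketch with a toy lexicon (hypothetical data): '아몬드' is one edit
# away from '아몬디' and is collected; the others fall outside the threshold.
print(get_editdistance1('아몬디', ['아몬드', '아몬드 우유', '호두']))  # ['아몬드']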
import time
from collections import defaultdict, Counter

from soynlp.hangle import levenshtein

# lexicon, example, and total_ratio are assumed to be defined in earlier cells
start_time = time.time()
ed_list = []
eps = 10e-4  # floor so a rejected token still contributes a nonzero score
dynamic_dict = defaultdict(Counter)  # per-position memo of similarity scores

for word in lexicon['preprocess']:
    min_ed = []
    word_split = word.split('_')
    for idx, each in enumerate(example):
        each_ed = []
        for word_ in word_split:
            if dynamic_dict[idx][word_] != 0:
                # reuse the memoized score for this (position, lexicon token) pair
                er = dynamic_dict[idx][word_]
                each_ed.append(er)
            else:
                # length-normalized similarity: 1 - ED / max(len); clip low scores to eps
                ed = levenshtein(each.lower(), word_.lower())
                er = 1 - (ed / (max(len(word_), len(each))))
                if er <= 0.4:
                    er = 0 + eps
                # print(f'raw ED between "{each.lower()}" and "{word_.lower()}": {ed}, '
                #       f'1 - levenshtein: {er}')
                each_ed.append(er)
                dynamic_dict[idx][word_] = er
            ### added 2021-01-28 ###
            if er == 1.0:  # perfect match; no better token can follow
                break
        # keep the best token match for this position, weighted by its ratio
        # min_ed.append(ed_ratio[idx] * max(each_ed))
        min_ed.append(total_ratio[idx] * max(each_ed))
        # min_ed.append(min(each_ed))
    ed_list.append(sum(min_ed) / len(example))
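# The inner score above is a length-normalized similarity, 1 - ED / max(len),
# floored at eps when it falls at or below 0.4. A standalone sketch of just
# that scoring rule, with hypothetical defaults matching the loop:
from soynlp.hangle import levenshtein

def token_similarity(a, b, cutoff=0.4, eps=10e-4):
    er = 1 - levenshtein(a.lower(), b.lower()) / max(len(a), len(b))
    return eps if er <= cutoff else er

print(token_similarity('아몬드', '아몬디'))  # 1 - 1/3 ≈ 0.667
print(token_similarity('아몬드', '호두'))    # at or below the cutoff -> 0.001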
def get_ed_1(test_word, word_count_split):
    ed_1 = []
    for lexicon_word in word_count_split:
        if levenshtein(test_word, lexicon_word) == 1:
            ed_1.append(lexicon_word)
    return ed_1
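# Usage sketch (toy data): unlike get_editdistance1 above, this keeps only
# words at exactly edit distance 1.
print(get_ed_1('아몬디', ['아몬드', '아몬스', '호두']))  # ['아몬드', '아몬스']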
# In[7]:

word_count[0]

# In[112]:

# test: edit-distance search for a misspelled product name
test_word = '아몬디'
ed_list = {}
for lexicon_word in word_count[0].keys():
    if levenshtein(test_word, lexicon_word) == 1:
        print(lexicon_word)
# ed = jamo_levenshtein(test_word, lexicon_word)
# ed_list[lexicon_word] = round(ed, 3)
# ed_df = pd.DataFrame({'food': list(ed_list.keys()), 'ed': list(ed_list.values())},
#                      index=range(len(ed_list))).sort_values(by='ed')
# min_ed = ed_df['ed'].values[0]
# if min_ed > 0.667:  # when only one syllable differs, 0.667 is the cutoff
#     print('better to keep rather than correct')
# else:
#     corr = ed_df['food'].values[0]

# In[113]:
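# Where the commented-out 0.667 cutoff comes from: under jamo_levenshtein,
# substituting a syllable costs the fraction of its three jamo that differ,
# so 2/3 ≈ 0.667 marks "two of three jamo changed" within a single syllable.
# A quick check of that scale (illustrative values):
from soynlp.hangle import jamo_levenshtein

print(jamo_levenshtein('아몬드', '아몬디'))  # 1/3: one jamo differs (ㅡ -> ㅣ)
print(jamo_levenshtein('아몬드', '아몬단'))  # 2/3: two jamo differ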
# print(MIN, ' ', MIN / (len(ebs_stnc) + len(google_stnc)))
# print()
# print('ebs : ', ebs_stnc)
# print()
# print('google : ', MIN_stnc)
# print()
# print('*******' * 10)

# # Check similar sentences

# In[18]:

for e, g in zip(ws_removed_EBS, most_similar_GOOGLE):
    # length-normalized edit distance: raw distance over the mean sentence length
    nor_num = levenshtein(e, g) / ((len(e) + len(g)) / 2)
    print('*********' * 10)
    if nor_num < 0.1:
        print(levenshtein(e, g), ' ', nor_num, ' GOOD')
    elif nor_num > 0.5:
        print(levenshtein(e, g), ' ', nor_num, ' BAAD')
    else:
        print(levenshtein(e, g), ' ', nor_num)
    print()
    print('index : ', ws_removed_EBS.index(e), ' ', ws_removed_GOOGLE.index(g))
    print()
    print('ebs : ', e)
    print()
    print('google : ', g)
    print()
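# A standalone sketch of the score used above: raw edit distance divided by
# the mean sentence length, so 0.0 means identical and values near 1.0 share
# little. The 0.1 / 0.5 cutoffs above label pairs GOOD / BAAD.
from soynlp.hangle import levenshtein

def normalized_distance(a, b):
    return levenshtein(a, b) / ((len(a) + len(b)) / 2)

print(normalized_distance('오늘 날씨가 좋다', '오늘 날씨가 좋다'))   # 0.0
print(normalized_distance('오늘 날씨가 좋다', '내일 날씨는 흐리다'))  # ≈ 0.53, BAAD range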