def hangle_test():
    """Smoke-test the helpers imported from ``soynlp.hangle``.

    Each helper is called with a fixed input and its result is compared with
    the expected value. The first mismatch raises ``ValueError`` whose message
    shows the call and the value it actually produced. A confirmation line is
    printed when every check passes.

    Raises:
        ValueError: if any helper returns an unexpected value.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    def check(call_text, actual, passed):
        # Report the call together with the value it really returned, so the
        # message can never drift away from the function under test.
        if not passed:
            raise ValueError('{} -> {}'.format(call_text, actual))

    normalized_ = normalize('123이건테스트ab테스트')
    if normalized_ != '이건테스트 테스트':
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    actual = decompose('간')
    check('decompose("간")', actual, actual == ('ㄱ', 'ㅏ', 'ㄴ'))

    actual = decompose('ㅗ')
    check('decompose("ㅗ")', actual, actual == (' ', 'ㅗ', ' '))

    actual = decompose('ㅋ')
    check('decompose("ㅋ")', actual, actual == ('ㅋ', ' ', ' '))

    actual = compose('ㄱ', 'ㅏ', 'ㅁ')
    check("compose('ㄱ', 'ㅏ', 'ㅁ')", actual, actual == '감')

    actual = character_is_korean('감')
    check('character_is_korean("감")', actual, bool(actual))

    actual = character_is_korean('a')
    check('character_is_korean("a")', actual, not actual)

    actual = character_is_jaum('ㅋ')
    check('character_is_jaum("ㅋ")', actual, bool(actual))

    actual = character_is_jaum('a')
    check('character_is_jaum("a")', actual, not actual)

    # The vowel checks report character_is_moum (they used to print the
    # character_is_jaum call and result by mistake).
    actual = character_is_moum('ㅗ')
    check('character_is_moum("ㅗ")', actual, bool(actual))

    actual = character_is_moum('a')
    check('character_is_moum("a")', actual, not actual)

    actual = to_base('ㄱ')
    check('to_base("ㄱ")', actual, actual == 12593)

    actual = levenshtein('가나', '가남')
    check("levenshtein('가나', '가남')", actual, actual == 1)

    # Fractional distances are compared with a tolerance instead of exact
    # float equality.
    actual = levenshtein('가나', '가남', {('나', '남'): 0.1})
    check("levenshtein('가나', '가남', {('나', '남'):0.1})",
          actual, math.isclose(actual, 0.1))

    actual = jamo_levenshtein('가나', '가남')
    check("jamo_levenshtein('가나', '가남')",
          actual, math.isclose(actual, 1 / 3))

    print('all hangle tests have been successed\n\n')
def search_book(self, search_text, n=10):
    """Return the ``n`` rows of ``self.book_table`` closest to ``search_text``.

    Works on a copy of ``self.book_table``. Three columns are added to the
    copy: ``JamoEditDis`` and ``levEditDis`` hold the result of
    ``soyh.jamo_levenshtein`` / ``soyh.levenshtein`` between each
    ``BookTitle`` and ``search_text``; ``EditDis`` is their row-wise mean.
    The copy is sorted by ``EditDis`` ascending and its first ``n`` rows are
    returned.
    """
    ranked = self.book_table.copy()
    titles = ranked["BookTitle"].tolist()

    ranked["JamoEditDis"] = [
        soyh.jamo_levenshtein(title, search_text) for title in titles
    ]
    ranked["levEditDis"] = [
        soyh.levenshtein(title, search_text) for title in titles
    ]

    distance_columns = ["JamoEditDis", "levEditDis"]
    ranked["EditDis"] = np.mean(ranked[distance_columns], axis=1)

    return ranked.sort_values("EditDis").head(n)
def getTextResemblance(text1, text2):
    """Return a resemblance score for two texts based on ``jamo_levenshtein``.

    The score is ``1 - distance / max(len(text1), len(text2))``, where
    ``distance`` is ``jamo_levenshtein(text1, text2, cost)`` and ``cost`` is
    the table of reduced costs for the jamo pairs listed below.

    Two empty texts are treated as identical and score ``1.0``; without this
    guard the normalisation would divide by zero.
    """
    # NOTE(review): confirm that the third positional parameter of the
    # jamo_levenshtein in scope really is a substitution-cost mapping.
    cost = {
        ('ㅡ', 'ㅜ'): 0.4,
        ('ㅈ', 'ㅊ'): 0.4,
        ('ㅗ', 'ㅜ'): 0.4,
        ('ㅁ', 'ㅇ'): 0.3,
        ('ㄹ', 'ㅌ'): 0.3,
        ('ㅗ', 'ㅛ'): 0.3,
        ('ㅏ', 'ㅣ'): 0.3,
    }

    # Normalise by the longer of the two inputs.
    divider = max(len(text1), len(text2))
    if divider == 0:
        return 1.0

    return 1 - jamo_levenshtein(text1, text2, cost) / divider
def get_jamo_levenshtein_words(voca_ls, q):
    """Find the entry of ``voca_ls`` with the smallest ``jamo_levenshtein`` to ``q``.

    Args:
        voca_ls: iterable of candidate words.
        q: the query string each candidate is compared with.

    Returns:
        A ``(min_voca, min_score)`` tuple. Both are ``None`` when ``voca_ls``
        is empty or when scoring raised (the error is logged, not re-raised).
        When several candidates share the minimum score, the first one in
        ``voca_ls`` order wins.
    """
    min_voca = None
    min_score = None

    # Start the clock before the try block so the elapsed time reported below
    # is always defined, even if scoring fails.
    s = timeit.default_timer()
    try:
        # Score every candidate exactly once.
        scored = [(jamo_levenshtein(word, q), word) for word in voca_ls]
        if scored:
            # Compare on the score only so ties keep the earliest candidate.
            min_score, min_voca = min(scored, key=lambda pair: pair[0])
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to the
        # (None, None) result.
        log.error({'error': str(e)})

    ttime = timeit.default_timer() - s
    log.debug('min_voca:%s, min_score:%s', min_voca, min_score)
    log.debug('get_jamo_levenshtein_words ttime:%s', ttime)
    return min_voca, min_score
def checkOOV(self, words):
    """Map each word that is not in ``self.vocab`` to a close vocabulary entry.

    Words already in ``self.vocab`` are kept. For any other word, every
    vocabulary entry is scored with ``jamo_levenshtein``; the entry with the
    smallest distance not exceeding 0.7 replaces the word (on equal
    distances the entry visited last wins). If no entry qualifies the word
    is kept unchanged.

    Returns:
        A new list with one output word per input word, in the same order.
    """
    resolved = []
    for word in words:
        if word in self.vocab:
            resolved.append(word)
            continue

        best_distance = 0.7
        best_match = ""
        for candidate in self.vocab:
            candidate_distance = jamo_levenshtein(candidate, word)
            if candidate_distance <= best_distance:
                best_distance, best_match = candidate_distance, candidate

        found_similar = best_match != "" and best_distance <= 0.7
        if found_similar:
            # A similar vocabulary word exists: use it.
            resolved.append(best_match)
        else:
            # No similar vocabulary word: keep the original.
            resolved.append(word)
    return resolved
def hangle_test():
    """Smoke-test the helpers imported from ``soynlp.hangle``.

    Each helper is called with a fixed input and its result is compared with
    the expected value. The first mismatch raises ``ValueError`` whose message
    shows the call and the value it actually produced. A confirmation line is
    printed when every check passes.

    Raises:
        ValueError: if any helper returns an unexpected value.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    def check(call_text, actual, passed):
        # Report the call together with the value it really returned, so the
        # message can never drift away from the function under test.
        if not passed:
            raise ValueError('{} -> {}'.format(call_text, actual))

    normalized_ = normalize('123이건테스트ab테스트')
    if normalized_ != '이건테스트 테스트':
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    actual = decompose('간')
    check('decompose("간")', actual, actual == ('ㄱ', 'ㅏ', 'ㄴ'))

    actual = decompose('ㅗ')
    check('decompose("ㅗ")', actual, actual == (' ', 'ㅗ', ' '))

    actual = decompose('ㅋ')
    check('decompose("ㅋ")', actual, actual == ('ㅋ', ' ', ' '))

    actual = compose('ㄱ', 'ㅏ', 'ㅁ')
    check("compose('ㄱ', 'ㅏ', 'ㅁ')", actual, actual == '감')

    actual = character_is_korean('감')
    check('character_is_korean("감")', actual, bool(actual))

    actual = character_is_korean('a')
    check('character_is_korean("a")', actual, not actual)

    actual = character_is_jaum('ㅋ')
    check('character_is_jaum("ㅋ")', actual, bool(actual))

    actual = character_is_jaum('a')
    check('character_is_jaum("a")', actual, not actual)

    # The vowel checks report character_is_moum (they used to print the
    # character_is_jaum call and result by mistake).
    actual = character_is_moum('ㅗ')
    check('character_is_moum("ㅗ")', actual, bool(actual))

    actual = character_is_moum('a')
    check('character_is_moum("a")', actual, not actual)

    actual = to_base('ㄱ')
    check('to_base("ㄱ")', actual, actual == 12593)

    actual = levenshtein('가나', '가남')
    check("levenshtein('가나', '가남')", actual, actual == 1)

    # Fractional distances are compared with a tolerance instead of exact
    # float equality.
    actual = levenshtein('가나', '가남', {('나', '남'): 0.1})
    check("levenshtein('가나', '가남', {('나', '남'):0.1})",
          actual, math.isclose(actual, 0.1))

    actual = jamo_levenshtein('가나', '가남')
    check("jamo_levenshtein('가나', '가남')",
          actual, math.isclose(actual, 1 / 3))

    print('all hangle tests have been successed\n')
# Notebook-export fragment. test_word, typo_idx, word_count, levenshtein and
# jamo_levenshtein are expected to be defined by earlier cells.

# Drop the single character at index typo_idx from test_word.
pre_word = test_word[:typo_idx]
post_word = test_word[typo_idx+1:]
exception_word = pre_word + post_word


# In[135]:

# Keep every key of word_count[0] whose levenshtein() result against
# exception_word is exactly 1.
ed_1 = []
for product in word_count[0].keys():
    if levenshtein(exception_word, product) ==1:
        ed_1.append(product)
#         correction_word = product

# For each kept candidate, store its jamo_levenshtein() result against
# exception_word, rounded to two decimals.
jamo_ed_1 = {}
for ed_ in ed_1:
    jamo_ed = round(jamo_levenshtein(exception_word, ed_),2)
    jamo_ed_1[ed_] = jamo_ed


# In[136]:

# Bare expression: shows the value when run as a notebook cell.
ed_1


# In[137]:

# Bare expression: shows the value when run as a notebook cell.
jamo_ed_1
def takeOrder(products, pos_list):
    """Extract ordered products and their options from a POS-tagged message.

    Args:
        products: object whose ``find({})`` yields mappings with a ``"name"``
            key (presumably a database collection -- confirm with the caller).
        pos_list: iterable of ``(word, pos)`` pairs for the message.

    Returns:
        ``None`` when no product name is close enough to any part of the
        message. Otherwise a dict mapping each matched product name to its
        list of option strings, sorted in descending order. Products with no
        collected option are left out.

    NOTE(review): the source this block was recovered from had lost its
    indentation. The nesting of the second ``idx += 1`` and of the final
    ``ret[menu] = ...`` assignment follows the most plausible reading;
    please confirm both against the original file.
    """
    products_list = [elem["name"] for elem in products.find({})]

    # Drop the token '한' when its tag contains 'XSA+'; every other token is
    # kept and the message is rebuilt from the kept words without spaces.
    clean_pos = [
        (word, pos) for word, pos in pos_list
        if not (word == '한' and 'XSA+' in pos)
    ]
    message = ''.join(word for word, _ in clean_pos)
    msg_length = len(message)

    # Slide windows of 4..7 characters over the message and compare each one
    # with every product name of the same length.
    tmp_res = list()
    for WORD_LENGTH in range(4, 8):
        # "+ 1" so the window that ends at the last character is examined as
        # well; without it a product name at the very end of the message (or
        # a message consisting only of the product name) could never match.
        for i in range(msg_length - WORD_LENGTH + 1):
            tmp_word = message[i:i + WORD_LENGTH]
            for elem in products_list:
                if len(elem) != WORD_LENGTH:
                    continue
                tmp_cost = jamo_levenshtein(tmp_word, elem)
                if tmp_cost >= 1.3:
                    continue
                tmp_res.append((tmp_cost, elem, tmp_word, i))
    if not tmp_res:
        return None

    # Cheapest match first; for each product keep only its best window.
    tmp_res.sort(key=lambda x: x[0])
    fin_res = dict()
    for _, menu, word, i in tmp_res:
        if menu not in fin_res:
            fin_res[menu] = (word, i)

    # Mark which tokens belong to a matched product: checkPos[i] is
    # ['PRD', menu] for product tokens and ['', ''] otherwise.
    pos_length = len(clean_pos)
    checkPos = [['', ''] for _ in range(pos_length)]
    for menu in fin_res:
        word = fin_res[menu][0]
        for i in range(pos_length):
            tmp_word, _ = clean_pos[i]
            if tmp_word == word:
                # The token is exactly the matched window.
                checkPos[i][0] = 'PRD'
                checkPos[i][1] = menu
                break
            elif len(tmp_word) < len(word):
                # The token may be one piece of a window that spans several
                # tokens: it must occur in the window and so must one of its
                # neighbours.
                if tmp_word not in word:
                    continue
                tmp_check = False
                if 0 < i and clean_pos[i - 1][0] in word:
                    tmp_check = True
                if i < pos_length - 1 and clean_pos[i + 1][0] in word:
                    tmp_check = True
                if tmp_check:
                    checkPos[i][0] = 'PRD'
                    checkPos[i][1] = menu

    # Attach option tokens (temperature words and quantities) to the product
    # token directly before them, or else directly after them.
    fin_res_opt = dict()
    for menu in fin_res:
        fin_res_opt[menu] = list()
    idx = 0
    while idx < pos_length:
        word, pos = clean_pos[idx]
        if checkPos[idx][0] == 'PRD':
            idx += 1
            continue
        if word in ['뜨거운', '핫', '아이스', '차가운'] or pos in ['NR', 'XR', 'SN'] \
                or (pos == 'MM' and word in ['한', '두', '네']) or word == '세잔':
            if pos == 'NR' or pos == 'MM':
                word = stringToNum(word)
            elif word == '세잔':
                word = 3
            word = str(word)
            if 0 < idx and checkPos[idx - 1][0] == 'PRD':
                prd = checkPos[idx - 1][1]
                fin_res_opt[prd].append(word)
            elif idx < pos_length - 1 and checkPos[idx + 1][0] == 'PRD':
                prd = checkPos[idx + 1][1]
                fin_res_opt[prd].append(word)
                # The following token is the product just used; step over it.
                idx += 1
        idx += 1

    # Return only the products that collected at least one option.
    ret = dict()
    for menu in fin_res:
        if fin_res_opt[menu]:
            fin_res_opt[menu].sort(reverse=True)
            ret[menu] = fin_res_opt[menu]
    return ret