示例#1
0
def hangle_test():
    """Smoke-test the public helpers of ``soynlp.hangle``.

    Each helper is called with a fixed input and its result compared with
    the expected value. The checks run in order and stop at the first
    mismatch.

    Raises:
        ValueError: if any helper returns something other than the expected
            value; the message names the call and shows the actual result.
    """
    import math

    from soynlp.hangle import (
        normalize,
        compose,
        decompose,
        character_is_korean,
        character_is_jaum,
        character_is_moum,
        to_base,
        levenshtein,
        jamo_levenshtein,
    )

    normalized_ = normalize('123이건테스트ab테스트')
    if normalized_ != '이건테스트 테스트':
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose() is expected to return a 3-tuple; missing parts are ' '.
    if ('ㄱ', 'ㅏ', 'ㄴ') != decompose('간'):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if (' ', 'ㅗ', ' ') != decompose('ㅗ'):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if ('ㅋ', ' ', ' ') != decompose('ㅋ'):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if '감' != compose('ㄱ', 'ㅏ', 'ㅁ'):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(character_is_jaum('a')))

    # The failure messages for the moum checks must report
    # character_is_moum itself, not character_is_jaum.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(character_is_moum('a')))

    # 12593 == ord('ㄱ')
    if to_base('ㄱ') != 12593:
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(levenshtein('가나', '가남')))

    # Fractional distances are floats: compare with a tolerance instead of
    # exact equality so harmless rounding noise does not fail the check.
    if not math.isclose(0.1, levenshtein('가나', '가남', {('나', '남'):0.1})):
        raise ValueError("levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(levenshtein('가나', '가남', {('나', '남'):0.1})))

    if not math.isclose(1/3, jamo_levenshtein('가나', '가남')):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n\n')
示例#2
0
 def search_book(self, search_text, n=10):
     """Return the ``n`` rows of the book table closest to ``search_text``.

     Works on a copy of ``self.book_table`` (presumably a pandas DataFrame
     with a ``"BookTitle"`` column -- confirm against the class). Three
     columns are added to the copy: ``"JamoEditDis"`` and ``"levEditDis"``
     hold the ``soyh.jamo_levenshtein`` / ``soyh.levenshtein`` distance of
     each title to ``search_text``, and ``"EditDis"`` is their row-wise
     mean. Rows are returned in ascending ``"EditDis"`` order.
     """
     ranked = self.book_table.copy()
     titles = ranked["BookTitle"].tolist()

     ranked["JamoEditDis"] = [
         soyh.jamo_levenshtein(title, search_text) for title in titles
     ]
     ranked["levEditDis"] = [
         soyh.levenshtein(title, search_text) for title in titles
     ]

     distance_columns = ranked[["JamoEditDis", "levEditDis"]]
     ranked["EditDis"] = np.mean(distance_columns, axis=1)

     return ranked.sort_values("EditDis").head(n)
示例#3
0
def getTextResemblance(text1, text2):
    """Return a resemblance score for two strings.

    The score is ``1 - jamo_levenshtein(text1, text2, cost) / longest``,
    where ``longest`` is the length of the longer input and ``cost`` is the
    fixed jamo-pair table below (presumably per-pair substitution costs --
    confirm against the ``jamo_levenshtein`` in scope).

    Two empty strings are treated as identical and score ``1.0``; without
    that guard the division below would raise ``ZeroDivisionError``.
    """
    longest = max(len(text1), len(text2))
    if longest == 0:
        return 1.0

    cost = {
        ('ㅡ', 'ㅜ'): 0.4, ('ㅈ', 'ㅊ'): 0.4, ('ㅗ', 'ㅜ'): 0.4,
        ('ㅁ', 'ㅇ'): 0.3, ('ㄹ', 'ㅌ'): 0.3, ('ㅗ', 'ㅛ'): 0.3, ('ㅏ', 'ㅣ'): 0.3,
    }
    return 1 - jamo_levenshtein(text1, text2, cost) / longest
示例#4
0
def get_jamo_levenshtein_words(voca_ls, q):
    """Find the vocabulary word with the smallest jamo edit distance to ``q``.

    Args:
        voca_ls: iterable of candidate words. It is traversed exactly once,
            so a generator works as well as a list.
        q: the query string every candidate is compared against.

    Returns:
        ``(min_voca, min_score)``: the closest word and its distance. When
        several words tie, the first one in ``voca_ls`` wins. Both values
        are ``None`` when ``voca_ls`` is empty or when scoring raised (the
        error is logged, not propagated).
    """
    min_voca = None
    min_score = None
    # Start the timer before the try block so the elapsed-time log below can
    # never hit an unbound name.
    s = timeit.default_timer()
    try:
        # Score each word once; min() with a key returns the first minimal
        # pair, matching an index(min(...)) lookup.
        scored = [(jamo_levenshtein(word, q), word) for word in voca_ls]
        if scored:
            min_score, min_voca = min(scored, key=lambda pair: pair[0])
    except Exception as e:
        # Best effort: report the failure and fall through with (None, None).
        log.error({'error': str(e)})

    ttime = timeit.default_timer() - s
    log.debug('min_voca:%s, min_score:%s' % (min_voca, min_score))
    log.debug('get_jamo_levenshtein_words ttime:%s' % ttime)
    return min_voca, min_score
示例#5
0
 def checkOOV(self, words, threshold=0.7):
     """Replace out-of-vocabulary words with their closest vocabulary entry.

     Args:
         words: iterable of words to check against ``self.vocab``.
         threshold: largest ``jamo_levenshtein`` distance at which a
             vocabulary entry may replace an unknown word. Defaults to 0.7.

     Returns:
         A new list with one entry per input word: the word itself when it
         is in ``self.vocab`` or when no vocabulary entry is within
         ``threshold``, otherwise the closest vocabulary entry.
     """
     new_words = []
     for w in words:
         if w in self.vocab:
             new_words.append(w)
             continue

         best_distance = threshold
         best_match = ""
         for v in self.vocab:
             distance = jamo_levenshtein(v, w)
             # ``<=`` means that among equally distant candidates the one
             # visited last wins.
             if distance <= best_distance:
                 best_distance = distance
                 best_match = v

         if best_match != "":
             # A similar word exists.
             new_words.append(best_match)
         else:
             # No similar word exists: keep the original word. An
             # empty-string vocabulary entry also lands here and is never
             # substituted.
             new_words.append(w)
     return new_words
示例#6
0
def hangle_test():
    """Run a fixed set of sanity checks against ``soynlp.hangle``.

    Every helper imported below is called with a known input and its
    result is compared with the expected value, in order.

    Raises:
        ValueError: at the first check whose result differs from the
            expected value; the message names the call and shows the
            actual result.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    # decompose() is expected to return a 3-tuple, with ' ' in place of
    # any part the character does not have.
    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(
            compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(
            character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(
            character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(
            character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(
            character_is_jaum('a')))

    # Report the function that was actually checked (character_is_moum),
    # so a failure points at the right helper.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(
            character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(
            character_is_moum('a')))

    # 12593 == ord('ㄱ')
    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(
            levenshtein('가나', '가남')))

    # Float results are compared with a tolerance rather than exact
    # equality, so rounding noise cannot produce a spurious failure.
    if not math.isclose(0.1, levenshtein('가나', '가남', {('나', '남'): 0.1})):
        raise ValueError(
            "levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(
                levenshtein('가나', '가남', {('나', '남'): 0.1})))

    if not math.isclose(1 / 3, jamo_levenshtein('가나', '가남')):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(
            jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n')
示例#7
0
# Build a copy of test_word with the single character at typo_idx removed.
pre_word = test_word[:typo_idx]
post_word = test_word[typo_idx+1:]
exception_word = pre_word + post_word


# In[135]:


# Collect every key of word_count[0] whose levenshtein distance to
# exception_word is exactly 1.
ed_1 = []
for product in word_count[0].keys():
    if levenshtein(exception_word, product) ==1:
        ed_1.append(product)
#         correction_word = product
# Map each of those candidates to its jamo_levenshtein distance from
# exception_word, rounded to two decimals.
jamo_ed_1 = {}
for ed_ in ed_1:
    jamo_ed = round(jamo_levenshtein(exception_word, ed_),2)
    jamo_ed_1[ed_] = jamo_ed


# In[136]:


# Bare expression: shows the candidate list (the In[...] markers suggest this
# is an exported notebook cell -- as a plain script the line has no effect).
ed_1


# In[137]:


# Bare expression: shows the candidate -> jamo distance mapping (same note).
jamo_ed_1

示例#8
0
def takeOrder(products, pos_list):
    """Extract ordered products and their options from a POS-tagged message.

    Steps:
      1. Concatenate the words of ``pos_list`` into one message, skipping
         any '한' whose tag contains 'XSA+'.
      2. Slide windows of 4 to 7 characters over the message and compare
         each window with every product name of the same length using
         ``jamo_levenshtein``; keep pairs whose cost is below 1.3.
      3. For each product keep its lowest-cost window, then mark the tokens
         that make up that window as belonging to the product.
      4. Attach option tokens (temperature words, numerals, counts) to the
         product token directly before them, or else directly after them.

    Args:
        products: object whose ``find({})`` yields mappings with a
            ``"name"`` key (looks like a MongoDB collection -- confirm
            against the caller).
        pos_list: iterable of ``(word, pos_tag)`` pairs.

    Returns:
        ``None`` when no product matched. Otherwise a dict mapping each
        matched product name to its list of option strings, sorted in
        descending order (the list may be empty).
    """
    product_names = [row["name"] for row in products.find({})]

    # Drop '한' tagged with XSA+, then rebuild the message without spaces.
    clean_pos = [
        (word, pos) for word, pos in pos_list
        if not (word == '한' and 'XSA+' in pos)
    ]
    message = ''.join(word for word, _ in clean_pos)
    msg_length = len(message)

    # Fuzzy-match every window against the product names of equal length.
    candidates = []
    for window_length in range(4, 8):
        # "+ 1" so the window ending at the last character of the message
        # is scored too; without it a product at the very end is missed.
        window_count = msg_length - window_length + 1
        if window_count <= 0:
            continue
        same_length_names = [
            name for name in product_names if len(name) == window_length
        ]
        for start in range(window_count):
            window = message[start:start + window_length]
            for name in same_length_names:
                cost = jamo_levenshtein(window, name)
                if cost >= 1.3:
                    continue
                candidates.append((cost, name, window, start))

    if not candidates:
        return None

    # Stable sort by cost: the first entry seen per product is its best one.
    candidates.sort(key=lambda candidate: candidate[0])
    fin_res = dict()
    for _, menu, window, start in candidates:
        if menu not in fin_res:
            fin_res[menu] = (window, start)

    # checkPos[i] == ['PRD', menu] marks token i as part of that product.
    pos_length = len(clean_pos)
    checkPos = [['', ''] for _ in range(pos_length)]

    for menu, (matched, _) in fin_res.items():
        for i in range(pos_length):
            token = clean_pos[i][0]

            if token == matched:
                # The whole matched window is one token: mark it and stop.
                checkPos[i][0] = 'PRD'
                checkPos[i][1] = menu
                break

            if len(token) < len(matched):
                if token not in matched:
                    continue
                # A fragment counts only when a neighbouring token is also
                # part of the matched window.
                neighbour_in_match = (
                    (0 < i and clean_pos[i - 1][0] in matched)
                    or (i < pos_length - 1 and clean_pos[i + 1][0] in matched)
                )
                if neighbour_in_match:
                    checkPos[i][0] = 'PRD'
                    checkPos[i][1] = menu

    fin_res_opt = {menu: [] for menu in fin_res}

    # '뜨거운'/'핫' = hot, '아이스'/'차가운' = iced/cold.
    temperature_words = ('뜨거운', '핫', '아이스', '차가운')
    # Presumably numeral / root / number tags of the tagger in use -- confirm.
    option_tags = ('NR', 'XR', 'SN')
    # '한'/'두'/'네' = one/two/four when used as determiners.
    count_determiners = ('한', '두', '네')

    idx = 0
    while idx < pos_length:
        word, pos = clean_pos[idx]
        if checkPos[idx][0] == 'PRD':
            idx += 1
            continue

        is_option = (
            word in temperature_words
            or pos in option_tags
            or (pos == 'MM' and word in count_determiners)
            or word == '세잔'
        )
        if is_option:
            if pos == 'NR' or pos == 'MM':
                word = stringToNum(word)
            elif word == '세잔':
                # '세잔' = "three cups"
                word = 3
            word = str(word)

            if 0 < idx and checkPos[idx - 1][0] == 'PRD':
                # The option follows a product token.
                fin_res_opt[checkPos[idx - 1][1]].append(word)
            elif idx < pos_length - 1 and checkPos[idx + 1][0] == 'PRD':
                # The option precedes a product token; skip that token too.
                fin_res_opt[checkPos[idx + 1][1]].append(word)
                idx += 1
        idx += 1

    ret = dict()
    for menu in fin_res:
        fin_res_opt[menu].sort(reverse=True)
        ret[menu] = fin_res_opt[menu]
    return ret