Python compose 예제들, soynlp.hangle.compose Python 예제들

예제 #1

0

파일 보기

파일: basic_test.py 프로젝트: tobby2002/soynlp

def hangle_test():
    """Smoke-test the helpers exported by ``soynlp.hangle``.

    Every check raises ``ValueError`` carrying the unexpected result; a
    confirmation line is printed once all checks have passed.

    Raises:
        ValueError: If any helper returns something other than expected.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(character_is_jaum('a')))

    # The two vowel checks report the function that was actually tested;
    # they used to name (and re-call) character_is_jaum in the message.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(character_is_moum('a')))

    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(levenshtein('가나', '가남')))

    # Fractional distances are compared with a tolerance, not bit-for-bit,
    # so a harmless rounding difference does not fail the test.
    if not math.isclose(0.1, levenshtein('가나', '가남', {('나', '남'):0.1})):
        raise ValueError("levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(levenshtein('가나', '가남', {('나', '남'):0.1})))

    if not math.isclose(1/3, jamo_levenshtein('가나', '가남')):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n\n')

예제 #2

0

파일 보기

 def process(t):
     """Rebuild text from a flat string of jamo triples.

     ``t`` is consumed three characters at a time. Every ``'-'`` is first
     replaced by a space, each triple is handed to ``compose`` as its three
     arguments, and the composed pieces are concatenated.

     The length of ``t`` is asserted to be a multiple of three.
     """
     assert len(t) % 3 == 0
     spaced = t.replace('-', ' ')
     syllables = []
     for k in range(len(spaced) // 3):
         triple = tuple(spaced[3 * k:3 * k + 3])
         syllables.append(compose(*triple))
     return ''.join(syllables)

예제 #3

0

파일 보기

    def recover_sentence(self, input):
        """Rebuild a sentence string from ``input`` via ``RE_huni.decode``.

        ``input`` is decoded with ``RE_huni.decode``; each decoded item is
        classified by ``RE_huni.determi`` and either appended as-is or
        re-composed with ``compose``. The accumulated string is returned.

        Progress (``chr_idx`` and the partial result) is printed for every
        decoded item.
        """
        chr_idx = 0 ; orig = ''
        decoded = RE_huni.decode(self, input)
        # NOTE(review): the inner ``for`` always walks the whole ``decoded``
        # sequence; if ``chr_idx`` is still below ``len(input)`` afterwards the
        # outer ``while`` walks it again and appends everything a second time.
        # Confirm that ``decoded`` always advances ``chr_idx`` to the end.
        while chr_idx < len(input):
            for jamo_set in decoded: # type of jamo_set is tuple
                input_type, Kor_type = RE_huni.determi(jamo_set)

                if Kor_type == False:
                    # Non-Korean item: appended unchanged, cursor moves by its length.
                    # NOTE(review): ``orig`` is a str, so this requires
                    # ``jamo_set`` to be a str here despite the comment above.
                    orig += jamo_set
                    chr_idx += len(jamo_set)

                else:
                    # Korean item: counts as one character of ``input``.
                    chr_idx +=1
                    if input_type == '초중종' or input_type ==  '초중x' :
                        orig += compose(jamo_set[0], jamo_set[1], jamo_set[2])
                    elif input_type == '초xx':
                        orig += jamo_set[0]
                    elif input_type == 'x중x':
                        orig += jamo_set[1]
                    # Any other ``input_type`` adds nothing to ``orig``.

                # Debug trace of the cursor and the partial result.
                print(chr_idx, orig)
                # Carry over a space found at the cursor position; the last
                # index of ``input`` is never inspected (``len(input) - 1``).
                if chr_idx < len(input) - 1 and input[chr_idx] == ' ':
                # if input[chr_idx+1] == ' ':
                    orig += ' ' ; chr_idx += 1

        return orig

예제 #4

0

파일 보기

def jamo_to_word(jamo):
    """Compose a word from a flat jamo string.

    The string is scanned left to right. A position for which
    ``character_is_korean`` is false contributes that single character; any
    other position starts a three-character chunk. Each chunk is then passed
    to ``compose``, with a ``"-"`` in the third slot replaced by a space.
    """
    # Pass 1: cut the input into single characters and three-character chunks.
    chunks = []
    cursor = 0
    while cursor < len(jamo):
        if character_is_korean(jamo[cursor]):
            chunks.append(jamo[cursor:cursor + 3])
            cursor += 3
        else:
            chunks.append(jamo[cursor])
            cursor += 1

    # Pass 2: compose every chunk and glue the pieces together.
    word = ""
    for chunk in chunks:
        if len(chunk) == 1:
            word += chunk
            continue
        final = " " if chunk[2] == "-" else chunk[2]
        word += compose(chunk[0], chunk[1], final)
    return word

예제 #5

0

파일 보기

 def recover_letter(self, input):
     """Turn one jamo tuple back into a single character.

     ``input`` must be a ``tuple`` (asserted). Its type label comes from
     ``NLP.determi``: the labels '초중종' and '초중x' are composed with
     ``compose``, '초xx' returns the first element and 'x중x' the second.

     Any other label leaves the result unassigned, so the final ``return``
     raises ``UnboundLocalError``.
     """
     assert type(input) == tuple
     kind, _is_korean = NLP.determi(input)
     if kind == '초xx':
         orig = input[0]
     elif kind == 'x중x':
         orig = input[1]
     elif kind in ('초중종', '초중x'):
         orig = compose(input[0], input[1], input[2])
     return orig

예제 #6

0

파일 보기

def jamo_to_word(jamo):
    """Compose a word from a flat jamo string.

    The input is walked sequentially by index. A position for which
    ``is_jamo_korean`` is false contributes that single character; any other
    position starts a three-character chunk. Each chunk is then passed to
    ``compose``, with a ``"-"`` in the third slot replaced by a space.
    """
    # Pass 1: cut the input into single characters and three-character chunks.
    chunks = []
    cursor = 0
    while cursor < len(jamo):
        if is_jamo_korean(jamo[cursor]):
            chunks.append(jamo[cursor:cursor + 3])
            cursor += 3
        else:
            chunks.append(jamo[cursor])
            cursor += 1

    # Pass 2: compose every chunk and glue the pieces together.
    word = ""
    for chunk in chunks:
        if len(chunk) == 1:
            word += chunk
            continue
        final = " " if chunk[2] == "-" else chunk[2]
        word += compose(chunk[0], chunk[1], final)
    return word

예제 #7

0

파일 보기

 def process(t):
     """Rebuild text from a flat string of jamo triples.

     ``t`` is consumed three characters at a time after every ``'-'`` has
     been replaced by a space. Each triple is handed to ``compose``; when
     ``compose`` raises ``ValueError`` the triple's own characters are kept
     instead, joined and stripped of surrounding whitespace.

     The length of ``t`` is asserted to be a multiple of three.
     """
     assert len(t) % 3 == 0
     spaced = t.replace('-', ' ')
     pieces = []
     for k in range(len(spaced) // 3):
         triple = tuple(spaced[3 * k:3 * k + 3])
         try:
             piece = compose(*triple)
         except ValueError:
             # Fall back to the raw characters of the triple.
             piece = ''.join(triple).replace('-', '').strip()
         pieces.append(piece)
     return ''.join(pieces)

예제 #8

0

파일 보기

파일: _normalizer.py 프로젝트: parksjin01/soynlp-2.7

def _normalize_emoji(token):
    if len(token) <= 1:
        return token
    token_ = []
    decomposeds = [decompose(c) for c in token]
    for char, cd, nd in zip(token, decomposeds, decomposeds[1:]):
        if cd == None or nd == None:
            token_.append(char)
            continue
        # 앜ㅋㅋㅋㅋ -> 아ㅋㅋㅋㅋㅋ
        if (nd[1] == ' ') and (cd[2] == nd[0]):
            token_.append(compose(cd[0], cd[1], ' ') + nd[0])
        # ㅋ쿠ㅜㅜ -> ㅋㅋㅜㅜㅜ
        elif (cd[2] == ' ') and (nd[0] == ' ') and (cd[1] == nd[1]):
            token_.append((cd[0] + cd[1]) if cd [0] != ' ' else cd[1])
        else:
            token_.append(char)
    return ''.join(token_) + token[-1]

예제 #9

0

파일 보기

파일: _normalizer.py 프로젝트: ysseo91/soynlp

def emoticon_normalize(sent, num_repeats=2):
    """Split syllables fused with neighbouring emoticon jamo, then collapse repeats.

    A complete syllable that sits next to a lone consonant or vowel it shares
    a jamo with (the ``ㅋ쿠ㅜ`` pattern) is broken into its jamo. The result is
    passed to ``repeat_normalize`` together with ``num_repeats``.

    A syllable that matches a neighbour pattern but does not actually share
    the jamo is now kept unchanged; previously two of the three branches
    dropped it from the output.

    Args:
        sent: Text to normalize. A falsy value is returned unchanged.
        num_repeats: Forwarded to ``repeat_normalize``.

    Returns:
        The value returned by ``repeat_normalize``.
    """
    if not sent:
        return sent

    # Pattern matching ㅋ쿠ㅜ
    def pattern(idx):
        # Jaum: 0, Moum: 1, Complete: 2, else -1
        if 12593 <= idx <= 12622:
            return 0
        elif 12623 <= idx <= 12643:
            return 1
        elif 44032 <= idx <= 55203:
            return 2
        else:
            return -1

    idxs = [pattern(ord(c)) for c in sent]
    sent_ = []
    last_idx = len(idxs) - 1
    for i, (idx, c) in enumerate(zip(idxs, sent)):
        if (0 < i < last_idx) and (idxs[i - 1] == 0 and idx == 2
                                   and idxs[i + 1] == 1):
            # consonant + syllable + vowel
            cho, jung, jong = decompose(c)
            if cho == sent[i - 1] and jung == sent[i + 1] and jong == ' ':
                sent_.append(cho)
                sent_.append(jung)
            else:
                sent_.append(c)
        elif (i < last_idx) and (idx == 2) and (idxs[i + 1] == 0):
            # syllable + consonant: move a matching final consonant out
            cho, jung, jong = decompose(c)
            if jong == sent[i + 1]:
                sent_.append(compose(cho, jung, ' '))
                sent_.append(jong)
            else:
                # No shared jamo: keep the syllable instead of dropping it.
                sent_.append(c)
        elif (i > 0) and (idx == 2 and idxs[i - 1] == 0):
            # consonant + syllable: split when the initial consonant repeats
            cho, jung, jong = decompose(c)
            if cho == sent[i - 1]:
                sent_.append(cho)
                sent_.append(jung)
            else:
                # No shared jamo: keep the syllable instead of dropping it.
                sent_.append(c)
        else:
            sent_.append(c)
    return repeat_normalize(''.join(sent_), num_repeats)

예제 #10

0

파일 보기

파일: _lemmatizer.py 프로젝트: zeroday0619/soynlp

def lemma_candidate_chat(l, r, predefined=None, debug=False):
    """Lemma candidates for chat text, tolerating an emoticon-like final consonant.

    Starts from ``lemma_candidate(l, r, predefined, debug)``. When ``r`` is
    empty and the last syllable of ``l`` ends in one of ㄷ, ㅂ, ㅅ, ㅇ, ㅋ, ㅎ,
    that final consonant is removed and the candidates for the shortened
    ``l`` are merged in.
    """
    emoticon_finals = set('ㄷㅂㅅㅇㅋㅎ')

    candidates = lemma_candidate(l, r, predefined, debug)
    last_jamo = decompose(l[-1])

    # The word ends in ㄷ, ㅂ, ㅅ, ㅇ, ㅋ or ㅎ, e.g.
    # (아닏, 아닙, 아닛, 아닝, 아닠, 아닣)
    # (그랟, 그랩, 그랫, 그랭, 그랰, 그랳)
    if r or last_jamo[2] not in emoticon_finals:
        return candidates

    l_ = l[:-1] + compose(last_jamo[0], last_jamo[1], ' ')
    if debug:
        debug_message('마지막 종성이 이모티콘으로 의심되는 경우', l_, '()')
    candidates.update(lemma_candidate(l_, r, predefined, debug))
    return candidates

예제 #11

0

파일 보기

def conjugate_chat(stem, ending, enforce_moum_harmoney=False, debug=False):
    """Conjugate ``stem`` + ``ending`` with an extra chat-style rule.

    Starts from ``conjugate(stem, ending, enforce_moum_harmoney, debug)``.
    When the first character of ``ending`` is a bare consonant, it is also
    attached as the final consonant of the stem's last syllable.

    Args:
        stem: Verb or adjective stem.
        ending: Ending to attach. If falsy, ``{stem}`` is returned.
        enforce_moum_harmoney: Forwarded to ``conjugate``.
        debug: Forwarded to ``conjugate``; also prints the extra surface.

    Returns:
        A set of candidate surface forms.
    """
    if not ending:
        return {stem}

    candidates = conjugate(stem, ending, enforce_moum_harmoney, debug)

    l_last = list(decompose(stem[-1]))
    r_first = list(decompose(ending[0]))

    # The ending starts with a bare consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ)
    # 이 + ㅂ니다 -> 입니다
    if r_first[1] == ' ' and r_first[0] != ' ':
        l = stem[:-1] + compose(l_last[0], l_last[1], r_first[0])
        surface = l + ending[1:]
        candidates.add(surface)
        # The former nested ``if r_first[1] != ' '`` could never be true
        # inside this branch and has been removed.
        if debug:
            print('어미의 첫 글자가 자음인 경우: {}'.format(surface))

    return candidates

예제 #12

0

파일 보기

파일: basic_test.py 프로젝트: ysseo91/soynlp

def hangle_test():
    """Smoke-test the helpers exported by ``soynlp.hangle``.

    Every check raises ``ValueError`` carrying the unexpected result; a
    confirmation line is printed once all checks have passed.

    Raises:
        ValueError: If any helper returns something other than expected.
    """
    import math

    from soynlp.hangle import normalize
    from soynlp.hangle import compose
    from soynlp.hangle import decompose
    from soynlp.hangle import character_is_korean
    from soynlp.hangle import character_is_jaum
    from soynlp.hangle import character_is_moum
    from soynlp.hangle import to_base
    from soynlp.hangle import levenshtein
    from soynlp.hangle import jamo_levenshtein

    normalized_ = normalize('123이건테스트ab테스트')
    if not (normalized_ == '이건테스트 테스트'):
        raise ValueError('{} should be 이건테스트 테스트'.format(normalized_))

    if not (('ㄱ', 'ㅏ', 'ㄴ') == decompose('간')):
        raise ValueError('decompose("간") -> {}'.format(decompose('간')))

    if not ((' ', 'ㅗ', ' ') == decompose('ㅗ')):
        raise ValueError('decompose("ㅗ") -> {}'.format(decompose('ㅗ')))

    if not (('ㅋ', ' ', ' ') == decompose('ㅋ')):
        raise ValueError('decompose("ㅋ") -> {}'.format(decompose('ㅋ')))

    if not ('감' == compose('ㄱ', 'ㅏ', 'ㅁ')):
        raise ValueError("compose('ㄱ', 'ㅏ', 'ㅁ') -> {}".format(
            compose('ㄱ', 'ㅏ', 'ㅁ')))

    if not character_is_korean('감'):
        raise ValueError('character_is_korean("감") -> {}'.format(
            character_is_korean('감')))

    if character_is_korean('a'):
        raise ValueError('character_is_korean("a") -> {}'.format(
            character_is_korean('a')))

    if not character_is_jaum('ㅋ'):
        raise ValueError('character_is_jaum("ㅋ") -> {}'.format(
            character_is_jaum('ㅋ')))

    if character_is_jaum('a'):
        raise ValueError('character_is_jaum("a") -> {}'.format(
            character_is_jaum('a')))

    # The two vowel checks report the function that was actually tested;
    # they used to name (and re-call) character_is_jaum in the message.
    if not character_is_moum('ㅗ'):
        raise ValueError('character_is_moum("ㅗ") -> {}'.format(
            character_is_moum('ㅗ')))

    if character_is_moum('a'):
        raise ValueError('character_is_moum("a") -> {}'.format(
            character_is_moum('a')))

    if not (to_base('ㄱ') == 12593):
        raise ValueError('to_base("ㄱ") -> {}'.format(to_base('ㄱ')))

    if 1 != levenshtein('가나', '가남'):
        raise ValueError("levenshtein('가나', '가남') -> {}".format(
            levenshtein('가나', '가남')))

    # Fractional distances are compared with a tolerance, not bit-for-bit,
    # so a harmless rounding difference does not fail the test.
    if not math.isclose(0.1, levenshtein('가나', '가남', {('나', '남'): 0.1})):
        raise ValueError(
            "levenshtein('가나', '가남', {('나', '남'):0.1}) -> {}".format(
                levenshtein('가나', '가남', {('나', '남'): 0.1})))

    if not math.isclose(1 / 3, jamo_levenshtein('가나', '가남')):
        raise ValueError("jamo_levenshtein('가나', '가남') -> {}".format(
            jamo_levenshtein('가나', '가남')))

    print('all hangle tests have been successed\n')

예제 #13

0

파일 보기

def conjugate(stem, ending, enforce_moum_harmoney=False, debug=False):
    """Generate candidate surface forms for ``stem`` + ``ending``.

    The last syllable of ``stem`` and the first syllable of ``ending`` are
    decomposed into jamo, then a sequence of independent conjugation rules is
    tried. Every rule that matches adds one or more surface strings to the
    result set, so several rules may contribute for the same input. If no
    rule matched and the ending starts with a vowel-bearing syllable, the
    plain concatenation (or a merged syllable) is added as a fallback.

    Args:
        stem: Stem string; its last character is decomposed.
        ending: Ending string; must be non-empty (asserted).
        enforce_moum_harmoney: When true, the vowel of the ending's first
            syllable is rewritten through ``neg_to_pos`` / ``pos_to_neg``
            according to the stem's last vowel before the rules run.
        debug: When true, each matching rule prints its surface form.

    Returns:
        A set of candidate surface strings (possibly empty).
    """

    assert ending  # ending must be inserted

    l_len = len(stem)
    l_last = list(decompose(stem[-1]))
    l_last_ = stem[-1]
    r_first = list(decompose(ending[0]))

    # check moum is positive or negative
    # ㅂ-irregular conjugation does not always follow vowel harmony
    if enforce_moum_harmoney:
        if ((l_last[2] != 'ㅂ' and l_last[1] in positive_moum)
                and (r_first[0] == 'ㅇ' and r_first[1] in negative_moum)):
            r_first[1] = neg_to_pos[r_first[1]]
            ending = compose(*r_first) + ending[1:]
        if ((l_last[2] != 'ㅂ' and l_last[1] in negative_moum)
                and (r_first[0] == 'ㅇ' and r_first[1] in positive_moum)):
            r_first[1] = pos_to_neg[r_first[1]]
            ending = compose(*r_first) + ending[1:]
        if (l_last[1] in neuter_moum) and (r_first[1] in positive_moum):
            r_first[1] = pos_to_neg[r_first[1]]
            ending = compose(*r_first) + ending[1:]

    # -는 vs -ㄴ / -ㄴ, -ㄹ, -ㅂ, -ㅆ
    #if ((l_last[2] == ' ') and
    #    ((r_first[0] == 'ㅇ' or r_first[0] == r_first[2]) and (r_first[1] == 'ㅣ' or r_first[1] == 'ㅡ'))):
    #    r_first = [r_first[2], ' ', ' ']
    #    ending = r_first[2] + ending[1:]

    # First syllable of the ending without its final consonant; when that
    # syllable has no vowel the original first character is used instead.
    r_first_ = compose(r_first[0], r_first[1],
                       ' ') if r_first[1] != ' ' else ending[0]

    candidates = set()

    if debug:
        print('l_last = {}'.format(l_last))
        print('r_first = {}'.format(r_first))

    # Endings that start with '다' are attached as-is.
    if ending[0] == '다':
        surface = stem + ending
        candidates.add(surface)
        if debug:
            print('\'다\'로 시작하는 어미: {}'.format(surface))

    # ㄷ-irregular conjugation: 깨달 + 아 -> 깨달아
    if l_last[2] == 'ㄷ' and r_first[0] == 'ㅇ':
        l = stem[:-1] + compose(l_last[0], l_last[1], 'ㄹ')
        surface = l + ending
        candidates.add(surface)
        candidates.add(stem + ending)  # 받 + 았다 -> 받았다
        if debug:
            print('ㄷ 불규칙: {}'.format(surface))

    # 르-irregular conjugation: 구르 + 어 -> 굴러
    if ((l_last_ == '르' and stem[-2:] != '푸르')
            and (r_first_ == '아' or r_first_ == '어') and l_len >= 2):
        c0, c1, c2 = decompose(stem[-2])
        l = stem[:-2] + compose(c0, c1, 'ㄹ')
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('르 불규칙: {}'.format(surface))

    # ㅂ-irregular conjugation:
    # (vowel harmony) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (vowel harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if (l_last[2] == 'ㅂ'):
        l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        if (r_first_ == '어' or r_first_ == '아'):
            if l_len >= 2 and (l_last_ == '답' or l_last_ == '곱'
                               or l_last_ == '깝' or l_last_ == '롭'):
                c1 = 'ㅝ'
            elif r_first[1] == 'ㅗ':
                c1 = 'ㅘ'
            elif r_first[1] == 'ㅜ':
                c1 = 'ㅝ'
            elif r_first_ == '어':
                c1 = 'ㅝ'
            else:  # r_first_ == '아'
                c1 = 'ㅘ'
            r = compose('ㅇ', c1, r_first[2]) + ending[1:]
            surface = l + r
            candidates.add(surface)
            if debug:
                print('ㅂ 불규칙: {}'.format(surface))
        elif r_first[0] == 'ㅇ':  # 돕 + 울까 = 도울까, 답 + 울까 = 다울까
            surface = l + ending
            candidates.add(surface)
            if debug:
                print('ㅂ 불규칙: {}'.format(surface))

    # The ending starts with a bare final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ)
    # 이 + ㅂ니다 -> 입니다
    if r_first[1] == ' ' and (r_first[0] == 'ㄴ' or r_first[0] == 'ㄹ'
                              or r_first[0] == 'ㅁ' or r_first[0] == 'ㅂ'
                              or r_first[0] == 'ㅆ'):
        l = stem[:-1] + compose(l_last[0], l_last[1], r_first[0])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        # NOTE(review): this condition contradicts the enclosing
        # ``r_first[1] == ' '`` test, so the branch below never runs.
        if r_first[1] != ' ':
            candidates.add(stem + ending)
        if debug:
            print('어미의 첫 글자가 -ㄴ, -ㄹ, -ㅁ-, -ㅂ, -ㅆ 인 경우: {}'.format(surface))

    # ㅅ-irregular conjugation: 붓 + 어 -> 부어
    # exception : 벗 + 어 -> 벗어
    if (l_last[2] == 'ㅅ') and (r_first[0] == 'ㅇ'):
        if stem[-1] == '벗':
            l = stem
        else:
            l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        surface = l + ending
        candidates.add(surface)
        if debug:
            print('ㅅ 불규칙: {}'.format(surface))

    # 우-irregular conjugation: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_last[1] == 'ㅜ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[
            1] == 'ㅓ':
        if l_last_ == '푸':
            l = stem[:-1] + '퍼'
        else:
            l = stem[:-1] + compose(l_last[0], 'ㅝ', r_first[2])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('우 불규칙: {}'.format(surface))

    # 오 conjugation: 오 + 았어 -> 왔어
    if l_last[1] == 'ㅗ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[
            1] == 'ㅏ':
        l = stem[:-1] + compose(l_last[0], 'ㅘ', r_first[2])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('오 활용: {}'.format(surface))

    # ㅡ-drop irregular conjugation: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if ((l_last[1] == 'ㅡ') and (l_last[2] == ' ') and (r_first[0] == 'ㅇ')):
        if l_last[0] == 'ㅇ' and len(stem) > 1:
            surface = stem[:-1] + ending
        elif l_last[0] != 'ㄹ':
            surface = stem[:-1] + compose(l_last[0], r_first[1],
                                          r_first[2]) + ending[1:]
        else:
            # Stems ending in 르 are left to the 르/러 rules.
            surface = None
        if surface is not None:
            candidates.add(surface)
        if debug and surface is not None:
            print('ㅡ 탈락 불규칙: {}'.format(surface))

    # 거라/너라 irregular conjugation
    # Regular if '-거라/-너라' are treated as endings; a rule that is no
    # longer recognized nowadays
    if ending[:2] == '어라' or ending[:2] == '아라':
        # 돌아오 + 아라 -> 돌아와라
        if stem[-1] == '오':
            l = stem[:-1]
            r = '와' + ending[1:]
        # 그리우 + 어라 -> 그리워라
        elif stem[-1] == '우':
            l = stem[:-1]
            r = '워' + ending[1:]
        # 가 + 아라 -> 가라
        elif stem[-1] == '가':
            l = stem
            r = ending[1:]
        else:
            if l_last[1] in negative_moum:
                l = stem
                r = '어' + ending[1:]
            else:
                l = stem
                r = '아' + ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('거라/너라 불규칙: {}'.format(surface))

    # 러-irregular conjugation: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다
    if ((l_last_ == '르' and stem[-2:] != '구르')
            and (r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ')):
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        surface = stem + r
        candidates.add(surface)
        if debug:
            print('러 불규칙: {}'.format(surface))

    # 여-irregular conjugation
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    if l_last_ == '하' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ'
                                                 or r_first[1] == 'ㅓ'):
        # case 1
        r = compose(r_first[0], 'ㅕ', r_first[2]) + ending[1:]
        surface0 = stem + r
        candidates.add(surface0)
        # case 2
        l = stem[:-1] + compose('ㅎ', 'ㅐ', r_first[2])
        r = ending[1:]
        surface1 = l + r
        candidates.add(surface1)
        if debug:
            print('여 불규칙: {}, {}'.format(surface0, surface1))

    # ㅎ (drop) irregular conjugation
    # 파라 + 면 -> 파랗다
    if l_last[2] == 'ㅎ' and r_first[1] != ' ':
        if l_last_ == '좋' or l_last_ == '놓':
            l = stem
        else:
            l = stem[:-1] + compose(l_last[0], l_last[1], ' ')
        r = ending
        surface = l + r
        candidates.add(surface)
        if debug:
            print('ㅎ 탈락 불규칙: {}'.format(surface))

    # ㅎ (contraction) irregular conjugation
    # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
    # NOTE(review): ``and`` binds tighter than ``or``, so the second operand
    # reads as (ㅇ and ㅏ) or ㅓ; any ending whose first vowel is ㅓ matches
    # regardless of its initial consonant. Confirm whether
    # ㅇ and (ㅏ or ㅓ) was intended.
    if ((l_last[2] == 'ㅎ' and l_last_ != '좋') and
        (r_first[0] == 'ㅇ' and r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ')):
        l = stem[:-1] + compose(l_last[0], 'ㅐ' if r_first[1] == 'ㅏ' else 'ㅔ',
                                r_first[2])
        r = ending[1:]
        surface = l + r
        candidates.add(surface)
        if debug:
            print('ㅎ 축약 불규칙: {}'.format(surface))

    # ㅎ + 네 irregular conjugation
    # Both dropping ㅎ and keeping ㅎ are correct
    if l_last[2] == 'ㅎ' and r_first[0] == 'ㄴ' and r_first[1] != ' ':
        surface = stem + ending
        candidates.add(surface)
        if debug:
            print('ㅎ + 네 불규칙: {}'.format(surface))

    # 이 + 어 -> 여 regular contraction, 만지 + 었어 -> 만졌어, 만지 + 어서 -> 만져서
    if r_first_ == '어' and l_last[1] == 'ㅣ' and l_last[2] == ' ':
        surface = stem[:-1] + compose(l_last[0], 'ㅕ', r_first[2]) + ending[1:]
        candidates.add(surface)
        surface = stem + ending
        candidates.add(surface)
        # Only the second (uncontracted) surface is printed below.
        if debug:
            print('이 + 어 -> 여 규칙: {}'.format(surface))

    # Fallback when no rule above produced a candidate and the ending's first
    # syllable has a vowel.
    if not candidates and r_first[1] != ' ':
        # Same vowel on both sides and no final consonant on the stem:
        # merge the two syllables into one.
        if (l_last[2] == ' ') and (r_first[0] == 'ㅇ') and (r_first[1]
                                                           == l_last[1]):
            l = stem[:-1] + compose(l_last[0], l_last[1], r_first[2])
            r = ending[1:]
            surface = l + r
            candidates.add(surface)
        else:
            surface = stem + ending
            candidates.add(surface)
        if debug:
            print('L + R 규칙 결합: {}'.format(surface))

    return candidates

예제 #14

0

파일 보기

파일: _lemmatizer.py 프로젝트: zeroday0619/soynlp

def lemma_candidate(l, r, predefined=None, debug=False):
    """Propose (stem, ending) lemma candidates for the surface split ``l`` + ``r``.

    A series of reverse-conjugation rules each add candidate pairs. Every
    candidate is then re-conjugated with ``conjugate`` and kept only if the
    original word ``l + r`` is among the generated surfaces. Candidates with
    an empty ending, or whose ending's first character decomposes to a final
    consonant of ㅎ, are discarded.

    Args:
        l: Left part of the word; its last character is decomposed.
        r: Right part of the word; may be empty.
        predefined: Optional mapping from ``(l, r)`` to an iterable of
            ``(stem, ending)`` pairs that are added as candidates.
        debug: When true, each matching rule is reported via ``debug_message``.

    Returns:
        A set of ``(stem, ending)`` tuples.
    """
    def add_lemma(stem, ending):
        candidates.add((stem, ending))

    candidates = {(l, r)}
    word = l + r

    l_last = decompose(l[-1])
    l_last_ = compose(l_last[0], l_last[1], ' ')
    l_front = l[:-1]
    r_first = decompose(r[0]) if r else ('', '', '')
    r_first_ = compose(r_first[0], r_first[1], ' ') if r else ' '
    r_end = r[1:]

    # ㄷ-irregular conjugation: 깨달 + 아 -> 깨닫 + 아
    if l_last[2] == 'ㄹ' and r_first[0] == 'ㅇ':
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㄷ')
        add_lemma(l_stem, r)
        if debug:
            debug_message('ㄷ 불규칙 활용', l_stem, r)

    # 르-irregular conjugation: 굴 + 러 -> 구르 + 어
    if (l_last[2] == 'ㄹ') and (r_first_ == '러' or r_first_ == '라'):
        l_stem = l_front + compose(l_last[0], l_last[1], ' ') + '르'
        r_canon = compose('ㅇ', r_first[1], r_first[2]) + r_end
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('르 불규칙 활용', l_stem, r_canon)

    # ㅂ-irregular conjugation: 더러 + 워서 -> 더럽 + 어서
    if (l_last[2] == ' '):
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㅂ')
        if (r_first_ == '워' or r_first_ == '와'):
            r_canon = compose('ㅇ', 'ㅏ' if r_first_ == '와' else 'ㅓ',
                              r_first[2] if r_first[2] else ' ') + r_end
        elif (r_end and r_end[0] == '려'):
            r_canon = compose('ㅇ', 'ㅜ',
                              r_first[2] if r_first[2] else ' ') + r_end
        else:
            r_canon = r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('ㅂ 불규칙 활용', l_stem, r_canon)

    # The ending starts with a bare final consonant (-ㄴ, -ㄹ, -ㅁ-, -ㅂ, -ㅆ)
    # 입 + 니다 -> 이 + ㅂ니다
    if l_last[2] == 'ㄴ' or l_last[2] == 'ㄹ' or l_last[2] == 'ㅁ' or l_last[
            2] == 'ㅂ' or l_last[2] == 'ㅆ':
        # Try each alternative final consonant for the stem (none, ㄹ, ㅂ, ㅎ).
        for jongsung in ' ㄹㅂㅎ':
            if l_last[2] == jongsung:
                continue
            l_stem = l_front + compose(l_last[0], l_last[1], jongsung)
            r_canon = l_last[2] + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('어미의 첫글자가 종성일 경우 (%s)' % jongsung, l_stem,
                              r_canon)

    # ㅅ-irregular conjugation: 부 + 어 -> 붓 + 어
    # exception : 벗 + 어 -> 벗어
    if (l_last[2] == ' ' and l[-1] != '벗') and (r_first[0] == 'ㅇ'):
        l_stem = l_front + compose(l_last[0], l_last[1], 'ㅅ')
        add_lemma(l_stem, r)
        if debug:
            debug_message('ㅅ 불규칙 활용', l_stem, r)

    # 우-irregular conjugation: 똥퍼 + '' -> 똥푸 + 어
    if l_last_ == '퍼':
        l_stem = l_front + '푸'
        r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('우 불규칙 활용 (퍼)', l_stem, r_canon)

    # 우-irregular conjugation: 줬 + 어 -> 주 + 었어
    if l_last[1] == 'ㅝ':
        l_stem = l_front + compose(l_last[0], 'ㅜ', ' ')
        r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('우 불규칙 활용', l_stem, r_canon)

    # 오-irregular conjugation: 왔 + 어 -> 오 + 았어
    if l_last[1] == 'ㅘ':
        l_stem = l_front + compose(l_last[0], 'ㅗ', ' ')
        r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('오 불규칙 활용', l_stem, r_canon)

    # ㅡ-drop irregular conjugation: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
    if (l_last[1] == 'ㅓ' or l_last[1] == 'ㅏ'):
        l_stem = l_front + compose(l_last[0], 'ㅡ', ' ')
        r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('ㅡ 탈락 불규칙 활용 (꺼)', l_stem, r_canon)

    # ㅡ-drop irregular conjugation: 모 + 았다 -> 모으 + 았다
    if l_last[2] == ' ' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ'
                                                   or r_first[1] == 'ㅓ'):
        l_stem = l + '으'
        r_canon = r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('ㅡ 탈락 불규칙 활용 (모으)', l_stem, r_canon)

    # 거라/너라 irregular conjugation
    # Regular if '-거라/-너라' are treated as endings
    # if (l[-1] == '가') and (r and (r[0] == '라' or r[:2] == '거라')):
    #    # TODO

    # 러-irregular conjugation: 이르 + 러 -> 이르다
    # if (r_first[0] == 'ㄹ' and r_first[1] == 'ㅓ'):
    #     if self.is_stem(l):
    #         # TODO

    # 여-irregular conjugation
    # 하 + 였다 -> 하 + 았다 -> 하다: regular if '였다' is treated as an ending

    # 여-irregular conjugation (2)
    # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
    if l_last[0] == 'ㅎ' and l_last[1] == 'ㅐ':
        l_stem = l_front + '하'
        r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
        add_lemma(l_stem, r_canon)
        if debug:
            debug_message('여 불규칙 활용', l_stem, r_canon)

    # ㅎ (drop) irregular conjugation
    if (l_last[2] == ' ' or l_last[2] == 'ㄴ' or l_last[2] == 'ㄹ'
            or l_last[2] == 'ㅂ' or l_last[2] == 'ㅆ'):
        # 파라 + 면 -> 파랗 + 면
        if (l_last[1] == 'ㅏ' or l_last[1] == 'ㅓ'):
            l_stem = l_front + compose(l_last[0], l_last[1], 'ㅎ')
            r_canon = r if l_last[2] == ' ' else l_last[2] + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('ㅎ 탈락 불규칙 활용', l_stem, r_canon)
        # ㅎ (contraction) irregular conjugation
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if (l_last[1] == 'ㅐ') or (l_last[1] == 'ㅔ'):
            # exception : 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and l_last[0] == 'ㄹ':
                l_stem = l_front + '렇'
            else:
                l_stem = l_front + compose(
                    l_last[0], 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ', 'ㅎ')
            r_canon = compose('ㅇ', 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ',
                              l_last[2]) + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('ㅎ 축약 불규칙 활용', l_stem, r_canon)

    # 이었 -> 였 regular contraction
    # 좋아졌 + 어 -> 좋아지 + 었어, 좋아졋 + 던 -> 좋아지 + 었던, 좋아져 + 서 -> 좋아지 + 어서
    # 였 + 어 -> 이 + 었어
    # Also accept final ㅅ written in place of ㅆ (a frequent spelling error)
    if ((l_last[2] == 'ㅆ' or l_last[2] == 'ㅅ' or l_last[2] == ' ')
            and (l_last[1] == 'ㅕ')):

        # except: -었 -> 이 + 었 (x) // -였-> 이 + 었 (o) // -졌 -> 지 + 었 (o) // -젔 -> 지 + 었
        if ((l_last[0] == 'ㅇ') and
            (l_last[1] == 'ㅕ')) or not (l_last[0] == 'ㅇ'):
            l_stem = l_front + compose(l_last[0], 'ㅣ', ' ')
            r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
            add_lemma(l_stem, r_canon)
            if debug:
                debug_message('이었 -> 였 규칙 활용', l_stem, r_canon)

    ## Pre-defined set
    if predefined and (l, r) in predefined:
        for lemma in predefined[(l, r)]:
            candidates.add(lemma)
            if debug:
                # Log the predefined pair itself. The old code logged
                # l_stem/r_canon, which are leftovers from whichever rule
                # ran last and are unbound when no rule ran.
                debug_message('Predefined', lemma[0], lemma[1])

    # check whether lemma is conjugatable
    candidates_ = set()
    for stem, eomi in candidates:
        if not eomi:
            continue
        # hard rule
        if decompose(eomi[0])[2] == 'ㅎ':
            continue
        surfaces = conjugate(stem, eomi)
        if word in surfaces:
            candidates_.add((stem, eomi))
    return candidates_

예제 #15

0

파일 보기

파일: _lemmatizer.py 프로젝트: tobby2002/soynlp

    def _candidates(self, l, r):
        """Propose (root, ending) candidates for the surface split ``l`` + ``r``.

        Starts from ``{(l, r)}`` and applies a series of reverse-conjugation
        rules to the last syllable of ``l`` and the first syllable of ``r``;
        every matching rule adds one pair. Entries found under ``(l, r)`` in
        ``self._predefined`` are added as well. No filtering is done here.

        Returns:
            A set of candidate tuples.
        """
        candidates = {(l, r)}

        l_last = decompose(l[-1])
        # Last syllable of ``l`` with its final consonant removed.
        l_last_ = compose(l_last[0], l_last[1], ' ')
        # When ``r`` is empty, placeholders are used instead of decomposing.
        r_first = decompose(r[0]) if r else ('', '', '')
        r_first_ = compose(r_first[0], r_first[1], ' ') if r else ' '

        # ㄷ-irregular conjugation: 깨달 + 아 -> 깨닫 + 아
        if l_last[2] == 'ㄹ' and r_first[0] == 'ㅇ':
            l_root = l[:-1] + compose(l_last[0], l_last[1], 'ㄷ')
            candidates.add((l_root, r))

        # 르-irregular conjugation: 굴 + 러 -> 구르 + 어
        if (l_last[2] == 'ㄹ') and (r_first_ == '러' or r_first_ == '라'):
            l_root = l[:-1] + compose(l_last[0], l_last[1], ' ') + '르'
            r_canon = compose('ㅇ', r_first[1], r_first[2]) + r[1:]
            candidates.add((l_root, r_canon))

        # ㅂ-irregular conjugation: 더러 + 워서 -> 더럽 + 어서
        if (l_last[2] == ' ') and (r_first_ == '워' or r_first_ == '와'):
            l_root = l[:-1] + compose(l_last[0], l_last[1], 'ㅂ')
            r_canon = compose('ㅇ', 'ㅏ' if r_first_ == '와' else 'ㅓ', r_first[2]) + r[1:]
            candidates.add((l_root, r_canon))

        # The ending starts with a bare final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅅ)
        # 입 + 니다 -> 이 + ㅂ니다
        if l_last[2] == 'ㄴ' or l_last[2] == 'ㄹ' or l_last[2] == 'ㅂ' or l_last[2] == 'ㅆ':
            l_root = l[:-1] + compose(l_last[0], l_last[1], ' ')
            r_canon = l_last[2] + r
            candidates.add((l_root, r_canon))

        # ㅅ-irregular conjugation: 부 + 어 -> 붓 + 어
        # exception : 벗 + 어 -> 벗어
        if (l_last[2] == ' ' and l[-1] != '벗') and (r_first[0] == 'ㅇ'):
            l_root = l[:-1] + compose(l_last[0], l_last[1], 'ㅅ')
            candidates.add((l_root, r))

        # 우-irregular conjugation: 똥퍼 + '' -> 똥푸 + 어
        if l_last_ == '퍼':
            l_root = l[:-1] + '푸'
            r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
            candidates.add((l_root, r_canon))

        # 우-irregular conjugation: 줬 + 어 -> 주 + 었어
        if l_last[1] == 'ㅝ':
            l_root = l[:-1] + compose(l_last[0], 'ㅜ', ' ')
            r_canon = compose('ㅇ', 'ㅓ', l_last[2]) + r
            candidates.add((l_root, r_canon))

        # 오-irregular conjugation: 왔 + 어 -> 오 + 았어
        if l_last[1] == 'ㅘ':
            l_root = l[:-1] + compose(l_last[0], 'ㅗ', ' ')
            r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
            candidates.add((l_root, r_canon))

        # ㅡ-drop irregular conjugation: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
        if (l_last[1] == 'ㅓ' or l_last[1] == 'ㅏ'):
            l_root = l[:-1] + compose(l_last[0], 'ㅡ', ' ')
            r_canon = compose('ㅇ', l_last[1], l_last[2]) + r
            candidates.add((l_root, r_canon))

        # 거라/너라 irregular conjugation
        # Regular if '-거라/-너라' are treated as endings
        # if (l[-1] == '가') and (r and (r[0] == '라' or r[:2] == '거라')):
        #    # TODO

        # 러-irregular conjugation: 이르 + 러 -> 이르다
        # if (r_first[0] == 'ㄹ' and r_first[1] == 'ㅓ'):
        #     if self.is_root(l):
        #         # TODO

        # 여-irregular conjugation
        # 하 + 였다 -> 하 + 았다 -> 하다: regular if '였다' is treated as an ending

        # 여-irregular conjugation (2)
        # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
        if l_last[0] == 'ㅎ' and l_last[1] == 'ㅐ':
            l_root = l[:-1] + '하'
            r_canon = compose('ㅇ', 'ㅏ', l_last[2]) + r
            candidates.add((l_root, r_canon))

        # ㅎ (drop) irregular conjugation
        # 파라 + 면 -> 파랗 + 면
        if (l_last[2] == ' ' or l_last[2] == 'ㄴ' or l_last[2] == 'ㄹ' or l_last[2] == 'ㅂ' or l_last[2] == 'ㅆ'):
            l_root = l[:-1] + compose(l_last[0], l_last[1], 'ㅎ')
            r_canon = r if l_last[2] == ' ' else l_last[2] + r
            candidates.add((l_root, r_canon))

        # ㅎ (contraction) irregular conjugation
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if (l_last[1] == 'ㅐ') or (l_last[1] == 'ㅔ'):
            # exception : 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and l_last[0] == 'ㄹ':
                l_root = l[:-1] + '렇'
            else:
                l_root = l[:-1] + compose(l_last[0], 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ', 'ㅎ')
            r_canon = compose('ㅇ', 'ㅓ' if l_last[1] == 'ㅔ' else 'ㅏ', l_last[2]) + r
            candidates.add((l_root, r_canon))

        ## Pre-defined set
        # Each predefined entry is added to the set unchanged.
        if (l, r) in self._predefined:
            for root in self._predefined[(l, r)]:
                candidates.add(root)

        return candidates

예제 #16

0

파일 보기

def pos(s, remove_tag=(), c_tag=(), _opt = '-tip1+sw', _in = 'sample.in', _out = 'sample.out', thread=True):
    """Tag ``s`` with the external ``kma.exe`` analyzer and return the parsed tokens.

    The text is written to an input file (KSC5601-encoded), the analyzer is
    run as ``kma.exe <_opt> <_in> <_out>``, and every parenthesised token
    found in its output is converted with ``_parse``.  Adjacent ``<t|c><e>``
    tag pairs are then merged into a single token.

    Side effect: every ``*.in`` and ``*.out`` file in the current working
    directory is deleted before the analyzer runs.

    Args:
        s: Whitespace-separated text to analyse.
        remove_tag: Tags whose tokens are dropped from the result.
        c_tag: Currently ignored -- see the NOTE(review) at the top of the body.
        _opt: Option string passed to ``kma.exe``.
        _in: Input file name; used only when ``thread`` is false.
        _out: Output file name; used only when ``thread`` is false.
        thread: When true, random 6-character file names replace ``_in`` and
            ``_out`` so that concurrent sessions do not share a file name.

    Returns:
        A list of parsed tokens (presumably ``(word, tag)`` tuples -- the
        shape comes from ``_parse``), or ``[]`` if any step raised an
        exception.
    """
    try:
        # NOTE(review): the ``c_tag`` argument is overwritten by ``c_tags``,
        # a name that is not defined in this function (presumably a
        # module-level table -- confirm).  Kept as-is to preserve behaviour.
        c_tag = c_tags
        # -- When several sessions run at once, clashing file names cause errors.
        if thread:
            def _idGenerator():
                return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6))
            _in = _idGenerator()+'.in'
            _out = _idGenerator()+'.out'

        # Remove leftover analyzer files from earlier runs.
        for stale in gl('*.in'):
            os.remove(stale)
        for stale in gl('*.out'):
            os.remove(stale)

        # Insert a line break every 50 words (otherwise KLT emits a WARNING).
        words = s.split()
        s = '\n'.join(' '.join(words[i:i + 50]) for i in range(0, len(words), 50))
        # ``with`` guarantees the handle is closed even when write() fails.
        with codecs.open(_in, 'w+', encoding='KSC5601') as f:
            f.write(s)

        command = ["kma.exe",_opt,_in,_out]
        check_call(command, stdout=DEVNULL, stderr=STDOUT)

        os.remove(_in)   # delete the input file

        with codecs.open(_out, encoding='KSC5601') as f:
            tokend_text = f.read()

        os.remove(_out)  # delete the output file

        # Raw string: '\(' and '\w' are regex escapes, not string escapes.
        str_token = re.findall(pattern=r'\([\w ]+\)', string=tokend_text)
        poses = list(map(_parse, str_token))

        # -- Drop tokens carrying an unwanted tag.
        if len(remove_tag):
            poses = [(w,t) for w,t in poses if t not in remove_tag ]

        chunker = RegexpParser('JOSA:{<t|c><e>}')
        chunks  = chunker.parse(poses)
        # Matched chunks become a list of their leaves; plain tuples pass through.
        chunks  = [chunk if isinstance(chunk, tuple) else chunk.leaves() for chunk in chunks]
        poses = []
        for item in chunks:
            if isinstance(item, list):
                w1, t1 = item[0]
                jong, t2 = item[1]
                try:
                    # Try to attach the second token as the final consonant
                    # of the first one.
                    chojung = decompose(w1)
                    w = compose(chojung[0], chojung[1], jong)
                except Exception:
                    # Not composable: fall back to plain concatenation.
                    w = w1+jong

                if w1 == '하' and jong == '어':
                    w = '해'
                item = (w, t1+t2)
            # Apply the custom replacement table; later entries see earlier
            # replacements because the loop does not stop at the first match.
            for org,cus in c_tag:
                if org == item:
                    item = cus
            poses.append(item)

        # Remove stop words.
        stop_words = [('의','N'),('을','N'),('를','N'),('대한','N'),('인해','N'),('중','N'),('등','N')]
        poses = [ item for item in poses if item not in stop_words]
        return poses
    except Exception:
        # Best effort: any failure yields an empty result.  Unlike a bare
        # ``except:``, KeyboardInterrupt / SystemExit still propagate.
        return []

예제 #17

0

파일 보기

파일: _lemmatizer.py 프로젝트: songys/soynlp

def _lemma_candidate(l, r, predefined=None):
    """Generate (stem, ending) lemma candidates for the split ``l`` + ``r``.

    Each conjugation rule below is applied in reverse: when the last
    syllable of ``l`` and/or the first syllable of ``r`` match the surface
    pattern a rule produces, the restored (stem, ending) pair is added.
    Candidates are not validated against any dictionary here.

    Args:
        l: Left part of the word; must be non-empty because ``l[-1]`` is
            decomposed.
        r: Right part of the word; may be empty.
        predefined: Optional mapping from ``(l, r)`` to an iterable of extra
            candidates.

    Returns:
        A set of candidates; it always contains ``(l, r)`` itself.
    """
    candidates = {(l, r)}

    def add_lemma(stem, ending):
        candidates.add((stem, ending))

    l_cho, l_jung, l_jong = decompose(l[-1])
    # Last syllable of l with its final consonant removed.
    l_bare = compose(l_cho, l_jung, ' ')
    l_front = l[:-1]
    r_cho, r_jung, r_jong = decompose(r[0]) if r else ('', '', '')
    # First syllable of r with its final consonant removed.
    r_bare = compose(r_cho, r_jung, ' ') if r else ' '
    r_end = r[1:]

    # ㄷ irregular: 깨달 + 아 -> 깨닫 + 아
    if l_jong == 'ㄹ' and r_cho == 'ㅇ':
        add_lemma(l_front + compose(l_cho, l_jung, 'ㄷ'), r)

    # 르 irregular: 굴 + 러 -> 구르 + 어
    if l_jong == 'ㄹ' and r_bare in ('러', '라'):
        l_stem = l_front + compose(l_cho, l_jung, ' ') + '르'
        r_canon = compose('ㅇ', r_jung, r_jong) + r_end
        add_lemma(l_stem, r_canon)

    # ㅂ irregular: 더러 + 워서 -> 더럽 + 어서
    if l_jong == ' ' and r_bare in ('워', '와'):
        l_stem = l_front + compose(l_cho, l_jung, 'ㅂ')
        r_canon = compose('ㅇ', 'ㅏ' if r_bare == '와' else 'ㅓ', r_jong) + r_end
        add_lemma(l_stem, r_canon)

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
    # 입 + 니다 -> 이 + ㅂ니다
    if l_jong in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
        add_lemma(l_front + compose(l_cho, l_jung, ' '), l_jong + r)

    # ㅅ irregular: 부 + 어 -> 붓 + 어
    # exception : 벗 + 어 -> 벗어
    if l_jong == ' ' and l[-1] != '벗' and r_cho == 'ㅇ':
        add_lemma(l_front + compose(l_cho, l_jung, 'ㅅ'), r)

    # 우 irregular: 똥퍼 + '' -> 똥푸 + 어
    if l_bare == '퍼':
        add_lemma(l_front + '푸', compose('ㅇ', l_jung, l_jong) + r)

    # 우 irregular: 줬 + 어 -> 주 + 었어
    if l_jung == 'ㅝ':
        add_lemma(l_front + compose(l_cho, 'ㅜ', ' '),
                  compose('ㅇ', 'ㅓ', l_jong) + r)

    # 오 irregular: 왔 + 어 -> 오 + 았어
    if l_jung == 'ㅘ':
        add_lemma(l_front + compose(l_cho, 'ㅗ', ' '),
                  compose('ㅇ', 'ㅏ', l_jong) + r)

    # ㅡ-dropping irregular: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
    if l_jung in ('ㅓ', 'ㅏ'):
        add_lemma(l_front + compose(l_cho, 'ㅡ', ' '),
                  compose('ㅇ', l_jung, l_jong) + r)

    # 거라 / 너라 irregular: regular once '-거라/-너라' are treated as
    # endings, so nothing is done here.

    # 러 irregular (이르 + 러 -> 이르다): not implemented.

    # 여 irregular: 하 + 였다 -> 하 + 았다 -> 하다 is regular once '였다' is
    # treated as an ending.

    # 여 irregular (2):
    # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
    if l_cho == 'ㅎ' and l_jung == 'ㅐ':
        add_lemma(l_front + '하', compose('ㅇ', 'ㅏ', l_jong) + r)

    # ㅎ irregulars: both are only tried for these final consonants.
    if l_jong in (' ', 'ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
        # ㅎ-dropping: 파라 + 면 -> 파랗 + 면
        if l_jung in ('ㅏ', 'ㅓ'):
            l_stem = l_front + compose(l_cho, l_jung, 'ㅎ')
            r_canon = r if l_jong == ' ' else l_jong + r
            add_lemma(l_stem, r_canon)
        # ㅎ-contraction:
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if l_jung in ('ㅐ', 'ㅔ'):
            restored = 'ㅓ' if l_jung == 'ㅔ' else 'ㅏ'
            # exception : 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and l_cho == 'ㄹ':
                l_stem = l_front + '렇'
            else:
                l_stem = l_front + compose(l_cho, restored, 'ㅎ')
            add_lemma(l_stem, compose('ㅇ', restored, l_jong) + r)

    # 이었 -> 였 regular contraction:
    # 좋아졌 + 어 -> 좋아지 + 었어, 좋아졋 + 던 -> 좋아지 + 었던
    # A final ㅆ written as ㅅ is accepted as well.
    # The vowel test must be grouped: without the grouping, ``and`` binds
    # tighter than ``or`` and every syllable with vowel ㅓ matched,
    # bypassing the initial and final consonant checks.
    if (l_cho != 'ㅇ'
            and l_jong in ('ㅆ', 'ㅅ', ' ')
            and l_jung in ('ㅕ', 'ㅓ')):
        add_lemma(l_front + compose(l_cho, 'ㅣ', ' '),
                  compose('ㅇ', 'ㅓ', l_jong) + r)

    # Pre-defined set
    if predefined and (l, r) in predefined:
        for stem in predefined[(l, r)]:
            candidates.add(stem)

    return candidates

예제 #18

0

파일 보기

파일: _conjugation.py 프로젝트: tobby2002/soynlp

def _conjugate_root(root):
    """Return the surface forms ``root`` may take once an ending is attached.

    Only the root is given, so every rule whose root-side condition holds
    contributes its form(s) regardless of which ending would follow.

    Args:
        root: Non-empty root string; its last syllable is decomposed.

    Returns:
        A set of strings that always contains ``root`` itself.
    """
    last = root[-1]
    cho, jung, jong = decompose(last)
    front = root[:-1]

    candidates = {root}

    # ㄷ irregular: 깨닫 + 아 -> 깨달아
    if jong == 'ㄷ':
        candidates.add(front + compose(cho, jung, 'ㄹ'))

    # 르 irregular: 구르 + 어 -> 굴러
    if last == '르' and len(root) >= 2:
        c0, c1, _ = decompose(root[-2])
        candidates.add(root[:-2] + compose(c0, c1, 'ㄹ'))

    # ㅂ irregular:
    # (vowel harmony) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if jong == 'ㅂ':
        candidates.add(front + compose(cho, jung, ' '))

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
    # 이 + ㅂ니다 -> 입니다
    if jong == ' ':
        for final in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            candidates.add(front + compose(cho, jung, final))

    # ㅅ irregular: 붓 + 어 -> 부어
    # exception : 벗 + 어 -> 벗어
    if jong == 'ㅅ' and last != '벗':
        candidates.add(front + compose(cho, jung, ' '))

    # 우 irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if jung == 'ㅜ' and jong == ' ':
        # 푸 contracts to ㅓ, every other syllable to ㅝ.  The 푸 case used
        # to assign a local and never add it, so 퍼/펐 were missing.
        vowel = 'ㅓ' if last == '푸' else 'ㅝ'
        candidates.add(front + compose(cho, vowel, ' '))
        candidates.add(front + compose(cho, vowel, 'ㅆ'))

    # 오 contraction: 오 + 았어 -> 왔어
    if jung == 'ㅗ' and jong == ' ':
        candidates.add(front + compose(cho, 'ㅘ', ' '))
        candidates.add(front + compose(cho, 'ㅘ', 'ㅆ'))

    # ㅡ-dropping irregular: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if last in ('끄', '크', '트'):
        candidates.add(front + compose(cho, 'ㅓ', ' '))
        candidates.add(front + compose(cho, 'ㅓ', 'ㅆ'))

    # 거라 / 너라 irregular: regular once '-거라/-너라' are treated as endings.

    # 러 irregular (이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다): the root
    # itself is unchanged.

    # 여 irregular:
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    # 하 + 았다 -> 했다
    if last == '하':
        candidates.add(front + '해')
        candidates.add(front + '했')

    if jong == 'ㅎ' and last != '좋':
        # ㅎ-dropping irregular: 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
        # (the ㅂ final is deliberately not generated)
        for final in (' ', 'ㄴ', 'ㄹ', 'ㅆ'):
            candidates.add(front + compose(cho, jung, final))
        # ㅎ-contraction irregular:
        # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
        # (only the ㅐ form is generated)
        candidates.add(front + compose(cho, 'ㅐ', 'ㅆ'))

    # ㅎ + 네: both dropping and keeping ㅎ are correct.

    return candidates

예제 #19

0

파일 보기

파일: _conjugation.py 프로젝트: songys/soynlp

def conjugate(stem, ending):
    """Attach ``ending`` to ``stem`` and return the candidate surface forms.

    Every conjugation rule whose conditions hold adds one form; when no
    rule fires and the ending starts with a full syllable, the plain
    concatenation ``stem + ending`` is returned instead.

    Args:
        stem: Non-empty stem; its last syllable is decomposed.
        ending: Non-empty ending.  It may start with a bare consonant
            (e.g. 'ㅂ니다'), which is merged into the stem's last syllable.

    Returns:
        A set of surface-form strings (possibly empty).

    Raises:
        AssertionError: if ``ending`` is empty.
    """
    assert ending # ending must be inserted

    l_len = len(stem)
    l_last_ = stem[-1]
    l_cho, l_jung, l_jong = decompose(l_last_)
    front = stem[:-1]
    r_cho, r_jung, r_jong = decompose(ending[0])
    # First syllable of the ending without its final consonant; a bare
    # consonant (no vowel) is kept unchanged.
    r_first_ = compose(r_cho, r_jung, ' ') if r_jung != ' ' else ending[0]
    rest = ending[1:]

    candidates = set()

    # ㄷ irregular: 깨닫 + 아 -> 깨달아
    if l_jong == 'ㄷ' and r_cho == 'ㅇ':
        candidates.add(front + compose(l_cho, l_jung, 'ㄹ') + ending)

    # 르 irregular: 구르 + 어 -> 굴러
    if l_last_ == '르' and r_first_ in ('아', '어') and l_len >= 2:
        c0, c1, _ = decompose(stem[-2])
        l = stem[:-2] + compose(c0, c1, 'ㄹ')
        r = compose('ㄹ', r_jung, r_jong) + rest
        candidates.add(l + r)

    # ㅂ irregular:
    # (vowel harmony) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if l_jong == 'ㅂ' and r_first_ in ('어', '아'):
        l = front + compose(l_cho, l_jung, ' ')
        # '꼽' is the suffix named in the rule above; '곱' is kept so that
        # stems the earlier list matched behave as before.
        breaks_harmony = l_len >= 2 and l_last_ in ('답', '곱', '꼽', '깝', '롭')
        c1 = 'ㅝ' if (breaks_harmony or r_first_ == '어') else 'ㅘ'
        candidates.add(l + compose('ㅇ', c1, r_jong) + rest)

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
    # 이 + ㅂ니다 -> 입니다
    if l_jong == ' ' and r_jung == ' ' and r_cho in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
        candidates.add(front + compose(l_cho, l_jung, r_cho) + rest)

    # ㅅ irregular: 붓 + 어 -> 부어
    # exception : 벗 + 어 -> 벗어
    if l_jong == 'ㅅ' and r_cho == 'ㅇ':
        if l_last_ == '벗':
            l = stem
        else:
            l = front + compose(l_cho, l_jung, ' ')
        candidates.add(l + ending)

    # 우 irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_jung == 'ㅜ' and l_jong == ' ' and r_cho == 'ㅇ' and r_jung == 'ㅓ':
        # 푸 contracts to ㅓ instead of ㅝ.  The stem prefix and the
        # ending's final consonant must be kept (똥푸 + 어 -> 똥퍼,
        # 푸 + 었다 -> 펐다); a bare '퍼' lost both.
        vowel = 'ㅓ' if l_last_ == '푸' else 'ㅝ'
        candidates.add(front + compose(l_cho, vowel, r_jong) + rest)

    # 오 contraction: 오 + 았어 -> 왔어
    if l_jung == 'ㅗ' and l_jong == ' ' and r_cho == 'ㅇ' and r_jung == 'ㅏ':
        candidates.add(front + compose(l_cho, 'ㅘ', r_jong) + rest)

    # ㅡ-dropping irregular: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if l_last_ in ('끄', '크', '트') and r_cho == 'ㅇ' and r_jung == 'ㅓ':
        candidates.add(front + compose(l_cho, r_jung, r_jong) + rest)

    # 거라 / 너라 irregular
    # (regular once '-거라/-너라' are treated as endings)
    if ending[:2] == '어라' or ending[:2] == '아라':
        if l_jung == 'ㅏ':
            r = '거' + rest
        elif l_jung == 'ㅗ':
            r = '너' + rest
        else:
            r = ending
        candidates.add(stem + r)

    # 러 irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다
    if l_last_ == '르' and r_cho == 'ㅇ' and r_jung == 'ㅓ':
        candidates.add(stem + compose('ㄹ', r_jung, r_jong) + rest)

    # 여 irregular: 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    if l_last_ == '하' and r_cho == 'ㅇ' and r_jung in ('ㅏ', 'ㅓ'):
        # case 1: the ending's vowel becomes ㅕ (하였다)
        candidates.add(stem + compose(r_cho, 'ㅕ', r_jong) + rest)
        # case 2: contraction into 해 / 했
        candidates.add(front + compose('ㅎ', 'ㅐ', r_jong) + rest)

    # ㅎ-dropping irregular: 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
    if l_jong == 'ㅎ' and l_last_ != '좋' and r_jung not in ('ㅏ', 'ㅓ'):
        if r_jung == ' ':
            # A bare consonant replaces ㅎ as the final consonant; the
            # remainder of the ending must follow (파랗 + ㅂ니다 ->
            # 파랍니다) rather than being discarded.
            l = front + compose(l_cho, l_jung, r_cho)
            r = rest
        else:
            l = front + compose(l_cho, l_jung, ' ')
            # A leading 으 is absorbed: 파랗 + 으면 -> 파라면
            r = rest if r_first_ == '으' else ending
        candidates.add(l + r)

    # ㅎ-contraction irregular:
    # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
    if l_jong == 'ㅎ' and l_last_ != '좋' and r_jung in ('ㅏ', 'ㅓ'):
        merged = 'ㅐ' if r_jung == 'ㅏ' else 'ㅔ'
        candidates.add(front + compose(l_cho, merged, r_jong) + rest)

    # ㅎ + 네: both dropping and keeping ㅎ are correct.
    if l_jong == 'ㅎ' and r_cho == 'ㄴ' and r_jung != ' ':
        candidates.add(stem + ending)

    # 이었 -> 였 regular contraction
    if ending[0] == '었' and l_jung == 'ㅣ' and l_jong == ' ':
        candidates.add(front + compose(l_cho, 'ㅕ', 'ㅆ') + rest)

    # No rule applied: fall back to plain concatenation, unless the ending
    # starts with a bare consonant that could not be attached.
    if not candidates and r_jung != ' ':
        candidates.add(stem + ending)

    return candidates

예제 #20

0

파일 보기

파일: _conjugation.py 프로젝트: songys/soynlp

def _conjugate_stem(stem):
    """Return the surface forms ``stem`` may take once an ending is attached.

    The ending is unknown here, so each rule is checked against the stem's
    last syllable only and contributes all forms it could produce.

    Args:
        stem: Non-empty stem string; its last syllable is decomposed.

    Returns:
        A set of strings that always contains ``stem`` itself.
    """
    l_last_ = stem[-1]
    l_last = decompose(l_last_)
    prefix = stem[:-1]

    def variant(vowel, final):
        # Prefix + last syllable rebuilt with the given vowel / final consonant.
        return prefix + compose(l_last[0], vowel, final)

    candidates = {stem}

    # ㄷ irregular: 깨닫 + 아 -> 깨달아
    if l_last[2] == 'ㄷ':
        candidates.add(variant(l_last[1], 'ㄹ'))

    # 르 irregular: 구르 + 어 -> 굴러
    if l_last_ == '르' and len(stem) >= 2:
        c0, c1, _ = decompose(stem[-2])
        candidates.add(stem[:-2] + compose(c0, c1, 'ㄹ'))

    # ㅂ irregular:
    # (vowel harmony) 더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (harmony broken) 아름답 + 아 -> 아름다워 / (-답, -꼽, -깝, -롭)
    if l_last[2] == 'ㅂ':
        candidates.add(variant(l_last[1], ' '))

    # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
    # 이 + ㅂ니다 -> 입니다
    if l_last[2] == ' ':
        candidates.update(variant(l_last[1], final)
                          for final in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'))

    # ㅅ irregular: 붓 + 어 -> 부어
    # exception : 벗 + 어 -> 벗어
    if l_last[2] == 'ㅅ' and l_last_ != '벗':
        candidates.add(variant(l_last[1], ' '))

    # 우 irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_last[1] == 'ㅜ' and l_last[2] == ' ':
        if l_last_ == '푸':
            # The 푸 case used to assign a local that was never added to
            # the result, so its contracted forms were missing.
            candidates.add(variant('ㅓ', ' '))
            candidates.add(variant('ㅓ', 'ㅆ'))
        else:
            candidates.add(variant('ㅝ', ' '))
            candidates.add(variant('ㅝ', 'ㅆ'))

    # 오 contraction: 오 + 았어 -> 왔어
    if l_last[1] == 'ㅗ' and l_last[2] == ' ':
        candidates.add(variant('ㅘ', ' '))
        candidates.add(variant('ㅘ', 'ㅆ'))

    # ㅡ-dropping irregular: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if l_last_ in ('끄', '크', '트'):
        candidates.add(variant('ㅓ', ' '))
        candidates.add(variant('ㅓ', 'ㅆ'))

    # 거라 / 너라 irregular: regular once '-거라/-너라' are treated as endings.

    # 러 irregular (이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다): the stem
    # itself is unchanged.

    # 여 irregular:
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    # 하 + 았다 -> 했다
    if l_last_ == '하':
        candidates.add(prefix + '해')
        candidates.add(prefix + '했')

    if l_last[2] == 'ㅎ' and l_last_ != '좋':
        # ㅎ-dropping irregular: 파랗 + 면 -> 파라면 / 동그랗 + ㄴ -> 동그란
        # (the ㅂ final is deliberately not generated)
        candidates.update(variant(l_last[1], final)
                          for final in (' ', 'ㄴ', 'ㄹ', 'ㅆ'))
        # ㅎ-contraction irregular:
        # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
        # (only the ㅐ form is generated)
        candidates.add(variant('ㅐ', 'ㅆ'))

    # ㅎ + 네: both dropping and keeping ㅎ are correct.

    # 이었 -> 였 regular contraction
    if l_last[1] == 'ㅣ' and l_last[2] == ' ':
        candidates.add(variant('ㅕ', 'ㅆ'))

    return candidates

예제 #21

0

파일 보기

    def _candidates(self, l, r):
        """Collect (root, ending) candidates for the split ``l`` + ``r``.

        Conjugation rules are undone one by one: whenever the last syllable
        of ``l`` (and, for some rules, the first syllable of ``r``) looks
        like the output of a rule, the restored pair is recorded.  Entries
        of ``self._predefined[(l, r)]`` are appended at the end.  The result
        always includes ``(l, r)`` itself.
        """
        found = {(l, r)}
        add = found.add

        cho, jung, jong = decompose(l[-1])
        # Last syllable of l without its final consonant.
        bare = compose(cho, jung, ' ')
        head = l[:-1]
        if r:
            r_cho, r_jung, r_jong = decompose(r[0])
            r_bare = compose(r_cho, r_jung, ' ')
        else:
            r_cho, r_jung, r_jong = '', '', ''
            r_bare = ' '

        # ㄷ irregular: 깨달 + 아 -> 깨닫 + 아
        if jong == 'ㄹ' and r_cho == 'ㅇ':
            add((head + compose(cho, jung, 'ㄷ'), r))

        # 르 irregular: 굴 + 러 -> 구르 + 어
        if jong == 'ㄹ' and r_bare in ('러', '라'):
            add((head + compose(cho, jung, ' ') + '르',
                 compose('ㅇ', r_jung, r_jong) + r[1:]))

        # ㅂ irregular: 더러 + 워서 -> 더럽 + 어서
        if jong == ' ' and r_bare in ('워', '와'):
            ending_vowel = 'ㅏ' if r_bare == '와' else 'ㅓ'
            add((head + compose(cho, jung, 'ㅂ'),
                 compose('ㅇ', ending_vowel, r_jong) + r[1:]))

        # The ending starts with a final consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ):
        # 입 + 니다 -> 이 + ㅂ니다
        if jong in ('ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            add((head + compose(cho, jung, ' '), jong + r))

        # ㅅ irregular: 부 + 어 -> 붓 + 어
        # exception : 벗 + 어 -> 벗어
        if jong == ' ' and l[-1] != '벗' and r_cho == 'ㅇ':
            add((head + compose(cho, jung, 'ㅅ'), r))

        # 우 irregular: 똥퍼 + '' -> 똥푸 + 어
        if bare == '퍼':
            add((head + '푸', compose('ㅇ', jung, jong) + r))

        # 우 irregular: 줬 + 어 -> 주 + 었어
        if jung == 'ㅝ':
            add((head + compose(cho, 'ㅜ', ' '),
                 compose('ㅇ', 'ㅓ', jong) + r))

        # 오 irregular: 왔 + 어 -> 오 + 았어
        if jung == 'ㅘ':
            add((head + compose(cho, 'ㅗ', ' '),
                 compose('ㅇ', 'ㅏ', jong) + r))

        # ㅡ-dropping irregular: 꺼 + '' -> 끄 + 어 / 텄 + 어 -> 트 + 었어
        if jung in ('ㅓ', 'ㅏ'):
            add((head + compose(cho, 'ㅡ', ' '),
                 compose('ㅇ', jung, jong) + r))

        # 거라 / 너라 irregular: regular once '-거라/-너라' are treated as
        # endings; nothing to do here.

        # 러 irregular (이르 + 러 -> 이르다): not implemented.

        # 여 irregular: 하 + 였다 -> 하 + 았다 -> 하다 is regular once '였다'
        # is treated as an ending.

        # 여 irregular (2):
        # 했 + 다 -> 하 + 았다 / 해 + 라니깐 -> 하 + 아라니깐 / 했 + 었다 -> 하 + 았었다
        if cho == 'ㅎ' and jung == 'ㅐ':
            add((head + '하', compose('ㅇ', 'ㅏ', jong) + r))

        # ㅎ-dropping irregular: 파라 + 면 -> 파랗 + 면
        if jong in (' ', 'ㄴ', 'ㄹ', 'ㅂ', 'ㅆ'):
            restored_root = head + compose(cho, jung, 'ㅎ')
            if jong == ' ':
                add((restored_root, r))
            else:
                add((restored_root, jong + r))

        # ㅎ-contraction irregular:
        # 시퍼렜 + 다 -> 시퍼렇 + 었다, 파랬 + 다 -> 파랗 + 았다
        if jung in ('ㅐ', 'ㅔ'):
            plain_vowel = 'ㅓ' if jung == 'ㅔ' else 'ㅏ'
            # exception : 그렇 + 아 -> 그래
            if len(l) >= 2 and l[-2] == '그' and cho == 'ㄹ':
                restored_root = head + '렇'
            else:
                restored_root = head + compose(cho, plain_vowel, 'ㅎ')
            add((restored_root, compose('ㅇ', plain_vowel, jong) + r))

        # Pre-defined set
        if (l, r) in self._predefined:
            for entry in self._predefined[(l, r)]:
                add(entry)

        return found

예제 #22

0

파일 보기

파일: _lemmatizer.py 프로젝트: parksjin01/soynlp-2.7

    def _candidates(self, l, r):
        """Collect dictionary-form candidates (root + '다') for ``l`` + ``r``.

        Each rule restores a possible root from the last syllable of ``l``
        (and, for some rules, the first syllable of ``r``); except for the
        거라 rule, a root is kept only when ``self.is_root`` accepts it.
        Entries of ``self._predefined[l + r]`` are appended at the end.
        """
        lemmas = set()

        def offer(root):
            # Record root + '다' when the root is known.
            if self.is_root(root):
                lemmas.add(root + '다')

        offer(l)

        cho, jung, jong = decompose(l[-1])
        # Last syllable of l without its final consonant.
        bare = compose(cho, jung, ' ')
        head = l[:-1]
        if r:
            r_cho, r_jung, r_jong = decompose(r[0])
            r_bare = compose(r_cho, r_jung, ' ')
        else:
            r_cho, r_jung, r_jong = '', '', ''
            r_bare = ' '

        ## 1. Irregular conjugations that change the stem
        # 1.1. ㄷ irregular: 깨닫 + 아 -> 깨달아
        if jong == 'ㄹ' and r_cho == 'ㅇ':
            offer(head + compose(cho, jung, 'ㄷ'))

        # 1.2. 르 irregular: 굴 + 러 -> 구르다
        if jong == 'ㄹ' and r_bare in ('러', '라'):
            offer(head + compose(cho, jung, ' ') + '르')

        # 1.3. ㅂ irregular: 더러 + 워서 -> 더럽다
        if jong == ' ' and r_bare == '워':
            offer(head + compose(cho, jung, 'ㅂ'))

        # 1.3. ㅂ irregular: 도 + 왔다 -> 돕다
        if l in ('도', '고') and r_bare == '와':
            offer(compose(cho, jung, 'ㅂ'))

        # 1.3. (extra) added ㅂ: 입 + 니다 -> 이다, 합 + 니다 -> 하다
        # NOTE(review): only the last syllable is used here, without the
        # preceding part of ``l`` -- confirm this is intended.
        if jong == 'ㅂ':
            offer(compose(cho, jung, ' '))

        # 1.4. ㅅ irregular: 부 + 었다 -> 붓다
        if jong == ' ' and r_cho == 'ㅇ':
            offer(head + compose(cho, jung, 'ㅅ'))

        # 1.5. 우 irregular: 똥퍼 + '' -> 똥푸다
        if bare == '퍼':
            offer(head + '푸')

        # 1.5. 우 irregular: 줬 + 어 -> 주다
        if jung == 'ㅝ':
            offer(head + compose(cho, 'ㅜ', ' '))

        # 1.6. ㅡ-dropping irregular: 꺼 + '' -> 끄다 / 텄 + 어 -> 트다
        if jung in ('ㅓ', 'ㅏ'):
            offer(head + compose(cho, 'ㅡ', ' '))

        ## 2. Irregular conjugations that change the ending
        # 2.1. 거라 irregular (added without consulting is_root)
        if l[-1] == '가' and (r and (r[0] == '라' or r[:2] == '거라')):
            lemmas.add(l + '다')

        # 2.2. 너라 irregular
        # 2.2.1: regular cases: 돌아오 + 너라 -> 돌아오다, 돌아오 + 라고 -> 돌아오다
        # 2.2.2: 돌아 + 왔다 -> 돌아오다
        if jung == 'ㅘ':
            offer(head + compose(cho, 'ㅗ', ' '))

        # 2.3. 러 irregular: 이르 + 러 -> 이르다
        if r_cho == 'ㄹ' and r_jung == 'ㅓ':
            offer(l)

        # 2.4. 여 irregular
        # 하 + 였다 -> 하 + 았다 -> 하다: solved by listing '였다' as an ending.

        # 2.5. 오 irregular
        # 달 + 아라 -> 다오, 걸 + 어라 -> 거오: common in literary style and
        # rare in colloquial text; skipped.

        ## 3. Irregular conjugations that change both stem and ending
        # 3.1. ㅎ irregular
        # 3.1.1: 파라 + 면 -> 파랗다
        if jong == ' ':
            offer(head + compose(cho, jung, 'ㅎ'))

        # 3.1.2. 시퍼렜 + 다 -> 시퍼렇다, 파랬 + 다 -> 파랗다, 파래 + '' -> 파랗다
        if jung in ('ㅐ', 'ㅔ'):
            plain_vowel = 'ㅓ' if jung == 'ㅔ' else 'ㅏ'
            offer(head + compose(cho, plain_vowel, 'ㅎ'))

        # (extra) 3.2. l ends in ㄴ / ㄹ and nothing follows:
        # 간 + '' -> 가다, 푸른 + '' -> 푸르다, 한 + '' -> 하다, 이른 + '' -> 이르다
        if (not r) and jong in ('ㄴ', 'ㄹ'):
            offer(head + compose(cho, jung, ' '))
            # 노란 -> 노랗다
            offer(head + compose(cho, jung, 'ㅎ'))

        ## Pre-defined set
        surface = l + r
        if surface in self._predefined:
            for entry in self._predefined[surface]:
                lemmas.add(entry)

        return lemmas

예제 #23

0

파일 보기

파일: _conjugation.py 프로젝트: tobby2002/soynlp

def conjugate(root, ending):
    """Generate candidate surface forms for a Korean stem joined to an ending.

    Each rule below inspects the last syllable of ``root`` and the first
    character of ``ending``; when a rule matches, it adds one conjugated
    surface form to the result. Rules are independent, so several of them
    may fire for the same input and more than one candidate can be returned.

    Args:
        root: Non-empty stem string, e.g. '깨닫' or '구르'.
        ending: Non-empty ending string, e.g. '아', '었다' or 'ㅂ니다'. The
            first character may be a bare consonant, which is then merged
            into the last syllable of the stem as its final consonant.

    Returns:
        A set of candidate surface strings. The set is empty when no rule
        matches and the first character of ``ending`` has no vowel.

    Raises:
        AssertionError: if ``ending`` is empty.
    """

    assert ending, 'ending must be a non-empty string'

    l_len = len(root)
    # (initial, vowel, final) of the stem's last syllable, and the syllable itself
    l_last = decompose(root[-1])
    l_last_ = root[-1]
    # (initial, vowel, final) of the ending's first character
    r_first = decompose(ending[0])
    # First syllable of the ending with its final consonant removed; a bare
    # consonant (no vowel) is kept as-is.
    r_first_ = compose(r_first[0], r_first[1], ' ') if r_first[1] != ' ' else ending[0]

    candidates = set()

    # ㄷ-irregular: final ㄷ becomes ㄹ before an ending starting with ㅇ
    # (the example in the original comment is 깨달 + 아 -> 깨달아)
    if l_last[2] == 'ㄷ' and r_first[0] == 'ㅇ':
        l = root[:-1] + compose(l_last[0], l_last[1], 'ㄹ')
        candidates.add(l + ending)

    # 르-irregular: 구르 + 어 -> 굴러
    # '르' is dropped, ㄹ is attached to the preceding syllable, and the
    # ending's initial ㅇ becomes ㄹ.
    if (l_last_ == '르') and (r_first_ == '아' or r_first_ == '어') and l_len >= 2:
        c0, c1, c2 = decompose(root[-2])
        l = root[:-2] + compose(c0, c1, 'ㄹ')
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        candidates.add(l + r)

    # ㅂ-irregular:
    # (vowel harmony kept)   더럽 + 어 -> 더러워 / 곱 + 아 -> 고와
    # (vowel harmony broken) 아름답 + 아 -> 아름다워 (suffixes -답, -곱, -깝, -롭)
    # NOTE(review): the original comment listed '-꼽' while the code checks
    # '곱'; the code is kept unchanged here -- confirm which one is intended.
    if (l_last[2] == 'ㅂ') and (r_first_ == '어' or r_first_ == '아'):
        l = root[:-1] + compose(l_last[0], l_last[1], ' ')
        if l_len >= 2 and (l_last_ == '답' or l_last_ == '곱' or l_last_ == '깝' or l_last_ == '롭'):
            c1 = 'ㅝ'
        elif r_first_ == '어':
            c1 = 'ㅝ'
        else:  # r_first_ == '아'
            c1 = 'ㅘ'
        # The former checks for an ending vowel of ㅗ/ㅜ were unreachable:
        # the guard above only admits endings whose first vowel is ㅓ or ㅏ.
        r = compose('ㅇ', c1, r_first[2]) + ending[1:]
        candidates.add(l + r)

    # Ending starts with a bare consonant (-ㄴ, -ㄹ, -ㅂ, -ㅆ) and the stem has
    # no final consonant: the consonant becomes the stem's final consonant.
    # 이 + ㅂ니다 -> 입니다
    if l_last[2] == ' ' and r_first[1] == ' ' and (r_first[0] == 'ㄴ' or r_first[0] == 'ㄹ' or r_first[0] == 'ㅂ' or r_first[0] == 'ㅆ'):
        l = root[:-1] + compose(l_last[0], l_last[1], r_first[0])
        r = ending[1:]
        candidates.add(l + r)

    # ㅅ-irregular: 붓 + 어 -> 부어
    # exception: 벗 + 어 -> 벗어 (regular)
    if (l_last[2] == 'ㅅ') and (r_first[0] == 'ㅇ'):
        if root[-1] == '벗':
            l = root
        else:
            l = root[:-1] + compose(l_last[0], l_last[1], ' ')
        candidates.add(l + ending)

    # 우-irregular: 푸 + 어 -> 퍼 / 주 + 어 -> 줘
    if l_last[1] == 'ㅜ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ':
        if l_last_ == '푸':
            # Keep the rest of the root and carry over the ending's final
            # consonant so that 푸 + 었다 -> 펐다 (previously '퍼다').
            l = root[:-1] + compose('ㅍ', 'ㅓ', r_first[2])
        else:
            l = root[:-1] + compose(l_last[0], 'ㅝ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # 오 contraction: 오 + 았어 -> 왔어
    if l_last[1] == 'ㅗ' and l_last[2] == ' ' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅏ':
        l = root[:-1] + compose(l_last[0], 'ㅘ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅡ-drop: 끄 + 어 -> 꺼 / 트 + 었다 -> 텄다
    if (l_last_ == '끄' or l_last_ == '크' or l_last_ == '트') and (r_first[0] == 'ㅇ') and (r_first[1] == 'ㅓ'):
        l = root[:-1] + compose(l_last[0], r_first[1], r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # 거라 / 너라 irregular
    # (treating '-거라/-너라' as endings themselves would make this regular)
    if ending[:2] == '어라' or ending[:2] == '아라':
        if l_last[1] == 'ㅏ':
            r = '거' + ending[1:]
        elif l_last[1] == 'ㅗ':
            r = '너' + ending[1:]
        else:
            r = ending
        candidates.add(root + r)

    # 러-irregular: 이르 + 어 -> 이르러 / 이르 + 었다 -> 이르렀다
    if l_last_ == '르' and r_first[0] == 'ㅇ' and r_first[1] == 'ㅓ':
        r = compose('ㄹ', r_first[1], r_first[2]) + ending[1:]
        candidates.add(root + r)

    # 여-irregular
    # 하 + 았다 -> 하였다 / 하 + 었다 -> 하였다
    if l_last_ == '하' and r_first[0] == 'ㅇ' and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        # case 1: the ending's vowel becomes ㅕ (하였다)
        r = compose(r_first[0], 'ㅕ', r_first[2]) + ending[1:]
        candidates.add(root + r)
        # case 2: contracted form with ㅐ (했다)
        l = root[:-1] + compose('ㅎ', 'ㅐ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅎ-drop irregular
    # 파랗 + 으면 -> 파라면 / 동그랗 + ㄴ -> 동그란
    if l_last[2] == 'ㅎ' and l_last_ != '좋' and not (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        if r_first[1] == ' ':
            # Bare-consonant ending: the consonant replaces ㅎ as the final
            # consonant and the remainder of the ending must follow
            # (파랗 + ㅂ니다 -> 파랍니다). The remainder used to be dropped.
            l = root[:-1] + compose(l_last[0], l_last[1], r_first[0])
            r = ending[1:]
        else:
            l = root[:-1] + compose(l_last[0], l_last[1], ' ')
            # A leading '으' of the ending is dropped together with ㅎ.
            r = ending[1:] if r_first_ == '으' else ending
        candidates.add(l + r)

    # ㅎ-contraction irregular
    # 파랗 + 았다 -> 파랬다 / 시퍼렇 + 었다 -> 시퍼렜다
    if l_last[2] == 'ㅎ' and l_last_ != '좋' and (r_first[1] == 'ㅏ' or r_first[1] == 'ㅓ'):
        l = root[:-1] + compose(l_last[0], 'ㅐ' if r_first[1] == 'ㅏ' else 'ㅔ', r_first[2])
        r = ending[1:]
        candidates.add(l + r)

    # ㅎ + 네: both the ㅎ-dropped form and the ㅎ-kept form are valid,
    # so the plain concatenation is added as well.
    if l_last[2] == 'ㅎ' and r_first[0] == 'ㄴ' and r_first[1] != ' ':
        candidates.add(root + ending)

    # No rule fired: fall back to plain concatenation, unless the ending
    # starts with a bare consonant that cannot stand on its own.
    if not candidates and r_first[1] != ' ':
        candidates.add(root + ending)

    return candidates