示例#1
0
def test_add_rule():
    """A regex rule with score=0 must not change analysis scores, while a
    rule with score=-1 lowers the total by 1 per matched ending."""
    kiwi = Kiwi()
    _, baseline_score = kiwi.analyze("했어요! 하잖아요! 할까요?")[0]

    # Zero-cost rule: the rewritten endings analyze with an unchanged score.
    assert len(kiwi.add_re_rule("EF", r"요$", "용", score=0)) > 0
    _, zero_cost_score = kiwi.analyze("했어용! 하잖아용! 할까용?")[0]
    assert zero_cost_score == baseline_score

    # Penalized rule: three matches in the sentence, each costing 1 point.
    kiwi = Kiwi()
    assert len(kiwi.add_re_rule("EF", r"요$", "용", score=-1)) > 0
    _, penalized_score = kiwi.analyze("했어용! 하잖아용! 할까용?")[0]
    assert penalized_score == baseline_score - 3
示例#2
0
def test_analyze_single():
    """Analyze a corpus line by line, then smoke-test the Token attributes
    on the first token of the last analyzed line."""
    kiwi = Kiwi()
    # Close the corpus deterministically — the original leaked the file
    # handle until garbage collection.
    with open(curpath + '/test_corpus/constitution.txt',
              encoding='utf-8') as corpus:
        for line in corpus:
            toks, score = kiwi.analyze(line)[0]
    # `toks` holds the result of the final line only; inspect one token.
    for t in toks:
        print(t.form, t.tag, t.start, t.end, t.len, t.id, t.base_form,
              t.base_id)
        break
def index(request):
    """Django view: build a word cloud and a top-9 keyword list from the
    user's Chrome search history.

    Copies the Chrome History SQLite DB (the live DB is locked while
    Chrome runs), extracts noun/foreign tokens with Kiwi, renders a
    word-cloud PNG, and passes up to 9 most frequent words to the template.
    """
    context = {}

    # Path to Chrome's History DB under the current Windows user profile.
    homepath = os.path.expanduser("~")
    abs_chrome_path = os.path.join(homepath, 'AppData', 'Local', 'Google', 'Chrome', 'User Data', 'Default', 'History')
    # Work on a copy of the History file.
    shutil.copyfile(abs_chrome_path, abs_chrome_path + "_sample")
    # Extract the search terms; close the connection so the copied file
    # is not left locked (the original never closed it).
    con = sqlite3.connect(abs_chrome_path + "_sample")
    try:
        cursor = con.cursor()
        cursor.execute("SELECT term FROM keyword_search_terms")
        term_data = cursor.fetchall()
    finally:
        con.close()

    # Morphological analysis: keep common/proper/bound nouns and foreign words.
    kiwi = Kiwi()
    kiwi.prepare()
    word_list = []
    for term in term_data:
        for word, tag, _, _ in kiwi.analyze(term[0], top_n=1)[0][0]:
            if tag in ['NNG', 'NNP', 'NNB', 'SL']:
                word_list.append(word)

    # Frequency count, most common first.
    counts = Counter(word_list)
    tags = counts.most_common()

    # Render the word cloud onto the mask image.
    mask = plt.imread("./static/images/mask.jpg")
    wc = WordCloud(font_path='./static/webfonts/NanumBarunGothicBold.ttf',
                   background_color='white',
                   width=800,
                   height=800,
                   mask=mask)

    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud, interpolation="bilinear")
    plt.savefig("./static/images/wordcloud_keyword.png", dpi=300, bbox_inches='tight')
    plt.close()  # free the figure; a long-running server leaks memory otherwise

    # Top 9 words — fewer if the history has fewer distinct words
    # (the original raised IndexError in that case).
    top9_list = []
    for rank in range(min(9, len(tags))):
        top9_list.append({
            'rank': rank + 1,
            'word': tags[rank][0],
            'count': tags[rank][1],
        })

    context['top9'] = top9_list

    return render(request, 'mainapp/index.html', context)
示例#4
0
def test_kiwi(environ):
    """Cross-check Kiwi's (form, tag) pairs against the reference tagger."""
    _, tagger = environ
    kiwi = Kiwi()
    kiwi.prepare()

    for _, line in EXAMPLES:
        reference = tagger.tagSentence(line)[0]
        analyzed = kiwi.analyze(line)

        expected = [(m.surface, m.originalTag) for w in reference for m in w]
        actual = [m[:2] for m in analyzed[0][0]]

        assert expected == actual
示例#5
0
def test_stopwords():
    """Exercise the default and file-based Stopwords plus add/remove."""
    kiwi = Kiwi()
    tokens, _ = kiwi.analyze('불용어 처리 테스트 중입니다 '
                             '우리는 강아지를 좋아한다 쟤도 강아지를 좋아한다 '
                             '지금은 2021년 11월이다.')[0]
    stopwords = Stopwords()
    print(set(tokens) - set(stopwords.filter(tokens)))
    filename = curpath + '/test_corpus/custom_stopwords.txt'
    stopwords = Stopwords(filename)

    # Plain membership assertions; comparing `in` against True/False was
    # redundant and non-idiomatic.
    stopwords.add(('강아지', 'NNP'))
    assert ('강아지', 'NNP') in stopwords

    stopwords.remove(('강아지', 'NNP'))
    assert ('강아지', 'NNP') not in stopwords
    print(set(tokens) - set(stopwords.filter(tokens)))
示例#6
0
class kiwi_dictionary_n_fuction:
    """Convenience wrapper around Kiwi loaded with a user dictionary.

    All helpers delegate to generate_morp_word(), which produces four
    parallel token strings per analysis candidate: all tokens, nouns,
    verbs/adjectives, and non-josa tokens.
    """

    def __init__(self, path):
        """Load Kiwi with the default dictionary plus the user dictionary at *path*."""
        self.kiwi = Kiwi(options=Option.LOAD_DEFAULT_DICTIONARY
                         | Option.INTEGRATE_ALLOMORPH)
        self.kiwi.load_user_dictionary(path)
        self.kiwi.prepare()

        # POS tags treated as josa (postpositions); filtered out by the
        # no-josa helpers below.
        self.josa = [
            'JK', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC'
        ]

    def get_noun(self, sen):
        """Return the noun token string (also cached on self.nn_list)."""
        _, self.nn_list, _, _ = self.generate_morp_word(sen, 1)
        return self.nn_list

    def get_all_token(self, sen):
        """Return the whole sentence as a list of space-joined token strings."""
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        return morp_list

    def get_token_str(self, sen):
        """Tokenize the sentence and return it as a single string."""
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        return ''.join(morp_list)

    def get_vv(self, sen):
        """Return the verb/adjective token string."""
        _, _, vv_list, _ = self.generate_morp_word(sen, 1)
        return vv_list

    def get_nn_list(self, sen):
        """Return the noun token string.

        NOTE(review): identical to get_noun() except it does not set
        self.nn_list; kept separate to preserve that difference.
        """
        _, nn_list, _, _ = self.generate_morp_word(sen, 1)
        return nn_list

    def get_no_josa_token(self, sen):
        """Return the sentence as a string with josa tokens removed.

        Example output: "관찰 가능 하 고 처리 가능 하 ᆫ 범위 내 문장 ..."
        """
        _, _, _, no_josa_list = self.generate_morp_word(sen, 1)
        return ''.join(no_josa_list)

    def k_pos(self, sentence):
        """Return a list of (form, tag) tuples for the best analysis.

        Example: [('관찰', 'NNG'), ('가능', 'NNG'), ('하', 'XSA'), ...]
        """
        result = self.kiwi.analyze(sentence, 1)
        return [(token[0], token[1]) for token in result[0][0]]

    def k_analyze(self, sentence):
        """Return the raw Kiwi analysis result."""
        return self.kiwi.analyze(sentence, 1)

    def k_morphs(self, sen):
        """Return only the surface forms as a list, e.g. ['관찰', '가능', ...]."""
        result = self.kiwi.analyze(sen, 1)
        return [token[0] for token in result[0][0]]

    def generate_morp_word(self, sentence, analyze_num):
        """Analyze *sentence* and build per-candidate token strings.

        Returns four parallel lists (one entry per analysis candidate):
        all tokens, noun tokens, verb tokens and non-josa tokens, each a
        single string with a trailing space after every token.

        On any analysis error prints the exception and implicitly returns
        None — callers that unpack will then raise TypeError. Preserved
        from the original; flagged for review.
        """
        try:
            result = self.kiwi.analyze(sentence, analyze_num)
            morp_word_list = []
            morp_nn_list = []
            morp_vv_list = []
            morp_not_josa_list = []
            for i in range(analyze_num):
                words = []
                nouns = []
                verbs = []
                non_josa = []
                for token in result[i][0]:
                    form, tag = token[0], token[1]
                    words.append(form)
                    if tag in self.josa:
                        continue
                    non_josa.append(form)
                    if tag in ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']:
                        nouns.append(form)
                    elif tag in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                        verbs.append(form)
                # ''.join of "token + space" reproduces the original
                # trailing space while avoiding quadratic += concatenation.
                morp_word_list.append(''.join(w + ' ' for w in words))
                morp_nn_list.append(''.join(w + ' ' for w in nouns))
                morp_vv_list.append(''.join(w + ' ' for w in verbs))
                morp_not_josa_list.append(''.join(w + ' ' for w in non_josa))

            return morp_word_list, morp_nn_list, morp_vv_list, morp_not_josa_list

        except Exception as e:
            print(e)
            print("### ERROR 형태소 분석기 부분 에 뭐가 잘못된게 있는듯 ERROR ### ")

    def __del__(self):
        print("EXIT kiwi")
示例#7
0
def test_bug_38():
    """Regression check for issue #38: analysis must run both with and
    without allomorph integration."""
    sample = "이 예쁜 꽃은 독을 품었지만 진짜 아름다움을 가지고 있어요"
    for integrate in (True, False):
        analyzer = Kiwi(integrate_allomorph=integrate)
        print(analyzer.analyze(sample))
示例#8
0
def test_bug_33():
    """Regression check for issue #33: a user-registered word repeated in
    the input must not crash the analyzer."""
    analyzer = Kiwi()
    analyzer.add_user_word('김갑갑', 'NNP')
    print(analyzer.analyze("김갑갑 김갑갑 김갑갑"))
示例#9
0
def test_new_analyze_multi():
    """Streaming analysis over a file object must consume every line."""
    kiwi = Kiwi()
    # Close the corpus deterministically — the original leaked the handle.
    with open(curpath + '/test_corpus/constitution.txt',
              encoding='utf-8') as corpus:
        for res in kiwi.analyze(corpus):
            pass
示例#10
0
def test_tweet():
    """Emoji- and hashtag-heavy tweet text must analyze without crashing."""
    tweet = '''#바둑#장기#오목 귀요미#보드판🐥
#어린이임블리의 놀이였는데, 이제는 가물갸물🙄모르겠
장이요~멍이요~ㅎㅎㅎ다시 한 번 재미를 붙여 보까ㅎ
할 일이 태산인데😭, 하고 싶은건 무궁무진🤦‍♀️ 큰 일이다'''
    kiwi = Kiwi()
    kiwi.analyze(tweet)
示例#11
0
class Changer(object):
    """Converts Korean sentences between formal and informal speech by
    token-level dictionary substitution followed by jaso-level repair."""

    def __init__(self):
        try:
            self.kiwi = Kiwi()
            self.kiwi.prepare()
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not swallowed.
        except Exception:
            print("[INFO] please install kiwipiepy   ")

        self.replace = formaldic()
        self.utils = Utils()

    def _restructure_jaso(self, changed):
        """Merge dangling single-jamo syllables (ㅂ/ㄴ/ㄹ) produced by the
        dictionary substitution back into proper syllable blocks.

        This logic was duplicated verbatim in dechanger() and changer();
        it is now shared. Returns *changed* untouched when no dangling
        jamo is present.
        """
        one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
        if not one_char.search(changed):
            return changed
        words = changed.split('ᴥ')
        for idx in range(1, len(words)):
            # Previous syllable has no final consonant: merge into it.
            if len(words[idx]) == 1 and len(words[idx - 1].replace('|', "")) == 2:
                words[idx - 1] = words[idx - 1] + words[idx]
                words[idx] = ""
            # Previous syllable already has a final consonant.
            elif len(words[idx]) == 1 and len(words[idx - 1].replace('|', "")) == 3:
                shp = ['ㅆ', 'ㅍ', 'ㄱ', 'ㅄ', 'ㄶ']
                if words[idx] == 'ㅂ':
                    if words[idx - 1][-1] in shp:
                        words[idx] = "|습" if words[idx].count("|") > 0 else "습"
                        continue
                    else:
                        words[idx] = "|입" if words[idx].count("|") > 0 else "입"
                elif words[idx] == 'ㄴ' and words[idx - 1].endswith('ㄹ'):
                    if words[idx - 1].count("|") > 0:
                        words[idx - 1] = "|" + words[idx - 1].replace("|", "")[:2] + words[idx]
                    else:
                        words[idx - 1] = words[idx - 1][:2] + words[idx]
                    # Consumed into the previous syllable.
                    words[idx] = ""
                elif words[idx] == 'ㄹ':
                    words[idx] = "|일" if words[idx].count("|") > 0 else "일"

        # The original filtered with `x is not ""` — an identity test on a
        # string literal (fragile, SyntaxWarning on 3.8+); truthiness is
        # the correct equivalent here.
        return "ᴥ".join([x for x in words if x]) + "ᴥ"

    def dechanger(self, stc):
        """
        change formal speech to informal
        Args : str
        """
        pattern = r'하세요|이예요|이에요|에요|예요|시겠어요|죠|합니까|습니까'
        pattern = re.compile(pattern)

        result = []

        stc = self.utils._remove_blank(stc)
        stc = self.utils._clean_up_tokenization(stc)

        if len(re.findall(pattern, stc)) > 0:
            # '|' marks word boundaries so spacing survives decomposition.
            tokens = self.kiwi.analyze(stc.replace(" ", "|"))

            lk = list(informaldic().keys())
            ak = list(abnormaldic().keys())

            tmp = []
            for token in tokens[0][0]:
                if token[:2] in lk:
                    # Replace the (form, tag) key with its dictionary value.
                    token = informaldic().get(token[:2])
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                tmp.append(token)

            # Decompose every replacement into jaso ordering.
            changed = ''
            for t in tmp:
                if isinstance(t[0], tuple):
                    for i in range(len(t[0])):
                        changed += hgtk.text.decompose(t[i][0])
                else:
                    changed += hgtk.text.decompose(t[0])

            changed = self._restructure_jaso(changed)
            # For cases which weren't covered,
            changed = self._makePretty(changed)
            changed = hgtk.text.compose(changed).replace("|", " ")
            # Exception handling for empty results.
            # NOTE(review): re.sub removes *every* '요', not only the final
            # one — preserved from the original.
            try:
                if changed[-1] == '요':
                    changed = re.sub('요', '', changed)
                changed = re.sub('그렇죠', '', changed)
            except Exception:
                pass
            result.append(changed)

        else:
            try:
                result.append(stc)
            except Exception:
                pass
        return result[0]

    def _makePretty(self, line):
        """
        Convert the jaso orderings which wasn't properly covered by
        Jaso restructuring process of function Mal_Gillge_Haeraing
        :param line: jaso orderings which wasn't properly covered
        :return: Converted jaso ordering
        """
        test = line
        test = test.replace("ᴥㅎㅏᴥㅇㅏᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏᴥㅇㅛᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㄴㅣᴥㄷㅏᴥ", "ㅎㅏㅂᴥㄴㅣᴥㄷㅏᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄴㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄱㅏᴥㅇㅏㅆᴥ", "ᴥㄱㅏㅆᴥ")
        test = test.replace("ㅇㅣᴥㄴㅣᴥ", "ᴥㄴㅣᴥ")
        test = test.replace("ㄴㅓㄹㄴᴥ", "ㄴㅓㄴᴥ")
        test = test.replace("ㄱㅡᴥㄹㅓㅎᴥㅇㅓᴥ", "ㄱㅡᴥㄹㅐᴥ")
        test = test.replace("ㅡᴥㅇㅏᴥ", "ㅏᴥ")
        test = test.replace("ㄱㅓㄹᴥㄴㅏᴥㅇㅛᴥ", "ㄱㅓㄴᴥㄱㅏᴥㅇㅛᴥ")
        return test

    def changer(self, text):
        """
        change informal speech to formal speech
        Args : str
        """
        tokens = self.kiwi.analyze(text.replace(" ", "|"))

        lk = list(formaldic().keys())
        ak = list(abnormaldic().keys())

        num = len(tokens[0][0])
        result = []
        for idx, token in enumerate(tokens[0][0]):
            # Only the last ~20% of tokens (sentence ending) get the
            # formal-speech mapping; earlier tokens only get the
            # abnormal-form normalization.
            if idx > int(num * 0.8):
                if token[:2] in lk:
                    # Replace the (form, tag) key with its dictionary value.
                    token = formaldic().get(token[:2])
                    result.append(token)
                else:
                    if token[:2] in ak:
                        token = abnormaldic().get(token[:2])
                        result.append(token)
                    else:
                        result.append(token[:2])
            else:
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                    result.append(token)
                else:
                    result.append(token[:2])

        # change tuple to text
        changed = ''
        for t in result:
            if isinstance(t[0], tuple):
                for i in range(len(t[0])):
                    changed += hgtk.text.decompose(t[i][0])
            else:
                changed += hgtk.text.decompose(t[0])

        # Restructuring sentence from jaso ordering.
        changed = self._restructure_jaso(changed)
        # For cases which weren't covered,
        changed = self._makePretty(changed)
        changed = hgtk.text.compose(changed).replace("|", " ")
        return changed

    def addData(self, key, val):
        """
        Add new data to dictionary, changer dictionary update
        :param key: key to be added into Dictionary self.replace
        :param val: Value to be added into Dictionary self.replace
        :return: None
        """
        with open('dictionary.py', 'r', encoding='utf-8') as f:
            data = f.read()

        # Append the new entry before the closing brace of the literal.
        lines = data.split("\n")
        lines[-2] += ','
        lines[-1] = "                    " + str(key) + ": " + str(val)
        with open('dictionary.py', 'w', encoding='utf-8') as f:
            for i in range(len(lines)):
                f.write(lines[i] + "\n")
            f.write("                    }")

    def checker(self, result):
        """
        Check the abnormal sentences and remove them.
        Args : result, updated, idx : list
        """
        updated = []
        idxes = []
        normal = ['요', '까', '다', '죠', '가']
        for idx, stc in enumerate(result):
            try:
                if stc[-1] not in normal:
                    print(f"[INFO] Abnormal Sentence, remove {idx}....")
                    idxes.append(idx)
                else:
                    updated.append(stc)
            # Empty strings raise IndexError on stc[-1]; treat as abnormal.
            except Exception:
                idxes.append(idx)

        return updated, idxes
示例#12
0
kiwi.prepare()

# Input path: first CLI argument, falling back to example.txt.
# BUG FIX: the original assigned the argument to an unused `file`
# variable, so the command-line override never took effect.
file_path = 'example.txt'
if len(sys.argv) > 1 and sys.argv[1]:
    file_path = sys.argv[1]
file_name, ext = file_path.split('.')

# Only common/proper/bound nouns are counted.
valid_token_set = {'NNG', 'NNP', 'NNB'}

print('=====')

result = Counter()

with open(file_path, encoding='utf-8') as f:
    text = f.read()
    tokens = kiwi.analyze(text)[0][0]
    result = Counter([
        (token[0], token[1])
        for token in tokens
        if token[1] in valid_token_set
    ])

output_path = '.'.join([file_name, 'csv'])
# newline='' is required by the csv module (avoids blank rows on Windows);
# utf-8 keeps the Korean header portable across locales.
with open(output_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['형태소', '품사', 'count'])
    for key, value in result.items():
        writer.writerow([key[0], key[1], value])

    print(file_path, '>>', output_path, '추출 완료.')
示例#13
0
rsc = r'E:\Programming\python\창회선배스터디\Morpheme_Cloud\자료\토지2.txt'

# Tag the corpus line by line; each sentence becomes a tuple of
# "form/tag" strings.
tagged_temp = []

# Use a context manager so the corpus file is closed — the original
# codecs.open handle was never closed.
with codecs.open(rsc, 'r', encoding='utf-8') as target_corpus:
    for i in target_corpus:
        i = i.strip()
        # Keep only the token list of each analysis candidate.
        temp_tagging = [x[0] for x in tagger.analyze(i, top_n=1)]
        inner_temp = ["{}/{}".format(word, tag) for word, tag, score1, score2 in temp_tagging[0]]
        tagged_temp.append(tuple(inner_temp))

print(tagged_temp[:3])

tagged_list = []

# for i in tagged_temp:
#     if '/V' in i:
#         i = i.replace('/V','다/V')
#         tagged_list.append(tuple(i.split('/')))
#     elif '/A' in i:
#         i.replace('/A','다/A')
#         tagged_list.append (tuple (i.split ('/')))
#     else:
示例#14
0
from kiwipiepy import Kiwi, __version__

# Minimal interactive analyzer: read lines until EOF and print the
# candidate analyses for each.
print("kiwipiepy v{}".format(__version__))
kiwi = Kiwi()
try:
    while True:
        line = input('>>')
        candidates = kiwi.analyze(line)[0]
        print(candidates)
except EOFError:
    pass
示例#15
0
        self.output.write(' '.join(map(lambda x:x[0]+'/'+x[1], res[0][0])) + '\n')

    def __del__(self):
        # Close both streams when the handler is garbage-collected.
        # NOTE(review): __del__ runs at an unspecified time (possibly never
        # on interpreter shutdown); an explicit close() or a context
        # manager would be more reliable.
        self.input.close()
        self.output.close()

# Build a Kiwi instance with a user dictionary plus one extra word,
# then run a single-sentence analysis.
kiwi = Kiwi()
kiwi.load_user_dictionary(r'./server_project/test/userDict.txt')
kiwi.add_user_word('iXVDR', 'NNP', 3.0)

kiwi.prepare()
# handle = IOHandler(r'./server_project/test/input.txt', r'./server_project/test/result.txt')
# kiwi.analyze(handle.read, handle.write)


result = kiwi.analyze('강남에서 먹었던 오늘의 스파게티는 맛있었다.', 1)
for candidate in result:
    print(candidate)

class ReaderExam:
    """Line-reader callback for Kiwi's word-extraction API.

    read(id) returns the next line of the file; when id == 0 the file is
    rewound so an extraction pass can restart from the beginning.
    """

    def __init__(self, filePath):
        self.file = open(filePath, encoding='UTF8')

    def read(self, id):
        # id == 0 signals the start of a new pass over the corpus.
        if id == 0:
            self.file.seek(0)
        return self.file.readline()

# Drive word extraction over the sample input via the reader callback:
# positional args are min_cnt=1, max_word_len=10, min_score=0.25.
reader = ReaderExam(r'./server_project/test/input.txt')
print(kiwi.extractWords(reader.read, 1, 10, 0.25))
#kiwi.extract_add_words(reader.read, min_cnt = 1, max_word_len = 10, min_score = 0.25, pos_score = -3)