def test_add_rule():
    """A zero-score regex rule must not change the analysis score, while a
    rule with score=-1 lowers the score by one per matched ending."""
    kiwi = Kiwi()
    _, base_score = kiwi.analyze("했어요! 하잖아요! 할까요?")[0]

    # Rule with no penalty: the mapped forms analyze with the identical score.
    assert len(kiwi.add_re_rule("EF", r"요$", "용", score=0)) > 0
    _, mapped_score = kiwi.analyze("했어용! 하잖아용! 할까용?")[0]
    assert mapped_score == base_score

    # Fresh instance with a -1 penalty rule: three matches cost three points.
    kiwi = Kiwi()
    assert len(kiwi.add_re_rule("EF", r"요$", "용", score=-1)) > 0
    _, penalized_score = kiwi.analyze("했어용! 하잖아용! 할까용?")[0]
    assert penalized_score == base_score - 3
def test_analyze_single():
    """Analyze the first corpus line and touch every Token attribute.

    Fix: the corpus file handle was opened in the `for` header and never
    closed; a context manager now guarantees closure even if analysis raises.
    """
    kiwi = Kiwi()
    with open(curpath + '/test_corpus/constitution.txt', encoding='utf-8') as corpus:
        for line in corpus:
            toks, score = kiwi.analyze(line)[0]
            for t in toks:
                print(t.form, t.tag, t.start, t.end, t.len, t.id,
                      t.base_form, t.base_id)
            # Deliberately stop after the first line — one line is enough to
            # smoke-test the Token attribute surface.
            break
def index(request):
    """Django view: build a word cloud and top-9 keyword list from the
    local Chrome search history and render them into mainapp/index.html.

    Fixes: the sqlite connection was never closed (leaks the handle and
    keeps the copied DB locked on Windows), the matplotlib figure was never
    closed (grows memory per request), and fewer than 9 distinct words
    raised IndexError.
    """
    context = {}
    # Path to Chrome's History SQLite DB (Windows profile layout).
    homepath = os.path.expanduser("~")
    abs_chrome_path = os.path.join(homepath, 'AppData', 'Local', 'Google',
                                   'Chrome', 'User Data', 'Default', 'History')
    # Work on a copy: Chrome keeps the live DB locked while running.
    shutil.copyfile(abs_chrome_path, abs_chrome_path + "_sample")

    # Extract the searched terms from the copied DB.
    con = sqlite3.connect(abs_chrome_path + "_sample")
    try:
        cursor = con.cursor()
        cursor.execute("SELECT term FROM keyword_search_terms")
        term_data = cursor.fetchall()
    finally:
        con.close()  # always release the copy, even if the query fails

    # Morphological analysis: keep only nouns and foreign words.
    kiwi = Kiwi()
    kiwi.prepare()
    noun_tags = {'NNG', 'NNP', 'NNB', 'SL'}
    word_list = []
    for term in term_data:
        for word, tag, _, _ in kiwi.analyze(term[0], top_n=1)[0][0]:
            if tag in noun_tags:
                word_list.append(word)

    # Frequency count, most common first.
    counts = Counter(word_list)
    tags = counts.most_common()

    # Render the word cloud and save it for the template to display.
    mask = plt.imread("./static/images/mask.jpg")
    wc = WordCloud(font_path='./static/webfonts/NanumBarunGothicBold.ttf',
                   background_color='white', width=800, height=800, mask=mask)
    cloud = wc.generate_from_frequencies(dict(tags))
    plt.figure(figsize=(10, 8))
    plt.axis('off')
    plt.imshow(cloud, interpolation="bilinear")
    plt.savefig("./static/images/wordcloud_keyword.png", dpi=300,
                bbox_inches='tight')
    plt.close()  # don't leak one figure per request

    # Top 9 words (or fewer, if the history has fewer distinct words).
    top9_list = []
    for rank in range(min(9, len(tags))):
        top9_list.append({
            'rank': rank + 1,
            'word': tags[rank][0],
            'count': tags[rank][1],
        })
    context['top9'] = top9_list
    return render(request, 'mainapp/index.html', context)
def test_kiwi(environ):
    """Kiwi's best analysis must match the reference tagger token-for-token."""
    _, tagger = environ
    kiwi = Kiwi()
    kiwi.prepare()
    for _, line in EXAMPLES:
        reference = tagger.tagSentence(line)[0]
        candidate = kiwi.analyze(line)
        # Flatten both results to (surface form, tag) pairs before comparing.
        expected = [(m.surface, m.originalTag) for w in reference for m in w]
        actual = [m[:2] for m in candidate[0][0]]
        assert expected == actual
def test_stopwords():
    """Stopword filtering plus an add/remove membership round-trip.

    Fix: replaced the unidiomatic `(...) == True` / `(...) == False`
    comparisons with plain `in` / `not in` assertions.
    """
    kiwi = Kiwi()
    tokens, _ = kiwi.analyze('불용어 처리 테스트 중입니다 '
                             '우리는 강아지를 좋아한다 쟤도 강아지를 좋아한다 '
                             '지금은 2021년 11월이다.')[0]
    stopwords = Stopwords()
    print(set(tokens) - set(stopwords.filter(tokens)))

    # Custom stopword file, then toggle one entry in and out.
    filename = curpath + '/test_corpus/custom_stopwords.txt'
    stopwords = Stopwords(filename)
    stopwords.add(('강아지', 'NNP'))
    assert ('강아지', 'NNP') in stopwords
    stopwords.remove(('강아지', 'NNP'))
    assert ('강아지', 'NNP') not in stopwords
    print(set(tokens) - set(stopwords.filter(tokens)))
class kiwi_dictionary_n_fuction:
    """Wrapper around Kiwi with a user dictionary, exposing convenience
    accessors for nouns, verbs, full token lists and josa-stripped text."""

    def __init__(self, path):
        # Load the default dictionary, merge allomorphs, then add the
        # user dictionary at `path`.
        self.kiwi = Kiwi(options=Option.LOAD_DEFAULT_DICTIONARY | Option.INTEGRATE_ALLOMORPH)
        self.kiwi.load_user_dictionary(path)
        self.kiwi.prepare()
        # Josa (particle) POS tags, stripped by get_no_josa_token().
        self.josa = [
            'JK', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC'
        ]

    def get_noun(self, sen):
        # Noun tokens of the best analysis (also stored on the instance).
        _, self.nn_list, _, _ = self.generate_morp_word(sen, 1)
        return self.nn_list

    # Return the whole sentence as a list of space-joined token strings.
    def get_all_token(self, sen):
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        return morp_list

    # Tokenize the whole sentence and return it as a single string.
    def get_token_str(self, sen):
        morp_list, _, _, _ = self.generate_morp_word(sen, 1)
        # Each entry already ends with a space, so '' join keeps spacing.
        string = ''.join(morp_list)
        # if '\\' in self.string:
        #     self.string = self.string.translate({ord('\\'):'\\\\'})
        return string

    def get_vv(self, sen):
        # Verb/adjective tokens of the best analysis.
        _, _, vv_list, _ = self.generate_morp_word(sen, 1)
        return vv_list

    def get_nn_list(self, sen):
        # Same as get_noun() but without storing the result on the instance.
        _, nn_list, _, _ = self.generate_morp_word(sen, 1)
        return nn_list

    # Strip josa (particles) and return the remaining tokens as one string.
    # EX) 관찰 가능 하 고 처리 가능 하 ᆫ 범위 내 문장 입력 받 어 정해진 형태 출력 제한 되 ᆫ 시간 내 출력 하 어야 하 ᆫ다는 제약 적 용도 고려 하 ᆫ 관점 이 다 .
    def get_no_josa_token(self, sen):
        _, _, _, nosa_list = self.generate_morp_word(sen, 1)
        string = ''.join(nosa_list)
        return string

    # Return (form, tag) tuples,
    # e.g. [('관찰', 'NNG'), ('가능', 'NNG'), ('하', 'XSA'), ('고', 'EC'), ...]
    def k_pos(self, sentence):
        tuple_list = []
        result = self.kiwi.analyze(sentence, 1)
        for i in result[0][0]:
            word, pos = i[0], i[1]
            new_tuple = (word, pos)
            tuple_list.append(new_tuple)
        return tuple_list

    def k_analyze(self, sentence):
        # Raw top-1 Kiwi analysis result, unprocessed.
        return self.kiwi.analyze(sentence, 1)

    # Return just the surface forms as a list,
    # e.g. ['관찰', '가능', '하', '고', '처리', '가능', '하', ...]
    def k_morphs(self, sen):
        token_list = []
        result = self.kiwi.analyze(sen, 1)
        for i in result[0][0]:
            token_list.append(i[0])
        return token_list

    # Extract morphemes from a sentence.
    def generate_morp_word(self, sentence, analyze_num):
        """For each of the top `analyze_num` candidates, build four
        space-separated strings: all tokens, noun tokens, verb tokens,
        and non-josa tokens.

        Returns (morp_word_list, morp_nn_list, morp_vv_list, morp_not_josa_list).
        NOTE(review): on analysis failure this prints and implicitly returns
        None, which makes the tuple-unpacking callers above raise TypeError.
        """
        try:
            result = self.kiwi.analyze(sentence, analyze_num)
            morp_word_list = []
            morp_nn_list = []
            morp_vv_list = []
            morp_not_josa_list = []
            for i in range(0, analyze_num):
                morp_word = ''
                morp_nn = ''
                morp_vv = ''
                morp_not_josa = ''
                nn = []  # noun forms as a list; collected but never returned
                for word in result[i][0]:
                    morp_word += word[0]
                    morp_word += ' '
                    if word[1] not in self.josa:
                        morp_not_josa += word[0]
                        morp_not_josa += ' '
                    if word[1] in ['NNG', 'NNP', 'NNB', 'NP', 'NR', 'SL']:
                        morp_nn += word[0]
                        morp_nn += ' '
                        nn.append(word[0])
                    elif word[1] in ['VV', 'VA', 'VX', 'VCP', 'VCN']:
                        morp_vv += word[0]
                        morp_vv += ' '
                    else:
                        pass
                morp_word_list.append(morp_word)
                morp_nn_list.append(morp_nn)
                morp_vv_list.append(morp_vv)
                morp_not_josa_list.append(morp_not_josa)
            return morp_word_list, morp_nn_list, morp_vv_list, morp_not_josa_list
        except Exception as e:
            print(e)
            print("### ERROR 형태소 분석기 부분 에 뭐가 잘못된게 있는듯 ERROR ### ")

    def __del__(self):
        print("EXIT kiwi")
def test_bug_38():
    """Regression for issue #38: analysis must work with allomorph
    integration both enabled and disabled."""
    text = "이 예쁜 꽃은 독을 품었지만 진짜 아름다움을 가지고 있어요"
    for integrate in (True, False):
        kiwi = Kiwi(integrate_allomorph=integrate)
        print(kiwi.analyze(text))
def test_bug_33():
    """Regression for issue #33: a repeated user-added proper noun must
    analyze without crashing."""
    kiwi = Kiwi()
    kiwi.add_user_word('김갑갑', 'NNP')
    print(kiwi.analyze("김갑갑 김갑갑 김갑갑"))
def test_new_analyze_multi():
    """Stream a whole corpus file through Kiwi.analyze.

    Fix: the file object passed to analyze() was never closed; a context
    manager now closes it once iteration finishes.
    """
    kiwi = Kiwi()
    with open(curpath + '/test_corpus/constitution.txt', encoding='utf-8') as corpus:
        for res in kiwi.analyze(corpus):
            pass
def test_tweet():
    """Smoke test: hashtag/emoji-laden tweet text must be analyzable."""
    tweet = '''#바둑#장기#오목 귀요미#보드판🐥 #어린이임블리의 놀이였는데, 이제는 가물갸물🙄모르겠 장이요~멍이요~ㅎㅎㅎ다시 한 번 재미를 붙여 보까ㅎ 할 일이 태산인데😭, 하고 싶은건 무궁무진🤦♀️ 큰 일이다'''
    kiwi = Kiwi()
    kiwi.analyze(tweet)
class Changer(object):
    """Converts Korean sentences between formal and informal speech styles
    via token-level dictionary substitution (formaldic/informaldic/abnormaldic)
    followed by jamo-level (hgtk) post-processing."""

    def __init__(self):
        # NOTE(review): bare except silently hides any Kiwi init failure.
        try:
            self.kiwi = Kiwi()
            self.kiwi.prepare()
        except:
            print("[INFO] please install kiwipiepy ")
        self.replace = formaldic()
        self.utils = Utils()

    def dechanger(self, stc):
        """Change formal speech to informal.

        Args:
            stc (str): sentence to convert.
        """
        # The sentence is converted only if it contains a formal ending.
        pattern = r'하세요|이예요|이에요|에요|예요|시겠어요|죠|합니까|습니까'
        pattern = re.compile(pattern)
        result = []
        stc = self.utils._remove_blank(stc)
        stc = self.utils._clean_up_tokenization(stc)
        if len(re.findall(pattern, stc)) > 0:
            # Spaces are encoded as '|' so they survive morphological analysis.
            tokens = self.kiwi.analyze(stc.replace(" ", "|"))
            key = informaldic().keys()
            lk = list(key)
            key2 = abnormaldic().keys()
            ak = list(key2)
            tmp = []
            for token in tokens[0][0]:
                # Look up the (form, tag) pair in the informal dictionary,
                # then in the abnormal-forms dictionary.
                if token[:2] in lk:
                    token = informaldic().get(token[:2])
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                tmp.append(token)
            # Decompose every token into jamo; hgtk uses 'ᴥ' as the
            # syllable separator.
            changed = ''
            for t in tmp:
                if isinstance(t[0], tuple):
                    for i in range(len(t[0])):
                        changed += hgtk.text.decompose(t[i][0])
                else:
                    changed += hgtk.text.decompose(t[0])
            # A lone ㅂ/ㄴ/ㄹ between separators is an orphan final consonant
            # left by the substitution; merge or rewrite it.
            one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
            if one_char.search(changed):
                words = changed.split('ᴥ')
                for idx in range(1, len(words)):
                    # Previous syllable has no final consonant (2 jamo):
                    # attach the orphan consonant to it.
                    if len(words[idx]) == 1 and len(words[idx-1].replace('|', "")) == 2:
                        words[idx - 1] = words[idx-1] + words[idx]
                        words[idx] = ""
                    # Previous syllable already has a final consonant (3 jamo).
                    elif len(words[idx]) == 1 and len(words[idx-1].replace('|', "")) == 3:
                        shp = ['ㅆ', 'ㅍ', 'ㄱ', 'ㅄ', 'ㄶ']
                        ep = ['ㄹ']
                        if words[idx] == 'ㅂ' and len(words[idx - 1].replace('|', "")) == 3:
                            # ㅂ becomes 습 after these finals, else 입.
                            if words[idx - 1][-1] in shp:
                                if words[idx].count("|") > 0:
                                    words[idx] = "|습"
                                else:
                                    words[idx] = "습"
                                continue
                            else:
                                if words[idx].count("|") > 0:
                                    words[idx] = "|입"
                                else:
                                    words[idx] = "입"
                            # words[idx] = ""
                        elif words[idx] == 'ㄴ' and len(words[idx-1].replace('|', "")) == 3 and words[idx - 1].endswith('ㄹ'):
                            # ㄴ replaces a previous ㄹ final.
                            if words[idx-1].count("|") > 0:
                                words[idx - 1] = "|" + words[idx - 1].replace("|", "")[:2] + words[idx]
                            else:
                                words[idx - 1] = words[idx - 1][:2] + words[idx]
                            # delete the orphan
                            words[idx] = ""
                        elif words[idx] == 'ㄹ':
                            # ㄹ becomes the syllable 일.
                            if words[idx].count("|") > 0:
                                words[idx] = "|일"
                            else:
                                words[idx] = "일"
                # NOTE(review): `is not ""` relies on string interning;
                # should be `!= ""`.
                changed = "ᴥ".join([x for x in words if x is not ""]) + "ᴥ"
            # For cases which weren't covered,
            changed = self._makePretty(changed)
            changed = hgtk.text.compose(changed).replace("|", " ")
            # exception handling (empty string would raise on changed[-1])
            try:
                if changed[-1] == '요':
                    changed = re.sub('요', '', changed)
                changed = re.sub('그렇죠', '', changed)
            except:
                pass
            result.append(changed)
        else:
            try:
                result.append(stc)
            except:
                pass
        return result[0]

    def _makePretty(self, line):
        """Convert jamo orderings which weren't properly covered by the
        jamo restructuring process.

        :param line: jamo ordering which wasn't properly covered
        :return: converted jamo ordering
        """
        test = line
        # Hard-coded jamo-sequence repairs, e.g. 하 + 아 -> 해.
        test = test.replace("ᴥㅎㅏᴥㅇㅏᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏᴥㅇㅛᴥ", "ᴥㅎㅐᴥ")
        test = test.replace("ㅎㅏᴥㄴㅣᴥㄷㅏᴥ", "ㅎㅏㅂᴥㄴㅣᴥㄷㅏᴥ")
        test = test.replace("ㅎㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄴㅏᴥㅇㅏㅆᴥ", "ᴥㅎㅐㅆᴥ")
        test = test.replace("ㄱㅏᴥㅇㅏㅆᴥ", "ᴥㄱㅏㅆᴥ")
        test = test.replace("ㅇㅣᴥㄴㅣᴥ", "ᴥㄴㅣᴥ")
        test = test.replace("ㄴㅓㄹㄴᴥ", "ㄴㅓㄴᴥ")
        test = test.replace("ㄱㅡᴥㄹㅓㅎᴥㅇㅓᴥ", "ㄱㅡᴥㄹㅐᴥ")
        test = test.replace("ㅡᴥㅇㅏᴥ", "ㅏᴥ")
        test = test.replace("ㄱㅓㄹᴥㄴㅏᴥㅇㅛᴥ", "ㄱㅓㄴᴥㄱㅏᴥㅇㅛᴥ")
        return test

    def changer(self, text):
        """Change informal speech to formal speech.

        Args:
            text (str): sentence to convert.
        """
        # Spaces are encoded as '|' so they survive morphological analysis.
        tokens = self.kiwi.analyze(text.replace(" ", "|"))
        key = formaldic().keys()
        key2 = abnormaldic().keys()
        lk = list(key)
        ak = list(key2)
        num = len(tokens[0][0])
        result = []
        for idx, token in enumerate(tokens[0][0]):
            # Only the trailing ~20% of tokens (the sentence ending) are
            # converted to formal forms; the rest only get abnormal fixes.
            if idx > int(num*0.8):
                if token[:2] in lk:
                    # look up value by key
                    token = formaldic().get(token[:2])
                    result.append(token)
                else:
                    if token[:2] in ak:
                        token = abnormaldic().get(token[:2])
                        result.append(token)
                    else:
                        result.append(token[:2])
            else:
                if token[:2] in ak:
                    token = abnormaldic().get(token[:2])
                    result.append(token)
                else:
                    result.append(token[:2])
        # change tuple to text (jamo decomposition)
        changed = ''
        for t in result:
            if isinstance(t[0], tuple):
                for i in range(len(t[0])):
                    changed += hgtk.text.decompose(t[i][0])
            else:
                changed += hgtk.text.decompose(t[0])
        # Restructuring sentence from jamo ordering (same orphan-consonant
        # merge pass as in dechanger).
        one_char = re.compile('ᴥ[ㅂㄴㄹ]ᴥ')
        if one_char.search(changed):
            words = changed.split('ᴥ')
            for idx in range(1, len(words)):
                # Previous syllable has no final consonant (2 jamo):
                # attach the orphan consonant to it.
                if len(words[idx]) == 1 and len(words[idx-1].replace('|', "")) == 2:
                    words[idx - 1] = words[idx-1] + words[idx]
                    words[idx] = ""
                # Previous syllable already has a final consonant (3 jamo).
                elif len(words[idx]) == 1 and len(words[idx-1].replace('|', "")) == 3:
                    shp = ['ㅆ', 'ㅍ', 'ㄱ', 'ㅄ', 'ㄶ']
                    ep = ['ㄹ']
                    if words[idx] == 'ㅂ' and len(words[idx - 1].replace('|', "")) == 3:
                        if words[idx - 1][-1] in shp:
                            if words[idx].count("|") > 0:
                                words[idx] = "|습"
                            else:
                                words[idx] = "습"
                            continue
                        else:
                            if words[idx].count("|") > 0:
                                words[idx] = "|입"
                            else:
                                words[idx] = "입"
                        # words[idx] = ""
                    elif words[idx] == 'ㄴ' and len(words[idx-1].replace('|', "")) == 3 and words[idx - 1].endswith('ㄹ'):
                        if words[idx-1].count("|") > 0:
                            words[idx - 1] = "|" + words[idx - 1].replace("|", "")[:2] + words[idx]
                        else:
                            words[idx - 1] = words[idx - 1][:2] + words[idx]
                        # delete the orphan
                        words[idx] = ""
                    elif words[idx] == 'ㄹ':
                        if words[idx].count("|") > 0:
                            words[idx] = "|일"
                        else:
                            words[idx] = "일"
            # NOTE(review): `is not ""` relies on string interning;
            # should be `!= ""`.
            changed = "ᴥ".join([x for x in words if x is not ""]) + "ᴥ"
        # For cases which weren't covered,
        changed = self._makePretty(changed)
        changed = hgtk.text.compose(changed).replace("|", " ")
        return changed

    def addData(self, key, val):
        """Add new data to dictionary, changer dictionary update.

        :param key: key to be added into Dictionary self.replace
        :param val: Value to be added into Dictionary self.replace
        :return: None
        """
        # Rewrites dictionary.py in place: appends the new entry before the
        # closing brace of the literal at the end of the file.
        with open('dictionary.py', 'r', encoding='utf-8') as f:
            data = f.read()
        lines = data.split("\n")
        lines[-2] += ','
        lines[-1] = " " + str(key) + ": " + str(val)
        with open('dictionary.py', 'w', encoding='utf-8') as f:
            for i in range(len(lines)):
                f.write(lines[i] + "\n")
            f.write(" }")

    def checker(self, result):
        """Check for abnormal sentences and remove them.

        Args:
            result (list): sentences to check.
        Returns:
            (updated, idxes): kept sentences and indexes of removed ones.
        """
        updated = []
        idxes = []
        # A sentence is considered normal only if it ends with one of these.
        normal = ['요', '까', '다', '죠', '가']
        for idx, stc in enumerate(result):
            try:
                if stc[-1] not in normal:
                    print(f"[INFO] Abnormal Sentence, remove {idx}....")
                    idxes.append(idx)
                else:
                    updated.append(stc)
            except:
                # Empty strings raise IndexError on stc[-1]; drop them too.
                idxes.append(idx)
        return updated, idxes
kiwi.prepare()

# Input file: defaults to example.txt, overridable via the first CLI argument.
# Fix: the original assigned sys.argv[1] to an unused variable `file`, so the
# command-line argument was silently ignored.
file_path = 'example.txt'
if len(sys.argv) > 1 and sys.argv[1]:
    file_path = sys.argv[1]
# rsplit keeps dots inside directory names from breaking the split.
file_name, ext = file_path.rsplit('.', 1)

# Count only common/proper/bound nouns.
valid_token_set = {'NNG', 'NNP', 'NNB'}
print('=====')
with open(file_path) as f:
    text = f.read()

tokens = kiwi.analyze(text)[0][0]
result = Counter(
    (token[0], token[1]) for token in tokens if token[1] in valid_token_set
)

# Write the counts next to the input as <name>.csv.
# newline='' is required by the csv module to avoid blank lines on Windows.
output_path = '.'.join([file_name, 'csv'])
with open(output_path, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['형태소', '품사', 'count'])
    for key, value in result.items():
        writer.writerow([key[0], key[1], value])
print(file_path, '>>', output_path, '추출 완료.')
# Corpus file to tag.
rsc = r'E:\Programming\python\창회선배스터디\Morpheme_Cloud\자료\토지2.txt'
target_corpus = codecs.open(rsc, 'r', encoding='utf-8')

# Tag the text line by line.
tagged_temp = []
# with open(rsc, 'r', encoding="utf8") as kr_f:
#     for line in kr_f:
#         line = line.strip()
#         tagged_temp += flat(line)
for i in target_corpus:
    i = i.strip()
    # analyze() yields (token_list, score) candidates; keep only the token
    # lists, then format the single best candidate as "form/tag" strings.
    temp_tagging = [x[0] for x in tagger.analyze(i, top_n=1)]
    inner_temp = ["{}/{}".format(word, tag) for word, tag, score1, score2 in temp_tagging[0]]
    tagged_temp.append(tuple(inner_temp))
print(tagged_temp[:3])
tagged_list = []
# for i in tagged_temp:
#     if '/V' in i:
#         i = i.replace('/V','다/V')
#         tagged_list.append(tuple(i.split('/')))
#     elif '/A' in i:
#         i.replace('/A','다/A')
#         tagged_list.append (tuple (i.split ('/')))
#     else:
from kiwipiepy import Kiwi, __version__

# Minimal interactive loop: analyze each input line until EOF (Ctrl-D/Ctrl-Z).
print("kiwipiepy v{}".format(__version__))
kiwi = Kiwi()
while True:
    try:
        txt = input('>>')
    except EOFError:
        break
    print(kiwi.analyze(txt)[0])
        # Write one analyzed sentence as space-joined "form/tag" pairs.
        # (Tail of a method whose `def` line lies before this chunk.)
        self.output.write(' '.join(map(lambda x: x[0]+'/'+x[1], res[0][0])) + '\n')

    def __del__(self):
        # Close both handles when the handler is garbage-collected.
        self.input.close()
        self.output.close()


# Build a Kiwi instance with a user dictionary plus one extra proper noun.
kiwi = Kiwi()
kiwi.load_user_dictionary(r'./server_project/test/userDict.txt')
kiwi.add_user_word('iXVDR', 'NNP', 3.0)
kiwi.prepare()
# handle = IOHandler(r'./server_project/test/input.txt', r'./server_project/test/result.txt')
# kiwi.analyze(handle.read, handle.write)
result = kiwi.analyze('강남에서 먹었던 오늘의 스파게티는 맛있었다.', 1)
for i in result:
    print(i)


class ReaderExam:
    """Line reader callback for Kiwi's extractWords: id == 0 rewinds the
    file, then each call hands back the next line."""

    def __init__(self, filePath):
        self.file = open(filePath, encoding='UTF8')

    def read(self, id):
        # id 0 marks the start of a fresh pass over the input.
        if id == 0:
            self.file.seek(0)
        return self.file.readline()


reader = ReaderExam(r'./server_project/test/input.txt')
print(kiwi.extractWords(reader.read, 1, 10, 0.25))
#kiwi.extract_add_words(reader.read, min_cnt = 1, max_word_len = 10, min_score = 0.25, pos_score = -3)