def test_add_rule():
    """Check how a regex rule added for the "EF" tag affects analysis scores.

    A rule rewriting a trailing "요" to "용" is registered on two fresh
    analyzers. With ``score=0`` the rewritten text must score the same as
    the original text; with ``score=-1`` it must score exactly 3 lower
    (presumably one penalty per rewritten ending -- the text has three).
    """
    rule_args = ("EF", r"요$", "용")
    rewritten_text = "했어용! 하잖아용! 할까용?"

    # Baseline score of the unmodified text, then the neutral (score=0) rule.
    neutral = Kiwi()
    _, baseline_score = neutral.analyze("했어요! 하잖아요! 할까요?")[0]
    assert len(neutral.add_re_rule(*rule_args, score=0)) > 0
    _, neutral_score = neutral.analyze(rewritten_text)[0]
    assert neutral_score == baseline_score

    # Same rule with a penalty, on a fresh analyzer.
    penalized = Kiwi()
    assert len(penalized.add_re_rule(*rule_args, score=-1)) > 0
    _, penalized_score = penalized.analyze(rewritten_text)[0]
    assert penalized_score == baseline_score - 3
def test_add_pre_analyzed_word():
    """Check registration of a pre-analyzed word and the tokens it yields.

    The first registration attempt is expected to raise ``ValueError``
    (presumably because the morpheme "팅기"/VV is not known yet -- it is
    only added afterwards via ``add_user_word``). After the user word is
    added, registration with explicit (form, tag, start, end) tuples must
    succeed, and tokenizing "팅겼어..." must return exactly those spans
    followed by the "..." punctuation token.

    Raises:
        AssertionError: if the first registration does not raise
            ``ValueError``, or if any token differs from the expectation.
    """
    kiwi = Kiwi()
    # Result is not inspected; the call is kept so the analyzer has been
    # used once before any word is registered, as in the original test.
    kiwi.tokenize("팅겼어")

    try:
        kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV"), "었/EP", "어/EF"])
    except ValueError:
        pass
    except Exception as err:
        # Any other exception type is a test failure; keep the cause visible.
        raise AssertionError("expected to raise `ValueError`") from err
    else:
        raise AssertionError("expected to raise `ValueError`")

    kiwi.add_user_word("팅기", "VV", orig_word="튕기")
    kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV", 0, 2), ("었", "EP", 1, 2), ("어", "EF", 2, 3)])

    res = kiwi.tokenize("팅겼어...")
    expected = [
        ("팅기", "VV", 0, 2),
        ("었", "EP", 1, 2),
        ("어", "EF", 2, 3),
        ("...", "SF", 3, 6),
    ]
    for idx, (form, tag, start, end) in enumerate(expected):
        token = res[idx]
        assert token.form == form and token.tag == tag and token.start == start and token.end == end
def test_tokenize_with_stopwords():
    """Tokenize with a default ``Stopwords`` filter and check the first two remaining forms."""
    analyzer = Kiwi()
    stopword_filter = Stopwords()
    remaining = analyzer.tokenize(
        "[^^ 우리는 강아지를 좋아한다.]",
        stopwords=stopword_filter,
    )
    # Only the first two surviving tokens are checked.
    assert remaining[0].form == '강아지'
    assert remaining[1].form == '좋아하'
def test_perform():
    """Run ``Kiwi.perform`` on the constitution corpus and print only the first result."""
    analyzer = Kiwi()
    corpus_reader = FileReader(curpath + '/test_corpus/constitution.txt')
    results = analyzer.perform(corpus_reader, min_cnt=2)
    for first_result in results:
        print(first_result)
        # One result is enough for this smoke test.
        break
def test_property():
    """Print, reassign, and print again two analyzer properties.

    ``integrate_allomorph`` is set to ``False`` and ``cutoff_threshold`` to
    ``1``; each value is printed before and after the assignment.
    """
    analyzer = Kiwi()
    updates = (
        ("integrate_allomorph", False),
        ("cutoff_threshold", 1),
    )
    for prop_name, new_value in updates:
        print(getattr(analyzer, prop_name))
        setattr(analyzer, prop_name, new_value)
        print(getattr(analyzer, prop_name))
def test_tokenize():
    """Print tokenization of a sample review, flat and then grouped by sentence."""
    sample = "다녀온 후기\n\n<강남 토끼정에 다녀왔습니다.> 음식도 맛있었어요 다만 역시 토끼정 본점 답죠?ㅎㅅㅎ 그 맛이 크으.. 아주 맛있었음...! ^^"
    analyzer = Kiwi()

    # Flat token list for the whole text.
    print(analyzer.tokenize(sample, normalize_coda=True))

    # One token list per sentence.
    per_sentence = analyzer.tokenize(sample, normalize_coda=True, split_sents=True)
    for sentence_tokens in per_sentence:
        print(sentence_tokens)
def __init__(self):
    """Create the analyzer and the helper objects used by this instance.

    Analyzer setup is best-effort: if constructing or preparing ``Kiwi``
    fails, a hint is printed and ``self.kiwi`` may be left unset.
    ``self.replace`` and ``self.utils`` are always created afterwards.
    """
    try:
        self.kiwi = Kiwi()
        self.kiwi.prepare()
    except Exception:
        # Catch ordinary failures only; KeyboardInterrupt/SystemExit must
        # still propagate instead of being reported as a missing package.
        print("[INFO] please install kiwipiepy ")
    self.replace = formaldic()
    self.utils = Utils()
def test_analyze_single():
    """Analyze the constitution corpus line by line and print one token's fields.

    Each line is passed to ``Kiwi.analyze`` individually and the top
    candidate's tokens are kept; afterwards the fields of the first token
    of the last analyzed line are printed.

    NOTE(review): the loop layout (print loop after, not inside, the line
    loop) was reconstructed from whitespace-collapsed source -- confirm.
    """
    kiwi = Kiwi()
    # Use a context manager so the corpus file is closed deterministically.
    with open(curpath + '/test_corpus/constitution.txt', encoding='utf-8') as corpus:
        for line in corpus:
            toks, score = kiwi.analyze(line)[0]
    for t in toks:
        print(t.form, t.tag, t.start, t.end, t.len, t.id, t.base_form, t.base_id)
        # Printing a single token is enough to exercise the attributes.
        break
def __init__(self, path):
    """Build a Kiwi analyzer with a user dictionary and store a list of tag codes.

    Args:
        path: Location of the user dictionary passed to
            ``load_user_dictionary``.
    """
    analyzer_options = Option.LOAD_DEFAULT_DICTIONARY | Option.INTEGRATE_ALLOMORPH
    self.kiwi = Kiwi(options=analyzer_options)
    self.kiwi.load_user_dictionary(path)
    self.kiwi.prepare()
    # Tag codes kept on the instance (presumably the particle/josa tags).
    self.josa = 'JK JKS JKC JKG JKO JKB JKV JKQ JX JC'.split()
def index(request):
    """Render the main page from the local Chrome search-keyword history.

    Steps: copy Chrome's ``History`` database, read every ``term`` from
    ``keyword_search_terms``, keep morphemes tagged NNG/NNP/NNB/SL, save a
    word-cloud image of their frequencies, and pass the most frequent words
    (at most nine) to the template as ``context['top9']``.

    Args:
        request: The incoming request object forwarded to ``render``.

    Returns:
        The response produced by rendering ``mainapp/index.html``.
    """
    context = {}

    # Build the path to the Chrome History file under the user's home directory.
    homepath = os.path.expanduser("~")
    abs_chrome_path = os.path.join(homepath, 'AppData', 'Local', 'Google', 'Chrome', 'User Data', 'Default', 'History')
    sample_path = abs_chrome_path + "_sample"

    # Copy the History file and query the copy rather than the live file.
    shutil.copyfile(abs_chrome_path, sample_path)

    # Extract the search terms; always release the connection afterwards.
    con = sqlite3.connect(sample_path)
    try:
        cursor = con.cursor()
        cursor.execute("SELECT term FROM keyword_search_terms")
        term_data = cursor.fetchall()
    finally:
        con.close()

    # Morphological analysis: keep only morphemes with the selected tags.
    kiwi = Kiwi()
    kiwi.prepare()
    kept_tags = ('NNG', 'NNP', 'NNB', 'SL')
    word_list = []
    for term in term_data:
        for word, tag, _, _ in kiwi.analyze(term[0], top_n=1)[0][0]:
            if tag in kept_tags:
                word_list.append(word)

    # Frequency count, most common first.
    counts = Counter(word_list)
    tags = counts.most_common()

    # Word cloud image.
    mask = plt.imread("./static/images/mask.jpg")
    wc = WordCloud(font_path='./static/webfonts/NanumBarunGothicBold.ttf', background_color='white', width=800, height=800, mask=mask)
    cloud = wc.generate_from_frequencies(dict(tags))
    figure = plt.figure(figsize=(10, 8))
    try:
        plt.axis('off')
        plt.imshow(cloud, interpolation="bilinear")
        plt.savefig("./static/images/wordcloud_keyword.png", dpi=300, bbox_inches='tight')
    finally:
        # Without this, every request would leave another open figure behind.
        plt.close(figure)

    # Top 9 words; slicing avoids an IndexError when fewer than 9 exist.
    context['top9'] = [
        {'rank': rank, 'word': word, 'count': count}
        for rank, (word, count) in enumerate(tags[:9], start=1)
    ]

    return render(request, 'mainapp/index.html', context)
def test_split_into_sents():
    """Split a sample review into sentences and compare each sentence text."""
    sample = "다녀온 후기\n\n<강남 토끼정에 다녀왔습니다.> 음식도 맛있었어요 다만 역시 토끼정 본점 답죠?ㅎㅅㅎ 그 맛이 크으.. 아주 맛있었음...! ^^"
    expected_texts = (
        "다녀온 후기",
        "<강남 토끼정에 다녀왔습니다.>",
        "음식도 맛있었어요",
        "다만 역시 토끼정 본점 답죠?ㅎㅅㅎ",
        "그 맛이 크으..",
        "아주 맛있었음...! ^^",
    )

    analyzer = Kiwi()
    sents = analyzer.split_into_sents(sample, normalize_coda=True)

    assert len(sents) == 6
    for position, expected in enumerate(expected_texts):
        assert sents[position].text == expected
def test_kiwi(environ):
    """Compare Kiwi's output with the reference tagger for every example line.

    Args:
        environ: A two-item value whose second item is the reference
            tagger (the first item is ignored).
    """
    _ignored, tagger = environ
    analyzer = Kiwi()
    analyzer.prepare()

    for _, sentence in EXAMPLES:
        reference_words = tagger.tagSentence(sentence)[0]
        candidates = analyzer.analyze(sentence)

        # Reduce both outputs to (surface form, tag) pairs before comparing.
        reference_pairs = [
            (morph.surface, morph.originalTag)
            for word in reference_words
            for morph in word
        ]
        kiwi_pairs = [morph[:2] for morph in candidates[0][0]]
        assert reference_pairs == kiwi_pairs
def prepare_kiwi(train_file):
    """Build a prepared Kiwi model after extracting words from a corpus.

    Args:
        train_file: The training file (corpus) handed to ``ReaderExam``.

    Returns:
        The ``Kiwi`` instance after ``extractWords`` and ``prepare``.
    """
    model = Kiwi(4)  # constructed with a thread count of 4
    corpus_reader = ReaderExam(train_file)
    # Positional arguments after the reader callback:
    # minimum count = 5, maximum word length = 6, minimum score = 0.25.
    model.extractWords(corpus_reader.read, 5, 6, 0.25)
    model.prepare()
    return model
def test_stopwords():
    """Exercise ``Stopwords`` filtering plus add/remove/membership on a custom list."""
    analyzer = Kiwi()
    tokens, _ = analyzer.analyze('불용어 처리 테스트 중입니다 '
                                 '우리는 강아지를 좋아한다 쟤도 강아지를 좋아한다 '
                                 '지금은 2021년 11월이다.')[0]

    # Tokens removed by the default stopword list.
    default_list = Stopwords()
    print(set(tokens) - set(default_list.filter(tokens)))

    # Custom list loaded from file: add an entry, check it, remove it, check again.
    custom_list = Stopwords(curpath + '/test_corpus/custom_stopwords.txt')
    entry = ('강아지', 'NNP')
    custom_list.add(entry)
    assert entry in custom_list
    custom_list.remove(entry)
    assert entry not in custom_list

    # Tokens removed by the custom stopword list.
    print(set(tokens) - set(custom_list.filter(tokens)))
def test_extract_words():
    """Run ``extract_words`` on the constitution corpus with ``min_cnt=2`` and print the result."""
    analyzer = Kiwi()
    corpus_reader = FileReader(curpath + '/test_corpus/constitution.txt')
    extracted = analyzer.extract_words(corpus_reader, min_cnt=2)
    print(extracted)
def __init__(self):
    """Import kiwipiepy lazily, report its version on stderr, and create the model."""
    # Imported here rather than at module level, as in the original.
    import kiwipiepy
    from kiwipiepy import Kiwi

    version = kiwipiepy.__version__
    banner = "Initialize kiwipiepy ({})".format(version)
    print(banner, file=sys.stderr)

    self._mdl = Kiwi()
from kiwipiepy import Kiwi, Option
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import codecs

# Analyzer used by the tagging loop below.
tagger = Kiwi()
tagger.prepare()

rsc = r'E:\Programming\python\창회선배스터디\Morpheme_Cloud\자료\토지2.txt'

# Text tagging: each corpus line becomes a tuple of "form/tag" strings.
tagged_temp = []
# The context manager closes the corpus file once every line has been read.
with codecs.open(rsc, 'r', encoding='utf-8') as target_corpus:
    for i in target_corpus:
        i = i.strip()
        # Keep only the token list of each returned candidate (top_n=1).
        temp_tagging = [x[0] for x in tagger.analyze(i, top_n=1)]
        inner_temp = ["{}/{}".format(word, tag) for word, tag, score1, score2 in temp_tagging[0]]
        tagged_temp.append(tuple(inner_temp))

print(tagged_temp[:3])
import openpyxl
import re
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi, Option

# Workbook path (an earlier assignment differing only in extension case was
# immediately overwritten by this one, so only the effective value is kept).
file_Name = r"C:\Users\6408284\Desktop\투자비시스템개선\WBS표준화\2013 - 2019.XLSX"

kiwi = Kiwi(num_workers=0)


class ReaderExam:
    """Line-by-line reader over a text file, driven by an integer id.

    The file is opened in the constructor and stays open for the lifetime
    of the object; nothing in this class closes it.
    """

    def __init__(self, filePath):
        """Open ``filePath`` for reading with the platform default encoding."""
        self.file = open(filePath)

    def read(self, id):
        """Return the next line; an ``id`` of 0 first rewinds to the start."""
        if id == 0:
            self.file.seek(0)
        return self.file.readline()


def dict_test():
    """Load the user dictionary and analyze every entry read from ``test.txt``.

    NOTE(review): in the visible code the analysis results are unpacked but
    not used, and ``result_data`` is never filled -- confirm whether the
    function is complete.
    """
    kiwi.load_user_dictionary('user_dic.txt')
    kiwi.prepare()
    # ``np.str`` was only an alias of the builtin ``str`` and has been removed
    # from NumPy; the builtin works on every NumPy version.
    data = np.loadtxt('test.txt', dtype=str, delimiter="\n")
    data_list = data.tolist()
    result_data = list()
    for result in kiwi.analyze(data_list, top_n=1):
        # First (best) candidate: its token list and score.
        tokens, score = result[0]
data.head(2) data['token'].isnull().sum() # overview에서 Null 값을 가진 경우에는 값 제거 data['token'] = data['token'].fillna('') #%% 형태소 분석이 끝난 후의 질문들로 학습 벡터 생성 from sklearn.feature_extraction.text import TfidfVectorizer tfidf = TfidfVectorizer() tfidf_matrix = tfidf.fit_transform(data['token']) # overview에 대해서 tf-idf 수행 print(tfidf_matrix.shape) # %% from kiwipiepy import Kiwi kiwi = Kiwi() kiwi.load_user_dictionary(r'./userDict.txt') kiwi.prepare() def generate_morp_word(sentence,analyze_num): try: result = kiwi.analyze(sentence, analyze_num) print(result) morp_word_list =[] morp_nn_list=[] morp_vv_list=[] for i in range(0, analyze_num): morp_word = '' morp_nn='' morp_vv='' #print(i)
def test_bug_38():
    """Print the analysis of one sentence with ``integrate_allomorph`` on, then off."""
    sentence = "이 예쁜 꽃은 독을 품었지만 진짜 아름다움을 가지고 있어요"
    for integrate in (True, False):
        analyzer = Kiwi(integrate_allomorph=integrate)
        print(analyzer.analyze(sentence))
def test_bug_33():
    """Register a user word as NNP and print the analysis of a text repeating it three times."""
    analyzer = Kiwi()
    analyzer.add_user_word('김갑갑', 'NNP')
    analysis = analyzer.analyze("김갑갑 김갑갑 김갑갑")
    print(analysis)
def test_new_analyze_multi():
    """Pass an open corpus file to ``Kiwi.analyze`` and consume every result.

    The results are iterated inside the ``with`` block so the file is still
    open while they are produced, and is closed once the loop finishes.
    """
    kiwi = Kiwi()
    with open(curpath + '/test_corpus/constitution.txt', encoding='utf-8') as corpus:
        for res in kiwi.analyze(corpus):
            pass
def test_tweet():
    """Analyze a tweet-style text with hashtags and emoji; the result is discarded."""
    tweet = '''#바둑#장기#오목 귀요미#보드판🐥 #어린이임블리의 놀이였는데, 이제는 가물갸물🙄모르겠 장이요~멍이요~ㅎㅎㅎ다시 한 번 재미를 붙여 보까ㅎ 할 일이 태산인데😭, 하고 싶은건 무궁무진🤦♀️ 큰 일이다'''
    analyzer = Kiwi()
    # Only checks that analysis completes without raising.
    analyzer.analyze(tweet)