예제 #1
0
def test_add_rule():
    """Check how the `score` of a regex rule shows up in the analysis score.

    A baseline score is taken from the sentence ending in "요". A rule that
    rewrites a trailing "요" of "EF" morphemes to "용" is then registered:
    with ``score=0`` the rewritten sentence must score exactly like the
    baseline, and with ``score=-1`` (on a fresh analyzer) it must score
    the baseline minus 3.
    """
    kiwi = Kiwi()
    baseline = kiwi.analyze("했어요! 하잖아요! 할까요?")[0]
    oscore = baseline[1]

    # Rule with no score adjustment: the variant must match the baseline.
    added = kiwi.add_re_rule("EF", r"요$", "용", score=0)
    assert len(added) > 0
    variant = kiwi.analyze("했어용! 하잖아용! 할까용?")[0]
    assert variant[1] == oscore

    # Same rule on a new analyzer, this time with a score of -1.
    kiwi = Kiwi()
    added = kiwi.add_re_rule("EF", r"요$", "용", score=-1)
    assert len(added) > 0
    variant = kiwi.analyze("했어용! 하잖아용! 할까용?")[0]
    assert variant[1] == oscore - 3
예제 #2
0
def test_add_pre_analyzed_word():
    """Check registration and use of a pre-analyzed word.

    First, registering the analysis before the morpheme "팅기/VV" has been
    added as a user word must raise ``ValueError``. After adding that user
    word, the registration must succeed and tokenizing "팅겼어..." must
    return the registered morphemes with the given start/end offsets,
    followed by the "..." token.

    Raises:
        AssertionError: if the first registration does not raise
            ``ValueError`` or any token differs from the expectation.
    """
    kiwi = Kiwi()
    # Tokenize once before anything is registered; the result is not inspected.
    kiwi.tokenize("팅겼어")

    try:
        kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV"), "었/EP", "어/EF"])
    except ValueError:
        pass
    except Exception as exc:
        # Any other ordinary error is the wrong failure mode. Interpreter-level
        # signals (KeyboardInterrupt, SystemExit) are deliberately not caught.
        raise AssertionError("expected to raise `ValueError`") from exc
    else:
        raise AssertionError("expected to raise `ValueError`")

    kiwi.add_user_word("팅기", "VV", orig_word="튕기")
    kiwi.add_pre_analyzed_word("팅겼어", [("팅기", "VV", 0, 2), ("었", "EP", 1, 2),
                                       ("어", "EF", 2, 3)])

    res = kiwi.tokenize("팅겼어...")

    # (form, tag, start, end) expected for the first four tokens.
    expected = [
        ("팅기", "VV", 0, 2),
        ("었", "EP", 1, 2),
        ("어", "EF", 2, 3),
        ("...", "SF", 3, 6),
    ]
    assert len(res) >= len(expected)
    for token, (form, tag, start, end) in zip(res, expected):
        assert token.form == form and token.tag == tag
        assert token.start == start and token.end == end
예제 #3
0
def test_tokenize_with_stopwords():
    """Tokenize with a default `Stopwords` filter and check what remains.

    The first two tokens that survive filtering must have the forms
    '강아지' and '좋아하', in that order.
    """
    kiwi = Kiwi()
    stopword_filter = Stopwords()
    remaining = kiwi.tokenize("[^^ 우리는 강아지를 좋아한다.]",
                              stopwords=stopword_filter)

    for position, expected_form in enumerate(('강아지', '좋아하')):
        assert remaining[position].form == expected_form
예제 #4
0
def test_perform():
    """Run `Kiwi.perform` on the constitution corpus and print its first result.

    Only the first yielded item is printed; iteration stops right after it.
    """
    kiwi = Kiwi()
    corpus_reader = FileReader(curpath + '/test_corpus/constitution.txt')
    results = kiwi.perform(corpus_reader, min_cnt=2)
    for first_result in results:
        print(first_result)
        break
예제 #5
0
def test_property():
    """Print, reassign and print again two settable `Kiwi` properties.

    `integrate_allomorph` is set to ``False`` and `cutoff_threshold` to ``1``;
    each value is printed before and after the assignment.
    """
    kiwi = Kiwi()
    updates = (
        ("integrate_allomorph", False),
        ("cutoff_threshold", 1),
    )
    for attribute, new_value in updates:
        print(getattr(kiwi, attribute))
        setattr(kiwi, attribute, new_value)
        print(getattr(kiwi, attribute))
예제 #6
0
def test_tokenize():
    """Print the tokenization of a review text, flat and per sentence.

    Both calls pass ``normalize_coda=True``; the second also passes
    ``split_sents=True`` and prints each returned group on its own line.
    """
    kiwi = Kiwi()
    text = "다녀온 후기\n\n<강남 토끼정에 다녀왔습니다.> 음식도 맛있었어요 다만 역시 토끼정 본점 답죠?ㅎㅅㅎ 그 맛이 크으.. 아주 맛있었음...! ^^"

    flat_tokens = kiwi.tokenize(text, normalize_coda=True)
    print(flat_tokens)

    grouped = kiwi.tokenize(text, normalize_coda=True, split_sents=True)
    for sentence_tokens in grouped:
        print(sentence_tokens)
예제 #7
0
파일: changer.py 프로젝트: kosohae/AIpjt-1
 def __init__(self):
     """Set up the Kiwi analyzer and the helper objects of this instance.

     Creating and preparing the analyzer is best-effort: if it fails, a hint
     is printed and initialisation continues, in which case ``self.kiwi`` may
     be left unset.
     """
     try:
         self.kiwi = Kiwi()
         self.kiwi.prepare()
     except Exception:
         # Only ordinary errors are treated as "kiwipiepy unavailable";
         # KeyboardInterrupt and SystemExit are allowed to propagate.
         print("[INFO] please install kiwipiepy   ")

     self.replace = formaldic()
     self.utils = Utils()
예제 #8
0
def test_analyze_single():
    """Analyze the corpus line by line and print the first token of the last line.

    Every line of the constitution corpus is analyzed one at a time. After the
    loop, the attributes of the first token of the last analyzed line are
    printed. Nothing is printed if the corpus has no lines.
    """
    kiwi = Kiwi()
    toks = []  # stays empty when the corpus has no lines
    # The context manager closes the corpus file once every line is analyzed.
    with open(curpath + '/test_corpus/constitution.txt',
              encoding='utf-8') as corpus:
        for line in corpus:
            toks, _ = kiwi.analyze(line)[0]
    for t in toks:
        print(t.form, t.tag, t.start, t.end, t.len, t.id, t.base_form,
              t.base_id)
        break
예제 #9
0
    def __init__(self, path):
        """Build a Kiwi analyzer with a user dictionary and store a tag list.

        Args:
            path: location of the user dictionary handed to
                `Kiwi.load_user_dictionary`.
        """
        analyzer_options = (Option.LOAD_DEFAULT_DICTIONARY
                            | Option.INTEGRATE_ALLOMORPH)
        self.kiwi = Kiwi(options=analyzer_options)
        self.kiwi.load_user_dictionary(path)
        self.kiwi.prepare()

        # Tag names kept for later use; presumably the particle ("josa")
        # POS tags -- confirm against the code that reads `self.josa`.
        self.josa = [
            'JK',
            'JKS',
            'JKC',
            'JKG',
            'JKO',
            'JKB',
            'JKV',
            'JKQ',
            'JX',
            'JC',
        ]
예제 #10
0
def index(request):
    """Render the index page with a word cloud and the top search words.

    The view copies the Chrome ``History`` database of the current user,
    reads every search term from the copy, extracts words tagged NNG, NNP,
    NNB or SL with Kiwi, writes a word-cloud image of their frequencies and
    puts the (up to) nine most frequent words into the template context.

    Args:
        request: the request object forwarded to ``render``.

    Returns:
        The result of rendering ``mainapp/index.html`` with a context whose
        ``'top9'`` entry is a list of ``{'rank', 'word', 'count'}`` dicts.
    """
    context = {}

    # Build the path of the History file.
    homepath = os.path.expanduser("~")
    abs_chrome_path = os.path.join(homepath, 'AppData', 'Local', 'Google', 'Chrome', 'User Data', 'Default', 'History')
    sample_path = abs_chrome_path + "_sample"
    # Work on a copy of the History file (presumably so the live file is
    # never opened directly -- confirm).
    shutil.copyfile(abs_chrome_path, sample_path)

    # Read the search terms from the copy; always release the connection.
    con = sqlite3.connect(sample_path)
    try:
        cursor = con.cursor()
        cursor.execute("SELECT term FROM keyword_search_terms")
        term_data = cursor.fetchall()
    finally:
        con.close()

    # Morphological analysis: keep only the words with the wanted tags.
    kiwi = Kiwi()
    kiwi.prepare()
    wanted_tags = ('NNG', 'NNP', 'NNB', 'SL')
    word_list = []
    for term in term_data:
        for word, tag, _, _ in kiwi.analyze(term[0], top_n=1)[0][0]:
            if tag in wanted_tags:
                word_list.append(word)

    # Frequency of every kept word, most frequent first.
    counts = Counter(word_list)
    tags = counts.most_common()

    # Word cloud. Skipped when there are no words, because an empty
    # frequency table cannot be plotted.
    if tags:
        mask = plt.imread("./static/images/mask.jpg")
        wc = WordCloud(font_path='./static/webfonts/NanumBarunGothicBold.ttf',
                       background_color='white',
                       width=800,
                       height=800,
                       mask=mask)

        cloud = wc.generate_from_frequencies(dict(tags))
        fig = plt.figure(figsize=(10, 8))
        try:
            plt.axis('off')
            plt.imshow(cloud, interpolation="bilinear")
            plt.savefig("./static/images/wordcloud_keyword.png", dpi=300, bbox_inches='tight')
        finally:
            # pyplot keeps figures alive until closed; close it so repeated
            # requests do not accumulate figures.
            plt.close(fig)

    # Top 9 words. Slicing copes with fewer than nine distinct words.
    context['top9'] = [
        {'rank': rank, 'word': word, 'count': count}
        for rank, (word, count) in enumerate(tags[:9], start=1)
    ]

    return render(request, 'mainapp/index.html', context)
예제 #11
0
def test_split_into_sents():
    """Split a review text into sentences and compare them with the expectation.

    With ``normalize_coda=True`` the text must yield exactly six sentences
    whose ``text`` attributes match the expected strings in order.
    """
    kiwi = Kiwi()
    text = "다녀온 후기\n\n<강남 토끼정에 다녀왔습니다.> 음식도 맛있었어요 다만 역시 토끼정 본점 답죠?ㅎㅅㅎ 그 맛이 크으.. 아주 맛있었음...! ^^"
    expected_texts = (
        "다녀온 후기",
        "<강남 토끼정에 다녀왔습니다.>",
        "음식도 맛있었어요",
        "다만 역시 토끼정 본점 답죠?ㅎㅅㅎ",
        "그 맛이 크으..",
        "아주 맛있었음...! ^^",
    )

    sents = kiwi.split_into_sents(text, normalize_coda=True)
    assert len(sents) == 6

    for position, expected in enumerate(expected_texts):
        assert sents[position].text == expected
예제 #12
0
def test_kiwi(environ):
    """Compare the tagger from `environ` with a direct Kiwi analysis.

    For every example line, the (surface, originalTag) pairs produced by the
    tagger must equal the first two fields of each morpheme in Kiwi's top
    analysis.

    Args:
        environ: a two-item value whose second item is the tagger under test.
    """
    _, tagger = environ
    kiwi = Kiwi()
    kiwi.prepare()

    for _, line in EXAMPLES:
        tagged_words = tagger.tagSentence(line)[0]
        analysis = kiwi.analyze(line)

        from_tagger = []
        for word in tagged_words:
            for morph in word:
                from_tagger.append((morph.surface, morph.originalTag))

        top_morphemes = analysis[0][0]
        from_kiwi = [morph[:2] for morph in top_morphemes]

        assert from_tagger == from_kiwi
예제 #13
0
def prepare_kiwi(train_file):
    """Create a Kiwi model, run word extraction on a corpus and prepare it.

    Args:
        train_file: the training file (corpus) handed to ``ReaderExam``.

    Returns:
        The prepared ``Kiwi`` instance.
    """
    worker_count = 4
    min_count, max_word_length, min_score = 5, 6, 0.25

    model = Kiwi(worker_count)
    corpus_reader = ReaderExam(train_file)
    model.extractWords(corpus_reader.read, min_count, max_word_length,
                       min_score)
    model.prepare()
    return model
예제 #14
0
def test_stopwords():
    """Exercise `Stopwords` filtering plus adding and removing an entry.

    The tokens removed by the default stopword list are printed first. A
    custom list is then loaded from file, ('강아지', 'NNP') is added and
    removed with a membership check after each step, and the tokens removed
    by the custom list are printed.
    """
    kiwi = Kiwi()
    analysis = kiwi.analyze('불용어 처리 테스트 중입니다 '
                            '우리는 강아지를 좋아한다 쟤도 강아지를 좋아한다 '
                            '지금은 2021년 11월이다.')
    tokens, _ = analysis[0]

    stopwords = Stopwords()
    kept = stopwords.filter(tokens)
    print(set(tokens) - set(kept))

    filename = curpath + '/test_corpus/custom_stopwords.txt'
    stopwords = Stopwords(filename)
    entry = ('강아지', 'NNP')

    stopwords.add(entry)
    assert entry in stopwords

    stopwords.remove(entry)
    assert entry not in stopwords

    kept = stopwords.filter(tokens)
    print(set(tokens) - set(kept))
예제 #15
0
def test_extract_words():
    """Run `Kiwi.extract_words` on the constitution corpus and print the result."""
    kiwi = Kiwi()
    corpus_reader = FileReader(curpath + '/test_corpus/constitution.txt')
    extracted = kiwi.extract_words(corpus_reader, min_cnt=2)
    print(extracted)
예제 #16
0
 def __init__(self):
     """Import kiwipiepy, report its version on stderr and create the model."""
     import kiwipiepy
     from kiwipiepy import Kiwi

     banner = "Initialize kiwipiepy ({})".format(kiwipiepy.__version__)
     print(banner, file=sys.stderr)
     self._mdl = Kiwi()
예제 #17
0
from kiwipiepy import Kiwi, Option
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import codecs

# Script: tag every line of a text file with Kiwi and collect, per line, a
# tuple of "form/tag" strings.
tagger = Kiwi()
tagger.prepare()

# def flat(content):
#     return ["{}/{}".format(word,tag) for word, tag in tagger.pos(content)

rsc = r'E:\Programming\python\창회선배스터디\Morpheme_Cloud\자료\토지2.txt'

# Text tagging: one tuple of "form/tag" strings per input line.
tagged_temp = []

# with open(rsc, 'r', encoding="utf8") as kr_f:
#     for line in kr_f:
#         line = line.strip()
#         tagged_temp += flat(line)

# The context manager closes the corpus file once every line is processed.
with codecs.open(rsc, 'r', encoding='utf-8') as target_corpus:
    for i in target_corpus:
        i = i.strip()
        # Keep only the token list of each returned analysis (drop the score).
        temp_tagging = [x[0] for x in tagger.analyze(i, top_n=1)]
        # Each token unpacks into four fields; only the first two are used.
        inner_temp = ["{}/{}".format(word, tag) for word, tag, score1, score2 in temp_tagging[0]]
        tagged_temp.append(tuple(inner_temp))

print(tagged_temp[:3])
예제 #18
0
파일: Preproc.py 프로젝트: sourcreams/InvDB
import openpyxl
import re
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi, Option

# Path of the workbook to process. The file previously assigned this name
# twice in a row (once with a lower-case ".xlsx" suffix); only the last
# assignment ever took effect, so only that one is kept.
file_Name = r"C:\Users\6408284\Desktop\투자비시스템개선\WBS표준화\2013 - 2019.XLSX"
# Module-level analyzer shared by the functions below.
kiwi = Kiwi(num_workers=0)


class ReaderExam:
    """Line-by-line reader over a text file, driven by a numeric id.

    The file is opened on construction and kept in ``self.file``.
    """

    def __init__(self, filePath):
        """Open `filePath` for reading with the default text settings."""
        self.file = open(filePath)

    def read(self, id):
        """Return the next line of the file.

        Args:
            id: when equal to 0 the file is rewound to its beginning before
                reading; any other value just continues from the current
                position.

        Returns:
            The line read, or an empty string at end of file.
        """
        restart = (id == 0)
        if restart:
            self.file.seek(0)
        next_line = self.file.readline()
        return next_line


def dict_test():
    """Load the user dictionary and analyze every line of ``test.txt``.

    The module-level ``kiwi`` analyzer is given ``user_dic.txt``, prepared,
    and then run over the lines loaded from ``test.txt``. The results are
    unpacked but not used further (the print is disabled).
    """
    kiwi.load_user_dictionary('user_dic.txt')
    kiwi.prepare()
    # Use the builtin `str`: the `np.str` alias it replaces has been removed
    # from NumPy.
    # NOTE(review): newer NumPy releases may also reject a newline delimiter
    # in `loadtxt` -- confirm against the installed version.
    data = np.loadtxt('test.txt', dtype=str, delimiter="\n")
    data_list = data.tolist()

    for result in kiwi.analyze(data_list, top_n=1):
        tokens, _ = result[0]
        # print(tokens)
# Notebook-style cells: inspect `data`, build a TF-IDF matrix from its
# 'token' column, then set up a Kiwi analyzer with a user dictionary.
# NOTE(review): `data` is defined outside this fragment; the calls below
# suggest a pandas DataFrame -- confirm.
data.head(2)

data['token'].isnull().sum()
# Replace null entries of the 'token' column with empty strings.
data['token'] = data['token'].fillna('')

#%% Build training vectors from the questions after morphological analysis
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data['token'])
# TF-IDF was fitted on the 'token' column above; show the matrix shape.
print(tfidf_matrix.shape)

# %%
from kiwipiepy import Kiwi
kiwi = Kiwi()
kiwi.load_user_dictionary(r'./userDict.txt')
kiwi.prepare()
def generate_morp_word(sentence,analyze_num):
    try:
        result = kiwi.analyze(sentence, analyze_num)
        print(result)
        morp_word_list =[]
        morp_nn_list=[]
        morp_vv_list=[]

        for i in range(0, analyze_num):
            morp_word = ''
            morp_nn=''
            morp_vv=''
            #print(i)
예제 #20
0
def test_bug_38():
    """Print the analysis of one sentence with allomorph integration on, then off."""
    text = "이 예쁜 꽃은 독을 품었지만 진짜 아름다움을 가지고 있어요"
    for integrate in (True, False):
        kiwi = Kiwi(integrate_allomorph=integrate)
        analysis = kiwi.analyze(text)
        print(analysis)
예제 #21
0
def test_bug_33():
    """Add '김갑갑' as an NNP user word and print the analysis of it repeated."""
    analyzer = Kiwi()
    analyzer.add_user_word('김갑갑', 'NNP')
    analysis = analyzer.analyze("김갑갑 김갑갑 김갑갑")
    print(analysis)
예제 #22
0
def test_new_analyze_multi():
    """Pass an open file to `Kiwi.analyze` and consume every result.

    The results themselves are discarded; the test only checks that the
    iteration completes.
    """
    kiwi = Kiwi()
    # Results are consumed inside the `with` block so the file is still open
    # while they are produced, and is closed afterwards.
    with open(curpath + '/test_corpus/constitution.txt',
              encoding='utf-8') as corpus:
        for _ in kiwi.analyze(corpus):
            pass
예제 #23
0
def test_tweet():
    """Analyze a multi-line tweet containing hashtags and emoji.

    There are no assertions; the call only has to complete.
    """
    tweet = '''#바둑#장기#오목 귀요미#보드판🐥
#어린이임블리의 놀이였는데, 이제는 가물갸물🙄모르겠
장이요~멍이요~ㅎㅎㅎ다시 한 번 재미를 붙여 보까ㅎ
할 일이 태산인데😭, 하고 싶은건 무궁무진🤦‍♀️ 큰 일이다'''
    analyzer = Kiwi()
    analyzer.analyze(tweet)