Example #1
import re
import khaiii as kh


def name_tokenize(sentence):
    api = kh.KhaiiiApi()
    api.open()
    # Replace special symbols with '.', then turn every '.' into a space.
    x = re.sub(r'[-\.\$\+>#\}\{\*<&@%;\\\)\(="?!\[\]~\^/:,_\|]+', '.',
               str(sentence).replace("'", "")).replace('.', ' ')

    res = []
    try:
        for word in api.analyze(x):
            if re.match(r'\d+', word.lex) is not None:
                # Pull out just '2000년대' from forms like '2000년대' and '2000년대의'.
                tmp = ''
                for m in word.morphs:
                    if m.tag in ['NNG', 'NNP', 'NNB', 'SN']:
                        tmp += m.lex
                res.append(tmp)
            else:
                a = len(res)
                for i in genre:  # `genre` is a module-level list of genre names
                    if i in word.lex:
                        # Genre names such as '발라드' or '메탈' found in the
                        # title are appended directly.
                        res.append(i)

                for j in artist:  # `artist` is a module-level list of artist names
                    # Append artist names of two or more characters that
                    # appear inside the word.
                    if j in word.lex and len(j) > 1:
                        res.append(j)
                b = len(res)
                if a == b:
                    for m in word.morphs:
                        # Accept only nouns, numbers, and foreign words.
                        if m.tag in ['NNG', 'NNP', 'NNB', 'SN', 'SL']:
                            res.append(m.lex)
    except Exception:
        # Skip titles that khaiii fails to analyze (e.g. empty input).
        pass

    return ' '.join(res)
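
A usage sketch, assuming `genre` and `artist` are module-level lists as the function expects; the sample values and output are illustrative:

genre = ['발라드', '메탈', '록']
artist = ['아이유', '방탄소년단']

print(name_tokenize('2000년대의 발라드 모음!'))  # e.g. '2000년대 발라드 모음'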
Example #2
import sys
import traceback


def preprocessing(data):
    try:
        from konlpy.tag import Okt, Kkma
        import khaiii
        # `opt` is a module-level config object holding the khaiii
        # shared-object and resource paths.
        khaiii_api = khaiii.KhaiiiApi(opt.khaiii_so_path)
        khaiii_api.open(opt.khaiii_path)

        kkma = Kkma()
        kkma_tokenizer = kkma.nouns
        twitter = Okt()
        okt_tokenizer = twitter.nouns

        # Each work item bundles: dataset class, input paths, split name,
        # output path, and the begin/end record offsets to process.
        cls, data_path_list, div, out_path, begin_offset, end_offset = data
        data = cls()
        data.load_y_vocab()
        data.preprocessing(data_path_list, div, begin_offset, end_offset,
                           out_path, okt_tokenizer, khaiii_api, kkma_tokenizer)
    except Exception:
        # Re-raise with the full traceback so failures inside worker
        # processes surface in the parent.
        raise Exception("".join(traceback.format_exception(*sys.exc_info())))
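
The tuple unpacked from `data` suggests this function runs as a multiprocessing worker; a minimal driver sketch, in which the `Data` class, chunk paths, and offsets are hypothetical:

from multiprocessing import Pool

jobs = [
    (Data, ['train.chunk.01'], 'train', 'out/part.0', 0, 500000),        # hypothetical
    (Data, ['train.chunk.01'], 'train', 'out/part.1', 500000, 1000000),  # hypothetical
]
with Pool(processes=2) as pool:
    pool.map(preprocessing, jobs)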
Example #3
import khaiii


def kakao_postagger_nn_finder(summary_text):
    api = khaiii.KhaiiiApi()
    api.open()
    nn_word_list = []
    for word in api.analyze(summary_text):
        # Concatenate the morphemes of each eojeol whose tag marks a noun
        # (N*), determiner (MM), number (SN), or foreign word (SL).
        complex_morphs = ''
        for m in word.morphs:
            if m.tag.startswith(('N', 'MM', 'SN', 'SL')):
                complex_morphs += m.lex

        # Keep only chunks of two or more characters.
        if len(complex_morphs) > 1:
            nn_word_list.append(complex_morphs)

    return nn_word_list
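
For example (the exact chunks depend on the khaiii model and dictionary):

print(kakao_postagger_nn_finder('카카오는 한국의 IT 기업이다.'))
# e.g. ['카카오', '한국', 'IT', '기업']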
Example #4
def sent2khaiii(data, tag=False):
    '''
    Khaiii morphological analyzer.

    data: input data (str or list/Series)
    tag: whether to include POS tags (False or True)

    Examples:
        sent2khaiii(["우리집에 왜 왔니", "왜 왔니"])
        >>> [['우리', '집', '에', '왜', '오', '았', '니'], ['왜', '오', '았', '니']]

        sent2khaiii(["우리집에 왜 왔니", "왜 왔니"], tag=True)
        >>> [['우리/NP', '집/NNG', '에/JKB', '왜/MAG', '오/VV', '았/EP', '니/EC'],
            ['왜/MAG', '오/VV', '았/EP', '니/EC']]
    '''
    import khaiii
    api = khaiii.KhaiiiApi()
    api.open()
    if isinstance(data, str):
        data = [data]
    # Split each sentence into morphemes (optionally as 'lex/TAG' strings);
    # blank entries pass through unchanged.
    return [[
        a.lex + "/" + a.tag if tag else a.lex
        for word in api.analyze(str(sent)) for a in word.morphs
    ] if str(sent).strip() else sent for sent in data]
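
Because `data` may also be a pandas Series, the same call works on a dataframe column; a small sketch assuming pandas is installed:

import pandas as pd

titles = pd.Series(['우리집에 왜 왔니', '왜 왔니'])
tokens = sent2khaiii(titles)            # [['우리', '집', ...], ['왜', ...]]
tagged = sent2khaiii(titles, tag=True)  # [['우리/NP', '집/NNG', ...], ...]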
Example #5
import unittest
import khaiii


class TestKhaiiiApi(unittest.TestCase):  # enclosing test class assumed; only setUp appears in the source
    def setUp(self):
        self._api = khaiii.KhaiiiApi()
        self._api.set_log_level('all', 'warn')
        self._api.open()
Example #6
import http.client, urllib.parse

from flask import (Flask, request, abort, render_template, Response, jsonify)
from flask_cors import CORS

import nltk
from wiktionaryparser import WiktionaryParser

from tagmap import TagMap
from chunker import Chunker

# ---------- initialize KHaiii morphological analyzer

# set up KHaiii api
import khaiii
khaiiiAPI = khaiii.KhaiiiApi()
khaiiiAPI.open()

# ---------- instantiate Flask (global) app  --------

parserApp = Flask('app',
                  static_folder="./dist/static",
                  template_folder="./dist")
CORS(parserApp)
#
parserApp.config.update(DEBUG=True,
                        SECRET_KEY="iu877hy3jnd8**yd98y334%$#Rjxhg6222",
                        SESSION_COOKIE_HTTPONLY=False)
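
One plausible way the global khaiiiAPI could then be used in a route; the endpoint name and response shape below are illustrative additions, not part of the original app:

@parserApp.route('/analyze', methods=['POST'])
def analyze_text():
    # Pull the text out of the JSON request body (tolerating a missing body).
    text = (request.get_json(silent=True) or {}).get('text', '')
    words = [{'word': w.lex,
              'morphs': [{'lex': m.lex, 'tag': m.tag} for m in w.morphs]}
             for w in khaiiiAPI.analyze(text)]
    return jsonify({'words': words})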


def run_dev_server():
Example #7
    def __init__(self):
        # Hand the khaiii analyze callable to the parent tokenizer class
        # (the parent class is not shown in this fragment).
        self.tokenizer = khaiii.KhaiiiApi()
        super().__init__(self.tokenizer.analyze)
Example #8
import khaiii

api = khaiii.KhaiiiApi()
api.open()


class Khaiii():
    def pos(self, phrase, flatten=True, join=False):
        """POS tagger.

        :param flatten: If False, preserves eojeols.
        :param join: If True, returns joined sets of morph and tag.

        """
        sentences = phrase.split('\n')
        morphemes = []
        if not sentences:
            return morphemes

        for sentence in sentences:
            for word in api.analyze(sentence):
                result = [(m.lex, m.tag) for m in word.morphs]
                if join:
                    result = [
                        '{}/{}'.format(m.lex, m.tag) for m in word.morphs
                    ]

                morphemes.append(result)

        if flatten:
            # Flatten the per-eojeol lists into a single list of morphemes.
            return sum(morphemes, [])
        return morphemes
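
A quick usage sketch of the wrapper above; the exact tags depend on the khaiii model:

tagger = Khaiii()
tagger.pos('아버지가 방에 들어가신다')                   # flat list of (lex, tag) pairs
tagger.pos('아버지가 방에 들어가신다', join=True)        # e.g. ['아버지/NNG', '가/JKS', ...]
tagger.pos('아버지가 방에 들어가신다', flatten=False)    # one list of pairs per eojeol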