        # (These lines sit inside a loop over lexicon entries; `data` is one
        #  whitespace-split line: ngram tokens followed by a sentiment value.
        #  A hedged reconstruction of the full loop follows the to_csv call below.)
        if len(data) > 2:
            KNU_df.loc[count, "ngram"] = " ".join(data[:-1])
        else:
            KNU_df.loc[count, "ngram"] = data[0]
        KNU_df.loc[count, "sentiment"] = data[-1]
        count += 1

print(KNU_df)
KNU_df.to_csv("KNU_lexicon.tsv",encoding="utf8",sep="\t", index=False)
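
# For context: a hedged reconstruction of the loop the truncated lines above appear
# to come from. Only the column names and the branch logic are taken from the
# fragment itself; the lexicon file name is an assumption, not from the source.
import pandas as pd

KNU_df = pd.DataFrame(columns=["ngram", "sentiment"])
count = 0
with open("SentiWord_Dict.txt", encoding="utf8") as lexicon:   # assumed file name
    for line in lexicon:
        data = line.split()
        if not data:
            continue
        if len(data) > 2:
            KNU_df.loc[count, "ngram"] = " ".join(data[:-1])
        else:
            KNU_df.loc[count, "ngram"] = data[0]
        KNU_df.loc[count, "sentiment"] = data[-1]
        count += 1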

'''
KNU_df = pd.read_csv("KNU_lexicon.tsv", encoding="utf8", sep="\t")

#pos_tagger = Kkma()
#print(pos_tagger.pos(u"함찬"))
#KNU_df["ngram"].apply(pos_tagger.pos)
print(KNU_df)

data = ";".join(SNU_df["ngram"]).split(";")
data = list(map((lambda x: list(x.split("/"))), data))

pos = sorted(set(np.array(data)[:, -1]))
kkma = sorted(set(Kkma().tagset.keys()))
komoran = sorted(set(Komoran().tagset.keys()))
hannanum = sorted(set(Hannanum().tagset.keys()))

# The value printed after "=" is the overlap: the number of tags the lexicon's
# tag set shares with each analyzer's tagset (len(a + b) - len(set(a + b))).
print(len(pos), "+", len(kkma), "=", len(pos + kkma) - len(set(pos + kkma)))
print(len(pos), "+", len(komoran), "=",
      len(pos + komoran) - len(set(pos + komoran)))
print(len(pos), "+", len(hannanum), "=",
      len(pos + hannanum) - len(set(pos + hannanum)))
Example No. 2


from konlpy.tag import Hannanum
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

file_name = input("엑셀 파일 이름을 확장자 제외하고 넣으세요 \n")  # "Enter the Excel file name, without the extension"
number = int(input("상위 n개의 단어를 워드클라우드로 만듭니다. n을 넣으세요. ex)10 \n"))  # "Builds a word cloud of the top n words. Enter n, e.g. 10"

f = open(RESULT_PATH + file_name + '.txt', 'r', encoding="UTF-8")  # re-read the file (RESULT_PATH is defined elsewhere)
data = f.read()

try:
    engine = Hannanum()
    nouns = engine.nouns(data)

    nouns = [ n for n in nouns if len(n) > 1 ]
    count = Counter(nouns)
    tags = count.most_common(number)
    print(tags)

    wordcloud = WordCloud(font_path="C:/Windows/Fonts/malgun.ttf",
                      background_color='white', width=640, height=480)

    wordcloud.generate_from_frequencies(dict(tags))

    fig = plt.figure()
    plt.axis('off')
    plt.imshow(wordcloud)
Example No. 3
def get_tags(text, ntags=50, multiplier=10):
    h = Hannanum()
    nouns = h.nouns(text)
    count = Counter(nouns)
    return [{ 'color': color(), 'tag': n, 'size': c*multiplier }\
                for n, c in count.most_common(ntags)]
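
# get_tags relies on a color() helper (plus the Hannanum and Counter imports) that
# are not shown in this excerpt. A usage sketch under those assumptions; the palette
# and the sample sentence are illustrative only.
import random

def color():
    return random.choice(['#1f77b4', '#ff7f0e', '#2ca02c'])

print(get_tags("한국어 명사 빈도로 태그 구름을 만들어 봅니다", ntags=5))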
Example No. 4
# Use pyplot from the installed external matplotlib library, aliased as plt
import matplotlib.pyplot as plt

# Use Hannanum from the installed external konlpy library
from konlpy.tag import Hannanum

# Python's built-in regular-expression module, used below to clean the text
import re

# STOPWORDS from the wordcloud package, needed for the stopwords set further down
from wordcloud import STOPWORDS

# Text to analyze; a single space is used as the word separator
text = open("contents.txt", encoding="UTF-8").read()

print(text)

# Put a Hannanum instance into the myHannanum variable
myHannanum = Hannanum()

# Remove special characters to improve the accuracy of the word analysis
# Any special character from the keyboard's top number row is replaced with a single space
replace_text = re.sub("[!@#$%^&*()_+]", " ", text)

# Print the text with the special characters removed
print(replace_text)

# The noun analysis returns its result as a list of words, so join() is used to turn
# the list back into a single string, which is stored in analysis_text
analysis_text = (" ".join(myHannanum.nouns(replace_text)))

# Add unwanted words to the stopwords variable
stopwords = set(STOPWORDS)
# stopwords.add("분석")
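
# The snippet is cut off before the cloud is actually rendered. A minimal
# continuation sketch reusing analysis_text and stopwords from above; the Malgun
# Gothic font path is an assumption, not taken from the original.
from wordcloud import WordCloud

wc = WordCloud(font_path="C:/Windows/Fonts/malgun.ttf", stopwords=stopwords,
               background_color="white", width=640, height=480)
wc.generate(analysis_text)
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")
plt.show()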
Example No. 5
from konlpy.tag import Hannanum
import numpy as np
import keras.preprocessing.text

hashing_dimension = 1000

# bad programming practice lol
global_hannanum = Hannanum()


def tokenize(text):
    return list(p[0] + "/" + p[1] for p in global_hannanum.pos(text))


def read_data_file(data_filename):
    rows = []
    with open(data_filename, 'r') as file:
        rows = file.readlines()

    parsed = list(map(parse_row, rows))
    (X, y) = divide_to_xy(parsed)
    return (hash_data(X), hash_data(y))


# returns a tuple of form
# sbj entity
# obj entity
# relation
# original source sentence (i.e. sbj and obj tags replaced), tokenized
def parse_row(data_row):
    parts = data_row.split("\t")
Example No. 6
@author: Ri
"""

from konlpy import data ###how to update user dictionary
from konlpy.tag import Kkma, Hannanum
from konlpy.utils import pprint
from collections import Counter
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

os.getcwd()
os.chdir("C:/Users/Ri/Dropbox/paper/python_result")
h = Hannanum()  # Hannanum dictionary (KAIST)
k = Kkma()  # Kkma morphological analyzer (Seoul National University IDS)


def mergingNews(filepath):
    news=pd.DataFrame()   
    for filename in os.listdir(filepath):               
        print(filename)
        loc = filepath + '/' + filename 
        f = pd.read_csv(loc, encoding='UTF8')
        f['datetime']=f['datetime'].apply(lambda x: x[:10])
        f['body']=f['body'].map(lambda x: x.replace(u"// flash 오류를 우회하기 위한 함수 추가 function _flash_removeCallback() {}",""))
        f['contents']=f['title'].map(str)+"/"+f['body']
        f['filename']=filename[:-4]
        news=news.append(f)
#        news.to_csv("news.csv")
Example No. 7
        # Anything outside the final-consonant (jongsung) range is filled with end_char.
        if jongsung_index == 0:
            jongsung = end_char
        
        result.append(chosung)
        result.append(joongsung)
        result.append(jongsung)

    return "".join(result)
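
# For reference: a minimal, self-contained sketch of the syllable-to-jamo index
# arithmetic the truncated function above relies on. The function name and the
# None return for non-syllables are assumptions.
HANGUL_BASE = 0xAC00                      # code point of '가'
JOONGSUNG_COUNT, JONGSUNG_COUNT = 21, 28  # 19 initials * 21 medials * 28 finals

def jamo_indices(char):
    code = ord(char) - HANGUL_BASE
    if not 0 <= code < 19 * JOONGSUNG_COUNT * JONGSUNG_COUNT:
        return None                                    # not a composed Hangul syllable
    chosung_index = code // (JOONGSUNG_COUNT * JONGSUNG_COUNT)
    joongsung_index = (code % (JOONGSUNG_COUNT * JONGSUNG_COUNT)) // JONGSUNG_COUNT
    jongsung_index = code % JONGSUNG_COUNT             # 0 means "no final consonant"
    return chosung_index, joongsung_index, jongsung_index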

# khaiii
khaiii = KhaiiiApi()
def khaiii_tokenize(text):
    tokens = []
    for word in khaiii.analyze(text):
        tokens.extend([str(m).split('/')[0] for m in word.morphs])
    return tokens

# konlpy tokenizers
mecab = Mecab().morphs
okt = Okt().morphs
komoran = Komoran().morphs
hannanum = Hannanum().morphs  # raises an error
kkma = Kkma().morphs

def space_tokenizer(text):
    return text.split(' ')

def char_tokenizer(text):
    return [t for t in text]
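
# A short usage sketch comparing the tokenizers defined above on one arbitrary
# sentence. Hannanum is skipped because the comment above notes it raises an error.
sample = "아버지가 방에 들어가신다"
for name, tok in [("khaiii", khaiii_tokenize), ("mecab", mecab), ("okt", okt),
                  ("komoran", komoran), ("kkma", kkma),
                  ("space", space_tokenizer), ("char", char_tokenizer)]:
    print(name, tok(sample))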
Example No. 8
    def wordcloud(self, period, topic):
        """
        Build and return a word cloud from the word frequencies of one period's comments.
        :return: wordcloud image
        """

        # for now, just load the text from a file
        text = ''
        THIS_FOLDER = os.path.dirname(os.path.abspath(__file__))
        file_reactions = os.path.join(THIS_FOLDER, "%s_%d.txt" % (topic, period))
        # with open(os.path.join(settings.BASE_DIR, file_reactions), encoding="utf-8") as f:
        with open(file_reactions, encoding="utf-8") as f:
            text = f.read()
            #first_char = f.read(1)
            if not text:
                print("file is empty")
                # delete the temporary text file
                os.remove(file_reactions)
                return
            else:
                print("file is full")

        # create the Hannanum instance
        hannanum = Hannanum()

        # noun extraction
        nouns = hannanum.nouns(text)
        # print(nouns)

        # midle_time = time.time()

        # keep only nouns with length >= 2
        words = []
        for n in nouns:
            if len(n) > 1:
                words.append(n)

        # count frequencies
        count = Counter(words)
        #count = Counter(nouns)

        # take the top 100
        most = count.most_common(100)

        # build a dictionary
        tags = {}
        for n, c in most:
            tags[n] = c

        # generate the word cloud
        print(topic)
        file_name = (topic + "_%d" % self.model_topic.id + "_%d.png" % period)  # with two topics the file name becomes e.g. 'topic1_topic2_10_1.png'
        mask = np.array(Image.open(os.path.join(THIS_FOLDER, 'mask_img_512px.png')))
        wc_image_gen = wc(font_path=os.path.join(THIS_FOLDER, 'NanumSquareEB.ttf'),
                            mask=mask,
                          background_color='white'
                          ).generate_from_frequencies(tags)   # what if there is not a single comment for the word cloud..
        wc_image_gen.to_file(os.path.join(THIS_FOLDER, file_name))  # write the image file to that path

    #file_reactions = os.path.join(THIS_FOLDER, "%s_%d.txt" % (topic, period))
        with open(os.path.join(THIS_FOLDER, file_name), 'rb') as tmp_file:
            # tmp_wc = WordCloud(period=period, topic=self.model_topic)
            # should be fetched with get -> stored in the Django DB
            tmp_wc = WordCloud.objects.filter(topic=self.model_topic, period=period).first()
            tmp_wc.wcphoto.save(file_name, File(tmp_file))

        # delete the temporary image file
        os.remove(os.path.join(THIS_FOLDER, file_name))
        # delete the temporary text file
        os.remove(file_reactions)
        """
Example No. 9
from konlpy.tag import Mecab, Okt, Komoran, Hannanum, Kkma
from khaiii_pos.khaiii_pos import Khaiii
import json


tokenizers = {
    'Mecab':Mecab(),
    'Okt':Okt(),
    'Komoran':Komoran(),
    'Hannanum':Hannanum(),
    'Kkma':Kkma(),
    'Khaiii':Khaiii()
}

corpus_fname = 'raw/korquad/KorQuAD_v1.0_train.json'

with open(corpus_fname) as f_corpus:
    dataset_json = json.load(f_corpus)
    dataset = dataset_json['data']

    for i, article in enumerate(dataset):
        # w_lines = []
        if i > 10 :
            break

        for paragraph in article['paragraphs']:
            print('--------------------------------------')
            print('[ORIGINAL] ', paragraph['context'])

            for key, tokenizer in  tokenizers.items():
                print('[%8s] %s' % (key, tokenizer.pos(paragraph['context'])))
Example No. 10
class konlpy():
    os.environ[
        "NLS_LANG"] = ".AL32UTF8"  # UTF-8 : .AL32UTF8, CP949 : .KO16MSWIN949
    con = pymongo.MongoClient("localhost", 27017)['jcjc']
    #     conn = cx_Oracle.connect('bigdata/admin1234@localhost:1521/xe') # connect to the Oracle server

    # userdic: user dictionary holding the entries to exclude from word extraction
    komoran = Komoran(
        userdic='C:\\Project\\morphemeAnalysis\\com\\test\\user_dic.txt')
    hannanum = Hannanum()

    def analyze(self):
        count = 0
        for item in self.con['bill'].find(
            {"$or": [{
                "proposer": "이동섭"
            }, {
                "proposer": "유승민"
            }]}):  #, {"analysisCheck":"0"}):
            count += 1
            billid = item.get('bill_id')
            billname = item.get('bill_name')
            summary = item.get('summary')

            # Komoran raises an error when the text contains empty lines
            summary = summary.replace("\r",
                                      "").replace("\n",
                                                  " ").replace("\t", " ")
            summary = summary.replace("?", "ㆍ").replace(",", "")

            print(count, "번째")
            print(billname)
            print(summary)
            # print(item.get('summary').replace("\n", " ").replace("?", "ㆍ"))

            # noun extraction
            nouns = self.komoran.nouns(summary)
            print("len(nouns) :", len(nouns))

            cnt = Counter(nouns)
            result_list = cnt.most_common(len(nouns))
            print("len(result_list) :", len(result_list))

            # convert the list result into a dict
            result_dict = {}  # create the dict
            for i in range(len(result_list)):
                key = result_list[i][0]  # word
                value = result_list[i][1]  # count
                result_dict[key] = value


#             pprint(result_dict)

            row = {}  # create the dict
            row['bill_no'] = item.get('bill_no')
            row['politician_no'] = item.get('politician_no')
            row['words'] = result_dict

            pprint(row)

            self.con['billAnalysis'].insert_one(row)
            print("==========================================================")

        print("요시! 입력 완료!")
Example No. 11

        # (tail of a Selenium infinite-scroll loop: stop once the page height stops growing)
        if a == wd.execute_script('return document.querySelector("div.css-111v2fw > div.css-0").scrollHeight'):
            break
        a = wd.execute_script('return document.querySelector("div.css-111v2fw > div.css-0").scrollHeight')

    except:
        break

end=time.time()

print((end-start)/60)

html_src = wd.page_source
soup=BeautifulSoup(html_src, 'html.parser')
comment=soup.find(class_='css-111v2fw').find_all(class_='css-cqdvbr')

extractor = Hannanum() # 얘를 써서 word 지도를 만들자!

nouns = []

# 전처리

for i in comment :
    if len(i.text) > 1 :
        if ('더보기' in i.text) & ('삭제되었습니다' not in i.text) :
            # print(i.text[:-3], end='\n\n')
            nouns.extend(extractor.nouns(i.text))
            
        elif '삭제되었습니다' not in i.text:
            # print(i.text, end='\n\n')
            nouns.extend(extractor.nouns(i.text))
        
Example No. 12
def calc_cfd(doc):
    # Calculate conditional frequency distribution of bigrams
    words = [w for w, t in Hannanum().pos(doc)]
    bigrams = nltk.bigrams(words)
    return nltk.ConditionalFreqDist(bigrams)
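
# Usage sketch, assuming nltk, the Hannanum import and a Korean document string
# `doc` are available; the query word is arbitrary.
cfd = calc_cfd(doc)
print(cfd["대한민국"].most_common(3))   # the 3 words seen most often right after "대한민국"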
Example No. 13
def resultControl():
    if(request.method == 'POST'):

        # receive sentenceId & wav file from the client
        wav = request.files['receiveFile']
        filename = request.form['fileName']
        sentenceId = request.form['sentenceId']

        # save into the upload directory
        wav.save(FILE_DIRECTORY + secure_filename(wav.filename))

        ##### run STT on the file in the upload directory

        # temporary path
        args = easydict.EasyDict({"local_file_path": "./uploadFile/"+filename})

        # TODO Credential Error
        # print(sample_recognize(args.local_file_path))


        # fetch the standard pronunciation text from the DB via sentenceId
        Pick_sentence = db_session.query(Sentence).filter(Sentence.sentenceId == sentenceId).first()

        receiveSTTData = sample_recognize(args.local_file_path)
        receiveData = similaritySentence(receiveSTTData, Pick_sentence.standard)
        #receiveData = "날시가 참 말따"
        print("STT result : ", receiveData)

        # print(Pick_sentence)
        ##### analysis algorithm

        hannanum = Hannanum()

        StandardSentence = Pick_sentence.standard
        sentenceData = Pick_sentence.sentenceData

        # build lists of blank (space) indices
        # the blank counts are expected to match
        userBlankList = [i for i, value in enumerate(receiveData) if value == " "]
        standardBlankList = [i for i, value in enumerate(StandardSentence) if value == " "]
        # print(BlankList)

        # if the string lengths or the blank counts differ,
        # ask the client to retry
        if (len(receiveData) != len(StandardSentence) or len(userBlankList) != len(standardBlankList)):
            os.remove("./uploadFile/"+filename)

            return jsonify(
                status="failure",
                resultData=receiveData,
                errorMessage="repeat",
            )

        # remove the blanks
        UserSentence = receiveData.replace(" ", "")
        StandardSentence = StandardSentence.replace(" ", "")
        sentenceData = sentenceData.replace(" ", "")

        # print(UserSentence)
        # print(StandardSentence)

        Total_pho = 0                 # total number of phonemes
        Wrong_total_pho = 0           # number of wrong phonemes
        Wrong_word_index_list = []    # indices of the characters that are wrong
        Wrong_word_list = []          # data for the wrong words
        Wrong_pho_dict = {'u': {},    # dictionary of the wrong phonemes
                          'm': {},    # u: initial consonant, m: vowel,
                          'b': {}}    # b: final consonant


        for index, standard in enumerate(StandardSentence):
            StandPho = phonemeConvert(standard)

            # the characters match
            if(UserSentence[index] == standard):
                if(StandPho[2] == ' '):
                    Total_pho += 2
                else:
                    Total_pho += 3
            # the characters do not match:
            # decompose into phonemes
            else:
                Wrong_word_index_list.append(index)
                UserPho = phonemeConvert(UserSentence[index])
                SentencePho = phonemeConvert(sentenceData[index])

                if(UserPho[2] == ' ' and StandPho[2] == ' '):
                    Total_pho += 2
                else:
                    Total_pho += 3

                    if(UserPho[2] != StandPho[2]):
                        Wrong_total_pho += 1

                        if StandPho[2] != ' ':
                            if StandPho[2] in Wrong_pho_dict['b']:
                                Wrong_pho_dict['b'][SentencePho[2]] += 1
                            else:
                                Wrong_pho_dict['b'][SentencePho[2]] = 1

                if (UserPho[0] != StandPho[0]):
                    Wrong_total_pho += 1
                    if StandPho[0] in Wrong_pho_dict['u']:
                        Wrong_pho_dict['u'][SentencePho[0]] += 1
                    else:
                        Wrong_pho_dict['u'][SentencePho[0]] = 1

                if (UserPho[1] != StandPho[1]):
                    Wrong_total_pho += 1
                    if StandPho[1] in Wrong_pho_dict['m']:
                        Wrong_pho_dict['m'][SentencePho[1]] += 1
                    else:
                        Wrong_pho_dict['m'][SentencePho[1]] = 1

        # print(Wrong_pho_dict)


        ######### increment the counts of the wrong phonemes in the record table -> TEST SUCCESS
        for type in Wrong_pho_dict:
            for pho in Wrong_pho_dict[type]:
                # print(pho)
                updateData = db_session.query(Record).filter(Record.recordType == type)\
                                                    .filter(Record.recordData == pho).first()
                # print(updateData.type, updateData.recordData)
                updateData.count += Wrong_pho_dict[type][pho]
                db_session.commit()



        # match rate
        Correct_rate = round(1 - (Wrong_total_pho / Total_pho), 4)
        """
        # if the match rate is 100%
        if Correct_rate == 1:
            os.remove("./uploadFile/" + filename)

            return jsonify(
                status="perfect",
                resultData=receiveData,
                score=Correct_rate,
            )
        """
        # print(Wrong_word_list)

        # after the change
        # print(Wrong_word_index_list)
        sentenceData_split = Pick_sentence.sentenceData.split()
        # print(sentenceData_split)
        # pick the words that contain a wrong index
        word_start_point = 0
        for sentence_word in sentenceData_split:
            word_end_point = word_start_point + len(sentence_word)-1

            # print(word_start_point, word_end_point)

            for wrong_index in Wrong_word_index_list:
                if word_start_point <= wrong_index and word_end_point >= wrong_index:
                    word_to_pos = hannanum.pos(sentence_word)

                    # print(word_to_pos)
                    wrong_word_pho_list = phonemeConvert(sentenceData[wrong_index])
                    # print(wrong_word_pho_list)
                    for pos in word_to_pos:
                        # TODO: what if the wrong word contains several N or P morphemes??
                        if pos[1] == 'N' or pos[1] == 'P':
                            for pos_word in pos[0]:
                                pos_word_pho_list = phonemeConvert(pos_word)
                                # print(pos_word_pho_list)
                                if wrong_word_pho_list[0] == pos_word_pho_list[0]:
                                    Wrong_word_list.append(pos)

                    break

            word_start_point += len(sentence_word)

        print(Wrong_word_list)

        # convert the wrong-character indices back to positions in the original (spaced) sentence
        for i in userBlankList:
            for index, j in enumerate(Wrong_word_index_list):
                if(j >= i):
                    Wrong_word_index_list[index] += 1

        # print(Wrong_word_index)

        ######## save the result to the result table -> TEST SUCCESS
        resultData = Result(stid=sentenceId, rsdata=receiveData, score=Correct_rate)
        db_session.add(resultData)
        db_session.commit()


        # if the match rate is 100%
        if Correct_rate == 1:
            os.remove("./uploadFile/" + filename)

            return jsonify(
                status="perfect",
                resultData=receiveData,
                score=Correct_rate,
            )

        ######## one recommended sentence for the most frequently missed word

        recommend_OtoD = dict(sentenceId=-1, sentenceData="", standard="")
        recommend_word = ""

        # if the wrong-word list is not empty
        if Wrong_word_list:
            random.shuffle(Wrong_word_list)

            for random_select_word in Wrong_word_list:
                Word_query = db_session.query(Word).filter(Word.wordData == random_select_word[0])\
                    .filter(Word.wordType == random_select_word[1]).filter(Word.sentenceId != sentenceId)
                Word_entry = [pq.sentenceId for pq in Word_query]

                if Word_entry:
                    recommend_word = random_select_word[0]
                    if random_select_word[1] == 'P':
                        recommend_word += '다'
                    random_select_setencdId = random.choice(Word_entry)
                    Recommend_sentence = db_session.query(Sentence).filter(Sentence.sentenceId == random_select_setencdId).first()
                    recommend_OtoD['sentenceId'] = Recommend_sentence.sentenceId
                    recommend_OtoD['sentenceData'] = Recommend_sentence.sentenceData
                    recommend_OtoD['standard'] = Recommend_sentence.standard
                    break

    os.remove("./uploadFile/" + filename)

    # list of wrong words to send back in the response
    wordList = []

    for w in Wrong_word_list:
        if w[1] == "P":
            wordList.append(w[0]+"다")
        else:
            wordList.append(w[0])

    # bundle all result data as JSON
    return jsonify(
        status = "success",
        score = Correct_rate,
        wordList = wordList,
        userBlank = userBlankList,
        standardBlank = standardBlankList,
        wrongIndex = Wrong_word_index_list,
        resultData = receiveData
    )
Example No. 14

from konlpy import data  ### how to update the user dictionary
from konlpy.tag import Kkma, Hannanum
from konlpy.utils import pprint
from collections import Counter
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt
from datetime import datetime 
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

os.getcwd()
os.chdir("C:/Users/Ri/Dropbox/paper/python_result")
h = Hannanum()  # use the Hannanum dictionary

#kkma=Kkma()


# Read text files and generate a term-document matrix (TDM)
def buildTDM(path):
    
    tdm_f=pd.DataFrame({'Term':[]})
    
    for filename in os.listdir(path):        
        
        #Read text files
        loc = path + '/' + filename 
        f = open(loc, 'rt',encoding='UTF8')
        contents = f.read()
Example No. 15
from collections import Counter

from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot
def draw_zipf(count_list, filename, color='blue', marker='o'):
    sorted_list = sorted(count_list, reverse=True)
    pyplot.plot(sorted_list, color=color, marker=marker)
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.savefig(filename)


doc = kolaw.open('constitution.txt').read()
pos = Hannanum().pos(doc)
cnt = Counter(pos)

print('nchars  :', len(doc))
print('ntokens :', len(doc.split()))
print('nmorphs :', len(set(pos)))
print('\nTop 20 frequent morphemes:'); pprint(cnt.most_common(20))
print('\nLocations of "대한민국" in the document:')
concordance(u'대한민국', doc, show=True)

draw_zipf(cnt.values(), 'zipf.png')
Example No. 16
from .models import Log
from konlpy.tag import Hannanum
import re

hnn = Hannanum()
Example No. 17
def hannanum_instance():
    from konlpy import init_jvm
    from konlpy.tag import Hannanum
    init_jvm()
    h = Hannanum()
    return h
Example No. 18
    def __init__(self):
        self.komor = Komoran()
        self.hannan = Hannanum()
        self.kkma = Kkma()
        self.okt = Okt()
        self.news_list = []
Example No. 19
from homi import Service
from konlpy.tag import Hannanum

from .utils import attachThreadToJVM, check_options
from konlpy_homi.api.v0alpha import hannanum_pb2
from typed import StringArrayResponse, TupleDoubleArrayResponse, TupleArrayResponse

HANDLE_OPTIONS = {"ntags", "flatten", "join"}


def hannanum_option_checker(options: dict):
    check_options(HANDLE_OPTIONS, options)


engine = Hannanum()

hannanum_svc = Service(hannanum_pb2._HANNANUM)


@hannanum_svc.method()
@attachThreadToJVM
def Pos(payload: str, options: dict = None, **kwargs) -> TupleArrayResponse:
    options = options or {}
    hannanum_option_checker(options)
    resp = TupleArrayResponse(options=options, results=[])
    if options.get("join", False):
        resp['results'] = [{
            'keyword': keyword,
            'tag': None
        } for keyword in engine.pos(payload, **options)]
    else:
Example No. 20
# python version 3.5
# requirement library
# jpype1  version 0.6.2
# konlpy version 0.4.4

# system
# Linux Mint 18.1 Serena
# Linux version 4.4.0-53-generic (buildd@lcy01-28) (gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.4) ) #74-Ubuntu SMP Fri Dec 2 15:59:10 UTC 2016
# java version "1.7.0_80"
# Java(TM) SE Runtime Environment (build 1.7.0_80-b15)
# Java HotSpot(TM) 64-Bit Server VM (build 24.80-b11, mixed mode)

from konlpy.tag import Hannanum
from konlpy.tag import Twitter

H = Hannanum()
T = Twitter()
filter_pronouns = [
    '나', '너', '우리', '저', '저희', '그', '그녀', '그것', '것', '자기', '자네', '누구', '누구나',
    '아무', '아무나', '내'
]


def filtering_data(data: str, filter: list):
    '''
    :param data: text data (type: str)
    :param filter: words you do not want. If `filter` does not contain pronouns, the
        filtered text list will still contain pronouns, so
        filter = filter + scaling.filter_pronouns is recommended.
    :return: filtered text list (type: str list)
    '''
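    # Hedged continuation sketch; the original body is cut off in the listing.
    # Treating "filtering" as dropping unwanted words from the Hannanum noun list
    # is an assumption. H is the Hannanum instance created above.
    return [w for w in H.nouns(data) if w not in filter]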
Example No. 21
def _count(text):
    h = Hannanum()
    nouns = h.nouns(text)
    counted = Counter(nouns)
    return counted
Example No. 22
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from konlpy.tag import Hannanum
from konlpy.utils import pprint
# When the text is processed as TF-IDF,
# splitting into sentences only matters when analyzing a single call.
# For many calls (around 1,000), each whole call can simply be treated as one sentence.
from nltk.tokenize import sent_tokenize

from time import time

tagger = Hannanum()

tag_filter = ['NC', 'NQ', 'NB', 'NN', 'NP', 'PV', 'F']


# tagging and tokenizer
def _tokenizer(text):
    tokens = [
        '/'.join(tagged_word) for tagged_word in tagger.pos(text, ntags=22)
        if tagged_word[1] in tag_filter
    ]
    return tokens
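
# A brief sketch of wiring this tokenizer into the imported TfidfVectorizer;
# lowercase=False is an assumption so the appended POS tags are left untouched.
vectorizer = TfidfVectorizer(tokenizer=_tokenizer, lowercase=False)
# tfidf_matrix = vectorizer.fit_transform(sents)   # once `sents` is filled below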


sents = []  # variable holding the whole document collection

for i in range(10):
    filename = "../data/news_data/news" + str(i) + ".txt"
    f = open(filename, "r")
Example No. 23

    def __init__(self) -> None:
        super().__init__()

        self.hannanum = Hannanum()
Example No. 24
def root():
    url = 'http://chatsvr.run.goorm.io/message/@all'
    response = urllib.request.urlopen(url).read().decode('utf-8')
    json_return = json.loads(response)
    data = DF(json_return)

    li = ' '.join(data['text'])
    li = li.split()

    file = open("static/stopwords.txt")
    stop = file.readlines()
    for i in range(len(stop)):
        stop[i] = stop[i].strip()

    result = []
    for i in li:
        if i not in stop:
            result.append(i)

    li2 = ' '.join(result)
    li3 = re.sub(r'[!@#$%^&*()+\[\]{}♥:;?/-]', '', li2)  # escape [ and ] so the character class is not closed early
    li3 = re.sub('[ㄱ-ㅎㅏ-ㅣ0-9a-zA-Z]', '', li3)
    li3 = re.sub('[♥&;#]', '', li3)

    t = Hannanum()
    key = t.nouns(li3)
    # drop single-character nouns (use a comprehension; pop() while iterating skips items)
    key = [v for v in key if len(v) >= 2]

    count = Counter(key)
    tags = count.most_common(50)

    #num = Counter(key)

    font_path = 'static/NotoSans-Black.otf'
    wordcloud = WordCloud(font_path=font_path,
                          background_color="white",
                          width=800,
                          height=800).generate_from_frequencies(dict(tags))
    fig = plt.figure(figsize=(6, 6))
    plt.imshow(wordcloud)
    plt.axis('off')
    fig.savefig('static/wordcloud.png')

    #fig2 = plt.figure(figsize=(6,6))
    data2 = DF(tags)
    data2.columns = ['sym', 'freq']

    trace1 = go.Bar(x=data2['sym'], y=data2['freq'])
    dataset = [trace1]
    layout = go.Layout()
    fig2 = go.Figure(data=dataset, layout=layout)

    #fig2.wirte_image('static/plot2.png')
    #po.write_html(fig2, file="static/plot.html")
    #fig2 = pyo.iplot(fig2)
    #po.write_image(fig2, file='static/plot2.png')
    #plot=Image(po.to_image(fig2, format="png"), width=576, height=576)

    fig2.write_image("static/plot.png", width=800, height=800)

    return render_template('wordcloud.html', time=time.time())
Example No. 25
        for tag in com_date_info:
            a = tag.select('a')
            for ext_tag in a:
                ext_tag.extract()
            span = tag.select('span')
            for ext_tag in span:
                ext_tag.extract()
            date_info.append(tag.getText().strip())

        source = ar.select_one("dl > dd:nth-of-type(2)").text
        source = source.replace(",", "")
        #print(title.text.replace(",", "") +'\n' +com_info  +" / "+ date_info[0]  +'\n' +title.attrs['href']+ "\n"+ ":" + source + '\n')
        han1 += (title.text.replace(",", "") + '\n' + com_info + " / " +
                 date_info[0] + '\n' + title.attrs['href'] + "\n" + ":" +
                 source + '\n')
han = Hannanum()
text = han.nouns(han1)
text = " ".join(text)
mask = np.array(Image.open('./data/back.png'))
wordcloud = WordCloud(
    width=6000,
    height=3000,
    font_path='/usr/share/fonts/truetype/nanum/NanumGothic_Coding.ttf',
    max_font_size=100,
    background_color='white',
    mask=mask).generate(text)

fig = plt.figure(figsize=(60, 30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('cloud.png')
Example No. 26
def token(request):
    try:
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc1.txt',
                'r',
                encoding='UTF-8') as r:
            doc1 = r.readlines()
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc2.txt',
                'r',
                encoding='UTF-8') as r:
            doc2 = r.readlines()
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc3.txt',
                'r',
                encoding='UTF-8') as r:
            doc3 = r.readlines()
        hangul = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
        doc1_ko = hangul.findall(doc1[0])  # extract only the Hangul
        doc2_ko = hangul.findall(doc2[0])
        doc3_ko = hangul.findall(doc3[0])

        str1 = ''  # list -> str
        str2 = ''
        str3 = ''
        for x in doc1_ko:
            str1 += x + ' '
        for x in doc2_ko:
            str2 += x + ' '
        for x in doc3_ko:
            str3 += x + ' '

        hannanum = Hannanum()  # set up the analyzer
        doc1_ko_nouns_list = hannanum.nouns(str1)
        doc2_ko_nouns_list = hannanum.nouns(str2)
        doc3_ko_nouns_list = hannanum.nouns(str3)

        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/stop_word_ko.txt',
                'r',
                encoding='UTF-8') as r:
            stopword_ko = r.readlines()

        stwd = []
        for x in stopword_ko:  # strip the trailing \n
            stwd.append(x.replace('\n', ''))

        # remove Korean stopwords (comprehensions are used because calling
        # remove() while iterating over the same list skips elements)
        doc1_ko_nouns_list = [x for x in doc1_ko_nouns_list if x not in stwd]
        doc2_ko_nouns_list = [x for x in doc2_ko_nouns_list if x not in stwd]
        doc3_ko_nouns_list = [x for x in doc3_ko_nouns_list if x not in stwd]

        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/doc1_ko.txt',
                'w',
                encoding='UTF-8') as w:
            for x in doc1_ko_nouns_list:  # save the Korean nouns
                if x == '.' or x == ',':
                    continue
                w.write(x + ' ')

        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/doc2_ko.txt',
                'w',
                encoding='UTF-8') as w:
            for x in doc2_ko_nouns_list:  # save the Korean nouns
                if x == '.' or x == ',':
                    continue
                w.write(x + ' ')

        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/doc3_ko.txt',
                'w',
                encoding='UTF-8') as w:
            for x in doc3_ko_nouns_list:  # save the Korean nouns
                if x == '.' or x == ',':
                    continue
                w.write(x + ' ')

        doc1_en = hangul.sub('', doc1[0])  # strip the Hangul before analyzing the English
        doc2_en = hangul.sub('', doc2[0])
        doc3_en = hangul.sub('', doc3[0])
        token_doc1 = word_tokenize(doc1_en)  # tokenize only the English
        token_doc2 = word_tokenize(doc2_en)
        token_doc3 = word_tokenize(doc3_en)
        if len(doc1) == 1:  # write the tokenized files only when readlines() returned a single line
            with open(
                    'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc1.txt',
                    'a',
                    encoding='UTF-8') as w:
                w.write('\n')
                for x in token_doc1:
                    if x == '.' or x == ',':  # drop periods and commas
                        continue
                    w.write(x.lower() + ' ')  # convert to lowercase with .lower()
            with open(
                    'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc2.txt',
                    'a',
                    encoding='UTF-8') as w:
                w.write('\n')
                for x in token_doc2:
                    if x == '.' or x == ',':
                        continue
                    w.write(x.lower() + ' ')
            with open(
                    'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc3.txt',
                    'a',
                    encoding='UTF-8') as w:
                w.write('\n')
                for x in token_doc3:
                    if x == '.' or x == ',':
                        continue
                    w.write(x.lower() + ' ')

        return render(
            request, 'oper/processing.html', {
                'process': '토큰화',
                'token_doc1': token_doc1,
                'token_doc2': token_doc2,
                'token_doc3': token_doc3
            })
    except FileNotFoundError:
        return redirect('/home')
Example No. 27
from wordcloud import WordCloud
from collections import Counter
from re import match
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from konlpy.tag import Hannanum

data = pd.read_csv("/Users/doyun/2020_text_mining/jobkorea_data.csv")

hannanum = Hannanum()

ap_progra = data.loc[data['직무분야'] == "ERP·시스템분석·설계", "답변"]
nouns = hannanum.nouns(''.join(str(ap_progra.fillna(''))))
nouns = [n for n in nouns if len(n) > 1]
nouns = [n for n in nouns if not (match('^[0-9]', n))]
count = Counter(nouns)
top = count.most_common(40)

my_font_path = '/Users/doyun/Library/Fonts/NanumBarunGothic.ttf'

wordcloud = WordCloud(font_path=my_font_path,
                      background_color='white',
                      width=800,
                      height=600)
cloud = wordcloud.generate_from_frequencies(dict(top))
plt.figure(figsize=(10, 8))
plt.axis('off')
plt.imshow(cloud)
Example No. 28
def Testhansuk():
    #data = "
    return data


def TestHong(str):
    # if str == ""
    #     data = "mclab.hufs.ac.kr"
    # else str == "google"  # test
    #     data = "http://www.google.com"
    return str
#########################################################################################

from konlpy.tag import Hannanum

mecab = Hannanum()  # note: despite the name, this is a Hannanum instance
sen = '인스타그램에서 야식음식에서 간장을 제외하고 후라이드는 해쉬태그'

#print(mecab.pos(sen))
def first(sen):
    # expects pos()-style (morpheme, tag) pairs, e.g. first(mecab.pos(sen))
    sub = dict(sen)

    li = []

    for key, value in sub.items():
        if value == 'N':
            sub.items()
            li.append(key)
        elif value == 'F':
            li.append(key)
    reject(li)
Example No. 29
import time
start = time.time()
from konlpy.tag import Hannanum
doc = input("문장을 입력해주세요 : ")  ## read the input sentence ("Please enter a sentence")
n = input("입력문장과 유사한 몇개의 문장을 출력할까요? : ")  ## how many similar sentences to print
print(" ")
hannanum = Hannanum()  ## use konlpy's Hannanum class for morphological analysis
doc_tokenized = hannanum.morphs(doc)  ## split the input sentence into morphemes
doc_tokenized_size = len(doc_tokenized)  ## number of morphemes in the input sentence

list = []

with open('KCCq28.txt', 'r', encoding='utf-8') as input:  # open the corpus to compare against
    for line in input:  ## read the corpus line by line
        file_tokenized = hannanum.morphs(line)  ## split the line into morphemes
        file_tokenized_size = len(file_tokenized)  ## number of morphemes in this line
        intersection_size = 0  ## reset the shared-morpheme count
        for x in doc_tokenized:
            if x in file_tokenized:
                intersection_size += 1  ## count morphemes shared between the input sentence and this line
        if len(doc) <= len(line):
            short = doc_tokenized_size
        else:
            short = file_tokenized_size  ## short = morpheme count of the shorter of the input sentence and this line
        similarity = float(intersection_size) / float(short)  ## shared morphemes / morphemes of the shorter sentence = similarity
        list.append([line, similarity * 100])  ## store the sentence and its similarity (in %) to the input sentence

    sorted_list = sorted(
        list,
Example No. 30
sentence = u'내년도 최저임금을 기존 방식대로 전체 업종에 동일하게 적용하기로 결정했다.\
최저임금의 업종별 차등 적용을 요구해온 사용자위원들은 이에 반발해 전원회의에서 퇴장했다.\
최저임금위원회 사용자위원들은 이날 오후 정부세종청사에서 열린 최저임금위원회 제5차 전원회의 도중 퇴장해 기자들과 만나 \
"금일 최저임금위원회는 최저임금 고시에 월 환산액을 병기하고 2020년 최저임금을 모든 업종에 동일하게 적용하기로 결정했다"고 밝혔다.'
sentences = [sentence] * 10000

import time
from konlpy.tag import Hannanum, Kkma, Komoran, Okt, Mecab
from khaiii import KhaiiiApi
api = KhaiiiApi()
morphs_processors= [('Hannanum', Hannanum()), ('Kkma', Kkma()), ('Komoran', Komoran()), ('Okt', Okt()), ('mecab', Mecab())]
for name, morphs_processor in morphs_processors:
    start_time = time.time()
    morphs = [morphs_processor.pos(sentence) for sentence in sentences]
    elapsed_time = time.time() - start_time
    print('morphs_processor name = %20s, %.5f secs' % (name, elapsed_time))
start_time = time.time()
morphs = [api.analyze(sentence) for sentence in sentences]
elapsed_time = time.time() - start_time
print('morphs_processor name = %20s, %.5f secs' % ('khaiii', elapsed_time))