if len(data)>2: KNU_df.loc[count,"ngram"] = " ".join(data[:-1]) else: KNU_df.loc[count, "ngram"] = data[0] KNU_df.loc[count,"sentiment"] = data[-1] count+=1 print(KNU_df) KNU_df.to_csv("KNU_lexicon.tsv",encoding="utf8",sep="\t", index=False) ''' KNU_df = pd.read_csv("KNU_lexicon.tsv", encoding="utf8", sep="\t") #pos_tagger = Kkma() #print(pos_tagger.pos(u"함찬")) #KNU_df["ngram"].apply(pos_tagger.pos) print(KNU_df) data = ";".join(SNU_df["ngram"]).split(";") data = list(map((lambda x: list(x.split("/"))), data)) pos = sorted(set(np.array(data)[:, -1])) kkma = sorted(set(Kkma().tagset.keys())) komoran = sorted(set(Komoran().tagset.keys())) hannanum = sorted(set(Hannanum().tagset.keys())) print(len(pos), "+", len(kkma), "=", len(pos + kkma) - len(set(pos + kkma))) print(len(pos), "+", len(komoran), "=", len(pos + komoran) - len(set(pos + komoran))) print(len(pos), "+", len(hannanum), "=", len(pos + hannanum) - len(set(pos + hannanum)))
from konlpy.tag import Hannanum from collections import Counter import matplotlib.pyplot as plt from wordcloud import WordCloud file_name = input("엑셀 파일 이름을 확장자 제외하고 넣으세요 \n") number = int(input("상위 n개의 단어를 워드클라우드로 만듭니다. n을 넣으세요. ex)10 \n" )) f = open(RESULT_PATH + file_name + '.txt', 'r', encoding="UTF-8") #다시 읽기 data = f.read() try: engin = Hannanum() nouns = engin.nouns(data) nouns = [ n for n in nouns if len(n) > 1 ] count = Counter(nouns) tags = count.most_common(number) print(tags) wordcloud = WordCloud(font_path="C:/Windows/Fonts/malgun.ttf", background_color='white', width=640, height=480) wordcloud.generate_from_frequencies(dict(tags)) fig = plt.figure() plt.axis('off') plt.imshow(wordcloud)
def get_tags(text, ntags=50, multiplier=10):
    """Extract the most frequent nouns of *text* as tag-cloud entries.

    Each entry is a dict carrying a (randomly chosen) 'color', the noun
    itself under 'tag', and a 'size' proportional to its frequency.

    :param text: raw text to analyse
    :param ntags: number of top nouns to return
    :param multiplier: scale factor applied to each frequency
    """
    tagger = Hannanum()
    frequencies = Counter(tagger.nouns(text))
    entries = []
    for noun, freq in frequencies.most_common(ntags):
        entries.append({'color': color(), 'tag': noun, 'size': freq * multiplier})
    return entries
# 설치한 matplotlib 외부라이브러리의 pyplot 기능을 사용하며, pyplot 기능을 약어로 plt로 정의 import matplotlib.pyplot as plt # 설치한 konlpy 외부라이브러리로부터 Hannanum 기능 사용하도록 설정 from konlpy.tag import Hannanum # 문자열(문장) 수정을 위한 파이썬 기본 기능 추가 import re # 분석 대상 문장 / 단어별 구분자 스페이스(공백 한칸)를 이용함 text = open("contents.txt", encoding="UTF-8").read() print(text) # myHannanum 변수에 Hannanum 기능을 넣어줌 myHannanum = Hannanum() # 단어 분석의 정확도를 높이기 위해 특수문자 제거 # 특수문자는 키보드 상단 숫자패드의 특수문자가 발견되면 한칸 공백으로 변경 replace_text = re.sub("[!@#$%^&*()_+]", " ", text) # 특수문자가 제거된 문장 출력 print(replace_text) # 명사 분석 결과는 여러 단어들이 저장된 배열형태로 데이터를 생성하기 때문에 배열을 문자열로 변경하기위해 # join 함수를 사용하며, analysis_text 변수에 문자열로 변환된 결과를 저장함 analysis_text = (" ".join(myHannanum.nouns(replace_text))) # stopwords 변수에 원하지 않는 단어들 추가 stopwords = set(STOPWORDS) # stopwords.add("분석")
from konlpy.tag import Hannanum
import numpy as np
import keras.preprocessing.text

# Output dimension used when feature-hashing tokens
hashing_dimension = 1000

# bad programming practice lol
global_hannanum = Hannanum()


def tokenize(text):
    # POS-tag *text* and emit one "morpheme/TAG" string per morpheme
    return list(p[0] + "/" + p[1] for p in global_hannanum.pos(text))


def read_data_file(data_filename):
    # Read every row of the data file, parse each row, split into (X, y)
    # and hash both sides into fixed-size representations.
    # NOTE(review): divide_to_xy and hash_data are defined elsewhere in the file
    rows = []
    with open(data_filename, 'r') as file:
        rows = file.readlines()
    parsed = list(map(parse_row, rows))
    (X, y) = divide_to_xy(parsed)
    return (hash_data(X), hash_data(y))


# returns a tuple of form
#  sbj entity
#  obj entity
#  relation
#  original source sentence (ie. sbj and obj tags replaced) tokenized
def parse_row(data_row):
    # NOTE(review): this definition continues beyond this excerpt
    parts = data_row.split("\t")
@author: Ri
"""
from konlpy import data  ###how to update user dictionary
from konlpy.tag import Kkma, Hannanum
from konlpy.utils import pprint
from collections import Counter
import pandas as pd
import numpy as np
import os
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

os.getcwd()
os.chdir("C:/Users/Ri/Dropbox/paper/python_result")

h = Hannanum()  # Hannanum dictionary (KAIST)
k = Kkma()      # Kkma morphological analyzer (Seoul National University IDS)


def mergingNews(filepath):
    # Merge every news CSV found under *filepath* into a single DataFrame,
    # normalising the datetime to its date part, stripping a boilerplate
    # flash-workaround string from bodies, and recording the source file.
    news = pd.DataFrame()
    for filename in os.listdir(filepath):
        print(filename)
        loc = filepath + '/' + filename
        f = pd.read_csv(loc, encoding='UTF8')
        # keep only the first 10 characters (the date) of the timestamp
        f['datetime'] = f['datetime'].apply(lambda x: x[:10])
        # drop the injected flash-workaround script text from article bodies
        f['body'] = f['body'].map(lambda x: x.replace(u"// flash 오류를 우회하기 위한 함수 추가 function _flash_removeCallback() {}", ""))
        # title and body combined into one searchable column
        f['contents'] = f['title'].map(str) + "/" + f['body']
        # source file name without the ".csv" extension
        f['filename'] = filename[:-4]
        news = news.append(f)
    # news.to_csv("news.csv")
# 종성 범위 밖에 있는 것들은 end_char로 메꿔준다. if jongsung_index == 0: jongsung = end_char result.append(chosung) result.append(joongsung) result.append(jongsung) return "".join(result) # khaiii khaiii = KhaiiiApi() def khaiii_tokenize(text): tokens = [] for word in khaiii.analyze(text): tokens.extend([str(m).split('/')[0] for m in word.morphs]) return tokens # konlpy tokenizers mecab = Mecab().morphs okt = Okt().morphs komoran = Komoran().morphs hannanum = Hannanum().morphs # 오류 발생 kkma = Kkma().morphs def space_tokenizer(text): return text.split(' ') def char_tokenizer(text): return [t for t in text]
def wordcloud(self, period, topic): """ 한 기간의 댓글 모음을 이용하여 빈도수로 워드 클라우드를 생성하여 반환 :return: wordcloud 이미지 """ # 일단 텍스트 파일에서 가져오는 걸로 해놈 text = '' THIS_FOLDER = os.path.dirname(os.path.abspath(__file__)) file_reactions = os.path.join(THIS_FOLDER, "%s_%d.txt" % (topic, period)) # with open(os.path.join(settings.BASE_DIR, file_reactions), encoding="utf-8") as f: with open(file_reactions, encoding="utf-8") as f: text = f.read() #first_char = f.read(1) if not text: print("file is empty") # 임시 텍스트 파일 삭제 os.remove(file_reactions) return else: print("file is full") # 한나눔 사용 변수 지정 hannanum = Hannanum() # 명사 분석 nouns = hannanum.nouns(text) # print(nouns) # midle_time = time.time() # 명사 중에 길이가 2이상일때만 words = [] for n in nouns: if len(n) > 1: words.append(n) # 빈도수 계산 count = Counter(words) #count = Counter(nouns) # 상위 100건 추출 most = count.most_common(100) # 딕셔너리 구성 tags = {} for n, c in most: tags[n] = c # 워드 클라우드 생성 print(topic) file_name = (topic + "_%d" % self.model_topic.id + "_%d.png" % period) # topic 2개이면 파일명 'topic1_topic2_10_1.png' mask = np.array(Image.open(os.path.join(THIS_FOLDER, 'mask_img_512px.png'))) wc_image_gen = wc(font_path=os.path.join(THIS_FOLDER, 'NanumSquareEB.ttf'), mask=mask, background_color='white' ).generate_from_frequencies(tags) # wordcloud 용 댓글 하나도 없을 경우.. wc_image_gen.to_file(os.path.join(THIS_FOLDER,file_name)) # 경로에 이미지 파일 생성 #file_reactions = os.path.join(THIS_FOLDER, "%s_%d.txt" % (topic, period)) with open(os.path.join(THIS_FOLDER, file_name), 'rb') as tmp_file: # tmp_wc = WordCloud(period=period, topic=self.model_topic) # get해서 해야됨. -> django db 에 저장 tmp_wc = WordCloud.objects.filter(topic=self.model_topic, period=period).first() tmp_wc.wcphoto.save(file_name, File(tmp_file)) # 임시 이미지 파일 삭제 os.remove(os.path.join(THIS_FOLDER, file_name)) # 임시 텍스트 파일 삭제 os.remove(file_reactions) """
from konlpy.tag import Mecab, Okt, Komoran, Hannanum, Kkma
from khaiii_pos.khaiii_pos import Khaiii
import json

# Every tagger under comparison, keyed by its display name
tokenizers = {
    'Mecab': Mecab(),
    'Okt': Okt(),
    'Komoran': Komoran(),
    'Hannanum': Hannanum(),
    'Kkma': Kkma(),
    'Khaiii': Khaiii()
}

# KorQuAD v1.0 training set; the JSON carries article paragraphs under
# data -> paragraphs -> context
corpus_fname = 'raw/korquad/KorQuAD_v1.0_train.json'
with open(corpus_fname) as f_corpus:
    dataset_json = json.load(f_corpus)
    dataset = dataset_json['data']
    for i, article in enumerate(dataset):
        # w_lines = []
        # only compare on the first 11 articles
        if i > 10:
            break
        for paragraph in article['paragraphs']:
            print('--------------------------------------')
            print('[ORIGINAL] ', paragraph['context'])
            # print each tagger's POS output for the same paragraph
            for key, tokenizer in tokenizers.items():
                print('[%8s] %s' % (key, tokenizer.pos(paragraph['context'])))
class konlpy(): os.environ[ "NLS_LANG"] = ".AL32UTF8" # UTF-8 : .AL32UTF8, CP949 : .KO16MSWIN949 con = pymongo.MongoClient("localhost", 27017)['jcjc'] # conn = cx_Oracle.connect('bigdata/admin1234@localhost:1521/xe') # oracle 서버와 연결 (connection 맺기) # userdic : 단어 추출에서 제외할 목록들이 들어갈 단어사전 komoran = Komoran( userdic='C:\\Project\\morphemeAnalysis\\com\\test\\user_dic.txt') hannanum = Hannanum() def analyze(self): count = 0 for item in self.con['bill'].find( {"$or": [{ "proposer": "이동섭" }, { "proposer": "유승민" }]}): #, {"analysisCheck":"0"}): count += 1 billid = item.get('bill_id') billname = item.get('bill_name') summary = item.get('summary') # komoran은 빈줄이 있으면 에러가 남 summary = summary.replace("\r", "").replace("\n", " ").replace("\t", " ") summary = summary.replace("?", "ㆍ").replace(",", "") print(count, "번째") print(billname) print(summary) # print(item.get('summary').replace("\n", " ").replace("?", "ㆍ")) # 명사 추출 nouns = self.komoran.nouns(summary) print("len(nouns) :", len(nouns)) cnt = Counter(nouns) result_list = cnt.most_common(len(nouns)) print("len(result_list) :", len(result_list)) # List 객체인 결과를 Dict로 변경 result_dict = {} # Dict 객체 생성 for i in range(len(result_list)): key = result_list[i][0] # 단어 value = result_list[i][1] # count result_dict[key] = value # pprint(result_dict) row = {} # Dict 객체 생성 row['bill_no'] = item.get('bill_no') row['politician_no'] = item.get('politician_no') row['words'] = result_dict pprint(row) self.con['billAnalysis'].insert_one(row) print("==========================================================") print("요시! 입력 완료!")
if a==wd.execute_script('return document.querySelector("div.css-111v2fw > div.css-0").scrollHeight') : break a=wd.execute_script('return document.querySelector("div.css-111v2fw > div.css-0").scrollHeight') except : break end=time.time() print((end-start)/60) html_src = wd.page_source soup=BeautifulSoup(html_src, 'html.parser') comment=soup.find(class_='css-111v2fw').find_all(class_='css-cqdvbr') extractor = Hannanum() # 얘를 써서 word 지도를 만들자! nouns = [] # 전처리 for i in comment : if len(i.text) > 1 : if ('더보기' in i.text) & ('삭제되었습니다' not in i.text) : # print(i.text[:-3], end='\n\n') nouns.extend(extractor.nouns(i.text)) elif '삭제되었습니다' not in i.text: # print(i.text, end='\n\n') nouns.extend(extractor.nouns(i.text))
def calc_cfd(doc):
    """Build a conditional frequency distribution over word bigrams.

    *doc* is POS-tagged with Hannanum, the surface forms are paired into
    bigrams, and an nltk ConditionalFreqDist of those bigrams is returned.
    """
    surface_forms = [word for word, _tag in Hannanum().pos(doc)]
    return nltk.ConditionalFreqDist(nltk.bigrams(surface_forms))
def resultControl():
    # Grade a user's recorded pronunciation of a sentence against its
    # standard form: run STT on the uploaded wav, compare phoneme by
    # phoneme, persist the per-phoneme error counters and the score, and
    # respond with the score, wrong words/indexes and blank positions.
    if(request.method == 'POST'):
        # the client sends the sentenceId & a wav file
        wav = request.files['receiveFile']
        filename = request.form['fileName']
        sentenceId = request.form['sentenceId']
        # save it into the upload directory
        wav.save(FILE_DIRECTORY + secure_filename(wav.filename))
        ##### convert the uploaded file to text with STT
        # temporary path
        args = easydict.EasyDict({"local_file_path": "./uploadFile/"+filename})
        # TODO Credential Error
        # print(sample_recognize(args.local_file_path))
        # fetch the standard pronunciation text from the DB via sentenceId
        Pick_sentence = db_session.query(Sentence).filter(Sentence.sentenceId == sentenceId).first()
        receiveSTTData = sample_recognize(args.local_file_path)
        receiveData = similaritySentence(receiveSTTData, Pick_sentence.standard)
        #receiveData = "날시가 참 말따"
        print("STT result : ", receiveData)
        # print(Pick_sentence)
        ##### the analysis algorithm
        hannanum = Hannanum()
        StandardSentence = Pick_sentence.standard
        sentenceData = Pick_sentence.sentenceData
        # build the lists of blank (space) indexes;
        # the blank counts must match between the two sentences
        userBlankList = [i for i, value in enumerate(receiveData) if value == " "]
        standardBlankList = [i for i, value in enumerate(StandardSentence) if value == " "]
        # print(BlankList)
        # when the string lengths or the blank counts differ,
        # ask the client to retry
        if (len(receiveData) != len(StandardSentence) or len(userBlankList) != len(standardBlankList)):
            os.remove("./uploadFile/"+filename)
            return jsonify(
                status="failure",
                resultData=receiveData,
                errorMessage="repeat",
            )
        # strip the blanks
        UserSentence = receiveData.replace(" ", "")
        StandardSentence = StandardSentence.replace(" ", "")
        sentenceData = sentenceData.replace(" ", "")
        # print(UserSentence)
        # print(StandardSentence)
        Total_pho = 0  # total phoneme count
        Wrong_total_pho = 0  # wrong phoneme count
        Wrong_word_index_list = []  # indexes of the mismatching characters
        Wrong_word_list = []  # the mismatching words
        # wrong phonemes per position: u = onset, m = vowel, b = coda
        Wrong_pho_dict = {'u' : {},
                          'm' : {},
                          'b' : {}}
        for index, standard in enumerate(StandardSentence):
            StandPho = phonemeConvert(standard)
            # the characters match
            if(UserSentence[index] == standard):
                # 2 phonemes without a coda, 3 with one
                if(StandPho[2] == ' '):
                    Total_pho += 2
                else:
                    Total_pho += 3
            # the characters differ: decompose into phonemes
            else:
                Wrong_word_index_list.append(index)
                UserPho = phonemeConvert(UserSentence[index])
                SentencePho = phonemeConvert(sentenceData[index])
                if(UserPho[2] == ' ' and StandPho[2] == ' '):
                    Total_pho += 2
                else:
                    Total_pho += 3
                # compare coda / onset / vowel and tally each mismatch
                if(UserPho[2] != StandPho[2]):
                    Wrong_total_pho += 1
                    if StandPho[2] != ' ':
                        if StandPho[2] in Wrong_pho_dict['b']:
                            Wrong_pho_dict['b'][SentencePho[2]] += 1
                        else:
                            Wrong_pho_dict['b'][SentencePho[2]] = 1
                if (UserPho[0] != StandPho[0]):
                    Wrong_total_pho += 1
                    if StandPho[0] in Wrong_pho_dict['u']:
                        Wrong_pho_dict['u'][SentencePho[0]] += 1
                    else:
                        Wrong_pho_dict['u'][SentencePho[0]] = 1
                if (UserPho[1] != StandPho[1]):
                    Wrong_total_pho += 1
                    if StandPho[1] in Wrong_pho_dict['m']:
                        Wrong_pho_dict['m'][SentencePho[1]] += 1
                    else:
                        Wrong_pho_dict['m'][SentencePho[1]] = 1
        # print(Wrong_pho_dict)
        ######### bump the wrong-phoneme counters in the record table -> TEST SUCCESS
        for type in Wrong_pho_dict:
            for pho in Wrong_pho_dict[type]:
                # print(pho)
                updateData = db_session.query(Record).filter(Record.recordType == type)\
                    .filter(Record.recordData == pho).first()
                # print(updateData.type, updateData.recordData)
                updateData.count += Wrong_pho_dict[type][pho]
                db_session.commit()
        # the match rate
        Correct_rate = round(1 - (Wrong_total_pho / Total_pho), 4)
        """
        # 일치율 100%인 경우
        if Correct_rate == 1:
            os.remove("./uploadFile/" + filename)
            return jsonify(
                status="perfect",
                resultData=receiveData,
                score=Correct_rate,
            )
        """
        # print(Wrong_word_list)
        # after the change
        # print(Wrong_word_index_list)
        sentenceData_split = Pick_sentence.sentenceData.split()
        # print(sentenceData_split)
        # pick the words that contain a wrong character index
        word_start_point = 0
        for sentence_word in sentenceData_split:
            word_end_point = word_start_point + len(sentence_word)-1
            # print(word_start_point, word_end_point)
            for wrong_index in Wrong_word_index_list:
                if word_start_point <= wrong_index and word_end_point >= wrong_index:
                    word_to_pos = hannanum.pos(sentence_word)
                    # print(word_to_pos)
                    wrong_word_pho_list = phonemeConvert(sentenceData[wrong_index])
                    # print(wrong_word_pho_list)
                    for pos in word_to_pos:
                        #TODO what if the wrong word contains several N or P morphemes??
                        if pos[1] == 'N' or pos[1] == 'P':
                            for pos_word in pos[0]:
                                pos_word_pho_list = phonemeConvert(pos_word)
                                # print(pos_word_pho_list)
                                if wrong_word_pho_list[0] == pos_word_pho_list[0]:
                                    Wrong_word_list.append(pos)
                                    break
            word_start_point += len(sentence_word)
        print(Wrong_word_list)
        # shift the wrong-character indexes back onto the original
        # (space-containing) sentence
        for i in userBlankList:
            for index, j in enumerate(Wrong_word_index_list):
                if(j >= i):
                    Wrong_word_index_list[index] += 1
        # print(Wrong_word_index)
        ######## store the score into the result table -> TEST SUCCESS
        resultData = Result(stid=sentenceId, rsdata=receiveData, score=Correct_rate)
        db_session.add(resultData)
        db_session.commit()
        # when the match rate is 100%
        if Correct_rate == 1:
            os.remove("./uploadFile/" + filename)
            return jsonify(
                status="perfect",
                resultData=receiveData,
                score=Correct_rate,
            )
        ######## one recommended sentence for the most-missed word
        recommend_OtoD = dict(sentenceId=-1, sentenceData="", standard="")
        recommend_word = ""
        # when the wrong-word list is non-empty
        if Wrong_word_list:
            random.shuffle(Wrong_word_list)
            for random_select_word in Wrong_word_list:
                Word_query = db_session.query(Word).filter(Word.wordData == random_select_word[0])\
                    .filter(Word.wordType == random_select_word[1]).filter(Word.sentenceId != sentenceId)
                Word_entry = [pq.sentenceId for pq in Word_query]
                if Word_entry:
                    recommend_word = random_select_word[0]
                    # verbs ('P') get the 다 ending appended for display
                    if random_select_word[1] == 'P':
                        recommend_word += '다'
                    random_select_setencdId = random.choice(Word_entry)
                    Recommend_sentence = db_session.query(Sentence).filter(Sentence.sentenceId == random_select_setencdId).first()
                    recommend_OtoD['sentenceId'] = Recommend_sentence.sentenceId
                    recommend_OtoD['sentenceData'] = Recommend_sentence.sentenceData
                    recommend_OtoD['standard'] = Recommend_sentence.standard
                    break
        os.remove("./uploadFile/" + filename)
        # the wrong-word list for the response
        wordList = []
        for w in Wrong_word_list:
            if w[1] == "P":
                wordList.append(w[0]+"다")
            else:
                wordList.append(w[0])
        # bundle all result data into one json response
        return jsonify(
            status = "success",
            score = Correct_rate,
            wordList = wordList,
            userBlank = userBlankList,
            standardBlank = standardBlankList,
            wrongIndex = Wrong_word_index_list,
            resultData = receiveData
        )
from konlpy import data ###how to update user dictionary from konlpy.tag import Kkma, Hannanum from konlpy.utils import pprint from collections import Counter import pandas as pd import numpy as np import os from scipy.spatial.distance import cosine import matplotlib.pyplot as plt from datetime import datetime from sklearn.decomposition import TruncatedSVD from sklearn.preprocessing import Normalizer os.getcwd() os.chdir("C:/Users/Ri/Dropbox/paper/python_result") h = Hannanum() #한나눔 사전사용 #kkma=Kkma() #Read text files and generate term documnet matrix(tdm) def buildTDM(path): tdm_f=pd.DataFrame({'Term':[]}) for filename in os.listdir(path): #Read text files loc = path + '/' + filename f = open(loc, 'rt',encoding='UTF8') contents = f.read()
from collections import Counter

from konlpy.corpus import kolaw
from konlpy.tag import Hannanum
from konlpy.utils import concordance, pprint
from matplotlib import pyplot


def draw_zipf(count_list, filename, color='blue', marker='o'):
    # Plot the counts, largest first, on log-log axes (a Zipf plot)
    # and save the figure to *filename*.
    sorted_list = sorted(count_list, reverse=True)
    pyplot.plot(sorted_list, color=color, marker=marker)
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.savefig(filename)


# Morpheme statistics over the bundled Korean constitution corpus
doc = kolaw.open('constitution.txt').read()
pos = Hannanum().pos(doc)
cnt = Counter(pos)

print('nchars :', len(doc))
print('ntokens :', len(doc.split()))
print('nmorphs :', len(set(pos)))
print('\nTop 20 frequent morphemes:')
pprint(cnt.most_common(20))
print('\nLocations of "대한민국" in the document:')
concordance(u'대한민국', doc, show=True)

# Zipf plot of the morpheme frequencies
draw_zipf(cnt.values(), 'zipf.png')
from .models import Log
from konlpy.tag import Hannanum
import re

# Module-level Hannanum tagger, created once and shared by this module
hnn = Hannanum()
def hannanum_instance():
    """Start the JVM (if it is not already running) and return a fresh
    Hannanum tagger instance."""
    from konlpy import init_jvm
    from konlpy.tag import Hannanum

    init_jvm()
    return Hannanum()
def __init__(self):
    # Create one instance of each konlpy morphological analyzer used by
    # this class, plus an (initially empty) buffer of collected articles.
    self.komor = Komoran()    # Komoran tagger
    self.hannan = Hannanum()  # Hannanum tagger
    self.kkma = Kkma()        # Kkma tagger
    self.okt = Okt()          # Okt tagger
    self.news_list = []       # collected news items, filled elsewhere
from homi import Service
from konlpy.tag import Hannanum

from .utils import attachThreadToJVM, check_options
from konlpy_homi.api.v0alpha import hannanum_pb2
from typed import StringArrayResponse, TupleDoubleArrayResponse, TupleArrayResponse

# The only option keys this service accepts
HANDLE_OPTIONS = {"ntags", "flatten", "join"}


def hannanum_option_checker(options: dict):
    # Reject any option key outside HANDLE_OPTIONS
    check_options(HANDLE_OPTIONS, options)


# Single shared Hannanum engine used by every request handler
engine = Hannanum()

hannanum_svc = Service(hannanum_pb2._HANNANUM)


@hannanum_svc.method()
@attachThreadToJVM
def Pos(payload: str, options: dict = None, **kwargs) -> TupleArrayResponse:
    # gRPC handler: POS-tag *payload* with the shared Hannanum engine.
    options = options or {}
    hannanum_option_checker(options)
    resp = TupleArrayResponse(options=options, results=[])
    if options.get("join", False):
        # joined mode: engine.pos returns "morph/tag" strings, so the tag
        # field is not populated separately
        resp['results'] = [{
            'keyword': keyword,
            'tag': None
        } for keyword in engine.pos(payload, **options)]
    else:
        # NOTE(review): this branch continues beyond this excerpt
# python version 3.5 # requirement library # jpype1 version 0.6.2 # konlpy version 0.4.4 # system # Linux Mint 18.1 Serena # Linux version 4.4.0-53-generic (buildd@lcy01-28) (gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.4) ) #74-Ubuntu SMP Fri Dec 2 15:59:10 UTC 2016 # java version "1.7.0_80" # Java(TM) SE Runtime Environment (build 1.7.0_80-b15) # Java HotSpot(TM) 64-Bit Server VM (build 24.80-b11, mixed mode) from konlpy.tag import Hannanum from konlpy.tag import Twitter H = Hannanum() T = Twitter() filter_pronouns = [ '나', '너', '우리', '저', '저희', '그', '그녀', '그것', '것', '자기', '자네', '누구', '누구나', '아무', '아무나', '내' ] def filtering_data(data: str, filter: list): ''' :param data: text data (type : str) filter : don't want data (If filter doesn't have pronouns, filtered text list has pronouns. So I recommend filter = filter + scaling.filter_pronouns :return: filtered text list (type : str list)
def _count(text):
    """Return a Counter mapping each noun Hannanum finds in *text* to
    its number of occurrences."""
    return Counter(Hannanum().nouns(text))
# -*- coding: utf-8 -*- from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import NMF from konlpy.tag import Hannanum from konlpy.utils import pprint # TF-IDF로서 처리될 때 # 한 콜에 대한 분석시에만 sentence 분리가 의미가 있다. # 여러 콜. 즉, 1000콜 정도의 데이터 분석이면 콜 전체가 하나의 sentence로서 처리되면 된다. from nltk.tokenize import sent_tokenize from time import time tagger = Hannanum() tag_filter = ['NC', 'NQ', 'NB', 'NN', 'NP', 'PV', 'F'] # tagging and tokenizer def _tokenizer(text): tokens = [ '/'.join(tagged_word) for tagged_word in tagger.pos(text, ntags=22) if tagged_word[1] in tag_filter ] return tokens sents = [] # document 전체를 담을 변수 for i in range(10): filename = "../data/news_data/news" + str(i) + ".txt" f = open(filename, "r")
def __init__(self) -> None:
    # Initialize the parent class, then attach a Hannanum tagger for the
    # morphological analysis done elsewhere in this class.
    super().__init__()
    self.hannanum = Hannanum()
def root():
    """Fetch every chat message from the chat server, strip stopwords and
    punctuation, and render a noun-frequency word cloud plus a bar chart
    of the 50 most common nouns.

    Side effects: writes static/wordcloud.png and static/plot.png.
    Returns the rendered 'wordcloud.html' template.
    """
    url = 'http://chatsvr.run.goorm.io/message/@all'
    response = urllib.request.urlopen(url).read().decode('utf-8')
    json_return = json.loads(response)
    data = DF(json_return)

    # whitespace-split all message bodies into one flat token list
    li = ' '.join(data['text']).split()

    # load the stopword list (one word per line); close the file when done
    with open("static/stopwords.txt") as file:
        stop = set(line.strip() for line in file.readlines())

    result = [word for word in li if word not in stop]

    li2 = ' '.join(result)
    # BUG FIX: the original class '[!@#$%^&*()-+[]{}♥:;?/]' was terminated by
    # the first ']' and therefore almost never matched; '-', '[' and ']' must
    # be escaped inside the character class
    li3 = re.sub(r'[!@#$%^&*()\-+\[\]{}♥:;?/]', '', li2)
    # drop bare jamo, digits and latin letters
    li3 = re.sub('[ㄱ-ㅎㅏ-ㅣ0-9a-zA-Z]', '', li3)
    li3 = re.sub('[♥&;#]', '', li3)

    t = Hannanum()
    key = t.nouns(li3)
    # BUG FIX: the original popped from `key` while enumerating it, which
    # skips the element right after each removal; filter into a new list
    key = [noun for noun in key if len(noun) >= 2]

    count = Counter(key)
    tags = count.most_common(50)

    # render the word cloud image
    font_path = 'static/NotoSans-Black.otf'
    wordcloud = WordCloud(font_path=font_path,
                          background_color="white",
                          width=800,
                          height=800).generate_from_frequencies(dict(tags))
    fig = plt.figure(figsize=(6, 6))
    plt.imshow(wordcloud)
    plt.axis('off')
    fig.savefig('static/wordcloud.png')

    # render the frequency bar chart with plotly
    data2 = DF(tags)
    data2.columns = ['sym', 'freq']
    trace1 = go.Bar(x=data2['sym'], y=data2['freq'])
    dataset = [trace1]
    layout = go.Layout()
    fig2 = go.Figure(data=dataset, layout=layout)
    fig2.write_image("static/plot.png", width=800, height=800)

    # `time` busts the browser cache of the regenerated images
    return render_template('wordcloud.html', time=time.time())
# Fragment: the enclosing loop/setup lies outside this excerpt.
for tag in com_date_info:
    # drop the <a> and <span> children so only the date text remains
    a = tag.select('a')
    for ext_tag in a:
        ext_tag.extract()
    span = tag.select('span')
    for ext_tag in span:
        ext_tag.extract()
    date_info.append(tag.getText().strip())
# second <dd> of the article's <dl> carries the source; commas are CSV-unsafe
source = ar.select_one("dl > dd:nth-of-type(2)").text
source = source.replace(",", "")
#print(title.text.replace(",", "") +'\n' +com_info +" / "+ date_info[0] +'\n' +title.attrs['href']+ "\n"+ ":" + source + '\n')
# accumulate one text record per article: title / company / date / url / source
han1 += (title.text.replace(",", "") + '\n' + com_info + " / " + date_info[0] + '\n' + title.attrs['href'] + "\n" + ":" + source + '\n')

# extract nouns from the accumulated text and join them for WordCloud
han = Hannanum()
text = han.nouns(han1)
text = " ".join(text)
# masked word cloud rendered with a Korean font
mask = np.array(Image.open('./data/back.png'))
wordcloud = WordCloud(
    width=6000,
    height=3000,
    font_path='/usr/share/fonts/truetype/nanum/NanumGothic_Coding.ttf',
    max_font_size=100,
    background_color='white',
    mask=mask).generate(text)
fig = plt.figure(figsize=(60, 30))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('cloud.png')
def token(request):
    # Tokenize three documents: extract stopword-filtered Korean nouns into
    # docN_ko.txt, append lowercased English tokens back onto the source
    # files (once), and render both results. Redirects home when any of the
    # input files is missing.
    try:
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc1.txt',
                'r',
                encoding='UTF-8') as r:
            doc1 = r.readlines()
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc2.txt',
                'r',
                encoding='UTF-8') as r:
            doc2 = r.readlines()
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc3.txt',
                'r',
                encoding='UTF-8') as r:
            doc3 = r.readlines()
        # jamo + precomposed hangul syllable ranges
        hangul = re.compile('[\u3131-\u3163\uac00-\ud7a3]+')
        doc1_ko = hangul.findall(doc1[0])  # extract the Korean text
        doc2_ko = hangul.findall(doc2[0])
        doc3_ko = hangul.findall(doc3[0])
        str1 = ''  # list -> str
        str2 = ''
        str3 = ''
        for x in doc1_ko:
            str1 += x + ' '
        for x in doc2_ko:
            str2 += x + ' '
        for x in doc3_ko:
            str3 += x + ' '
        hannanum = Hannanum()  # choose the analyzer
        doc1_ko_nouns_list = hannanum.nouns(str1)
        doc2_ko_nouns_list = hannanum.nouns(str2)
        doc3_ko_nouns_list = hannanum.nouns(str3)
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/stop_word_ko.txt',
                'r',
                encoding='UTF-8') as r:
            stopword_ko = r.readlines()
        stwd = []
        for x in stopword_ko:  # strip the trailing \n
            stwd.append(x.replace('\n', ''))
        # remove the Korean stopwords
        # NOTE(review): removing from a list while iterating it skips the
        # element right after each removal, so consecutive stopwords survive
        for x in doc1_ko_nouns_list:
            if x in stwd:
                doc1_ko_nouns_list.remove(x)
        for x in doc2_ko_nouns_list:  # remove the Korean stopwords
            if x in stwd:
                doc2_ko_nouns_list.remove(x)
        for x in doc3_ko_nouns_list:  # remove the Korean stopwords
            if x in stwd:
                doc3_ko_nouns_list.remove(x)
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/doc1_ko.txt',
                'w',
                encoding='UTF-8') as w:
            for x in doc1_ko_nouns_list:  # save the Korean nouns
                if x == '.' or x == ',':
                    continue
                w.write(x + ' ')
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/doc2_ko.txt',
                'w',
                encoding='UTF-8') as w:
            for x in doc2_ko_nouns_list:  # save the Korean nouns
                if x == '.' or x == ',':
                    continue
                w.write(x + ' ')
        with open(
                'C:\\Users\osk\DjangoProject\info_search\oper\data/doc3_ko.txt',
                'w',
                encoding='UTF-8') as w:
            for x in doc3_ko_nouns_list:  # save the Korean nouns
                if x == '.' or x == ',':
                    continue
                w.write(x + ' ')
        doc1_en = hangul.sub('', doc1[0])  # strip the Korean before analysing the English
        doc2_en = hangul.sub('', doc2[0])
        doc3_en = hangul.sub('', doc3[0])
        token_doc1 = word_tokenize(doc1_en)  # tokenize the English only
        token_doc2 = word_tokenize(doc2_en)
        token_doc3 = word_tokenize(doc3_en)
        # only append the tokenized text when the file was read as a single
        # line, i.e. the tokens were not already appended on a previous run
        if len(doc1) == 1:
            with open(
                    'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc1.txt',
                    'a',
                    encoding='UTF-8') as w:
                w.write('\n')
                for x in token_doc1:
                    if x == '.' or x == ',':  # drop periods and commas
                        continue
                    w.write(x.lower() + ' ')  # lowercase each token with .lower()
            with open(
                    'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc2.txt',
                    'a',
                    encoding='UTF-8') as w:
                w.write('\n')
                for x in token_doc2:
                    if x == '.' or x == ',':
                        continue
                    w.write(x.lower() + ' ')
            with open(
                    'C:\\Users\osk\DjangoProject\info_search\oper\data/datadoc3.txt',
                    'a',
                    encoding='UTF-8') as w:
                w.write('\n')
                for x in token_doc3:
                    if x == '.' or x == ',':
                        continue
                    w.write(x.lower() + ' ')
        return render(
            request, 'oper/processing.html', {
                'process': '토큰화',
                'token_doc1': token_doc1,
                'token_doc2': token_doc2,
                'token_doc3': token_doc3
            })
    except FileNotFoundError:
        return redirect('/home')
from wordcloud import WordCloud
from collections import Counter
from re import match
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from konlpy.tag import Hannanum

# Word cloud of the 40 most frequent nouns found in the JobKorea answers
# for the "ERP·시스템분석·설계" job category.
data = pd.read_csv("/Users/doyun/2020_text_mining/jobkorea_data.csv")
hannanum = Hannanum()

ap_progra = data.loc[data['직무분야'] == "ERP·시스템분석·설계", "답변"]
# BUG FIX: str(Series) renders the index, an ellipsis for long series and a
# dtype footer into the analysed text; join the actual answer strings instead
nouns = hannanum.nouns(' '.join(ap_progra.fillna('').astype(str)))
# keep nouns longer than one character that do not start with a digit
nouns = [n for n in nouns if len(n) > 1]
nouns = [n for n in nouns if not (match('^[0-9]', n))]

count = Counter(nouns)
top = count.most_common(40)

my_font_path = '/Users/doyun/Library/Fonts/NanumBarunGothic.ttf'
wordcloud = WordCloud(font_path=my_font_path,
                      background_color='white',
                      width=800,
                      height=600)
cloud = wordcloud.generate_from_frequencies(dict(top))

plt.figure(figsize=(10, 8))
plt.axis('off')
# BUG FIX: the original referenced the undefined name 구름 (NameError);
# the generated cloud is bound to `cloud`
plt.imshow(cloud)
def Testhansuk():
    #data = "
    # NOTE(review): `data` is never defined in this excerpt — calling this
    # raises NameError; looks like leftover scaffolding
    return data


def TestHong(str):
    # if str == ""
    # data = "mclab.hufs.ac.kr"
    # else str =="google"테스트
    # data = "http://www.google.com"
    # Echo the argument back. NOTE(review): the parameter shadows builtin `str`.
    return str


#########################################################################################
from konlpy.tag import Hannanum

# NOTE(review): despite the name, this is a Hannanum instance, not Mecab
mecab = Hannanum()
sen = '인스타그램에서 야식음식에서 간장을 제외하고 후라이드는 해쉬태그'
#print(mecab.pos(sen))


def first(sen):
    # Collect the words tagged N (noun) or F (foreign) and pass them on.
    # NOTE(review): expects *sen* to be a (word, tag) pair sequence such as
    # mecab.pos(sen); `reject` is not defined in this excerpt
    sub = dict(sen)
    li = []
    for key, value in sub.items():
        if value == 'N':
            sub.items()  # NOTE(review): result unused — looks like a leftover
            li.append(key)
        elif value == 'F':
            li.append(key)
    reject(li)
import time
start = time.time()
from konlpy.tag import Hannanum

doc = input("문장을 입력해주세요 : ")  ## read the input sentence
n = input("입력문장과 유사한 몇개의 문장을 출력할까요? : ")  ## how many similar sentences to print
print(" ")
hannanum = Hannanum()  ## konlpy's Hannanum class, used for the morpheme analysis
doc_tokenized = hannanum.morphs(doc)  ## split the input sentence into morphemes
doc_tokenized_size = len(doc_tokenized)  ## morpheme count of the input sentence
list = []  # NOTE(review): shadows the builtin `list`
# corpus of sentences to compare against
# NOTE(review): the `as input` alias shadows the builtin `input`
with open('KCCq28.txt', 'r', encoding='utf-8') as input:
    for line in input:  ## read the corpus line by line
        file_tokenized = hannanum.morphs(line)  ## split the line into morphemes
        file_tokenized_size = len(file_tokenized)  ## morpheme count of the line
        intersection_size = 0  ## reset the shared-morpheme counter
        for x in doc_tokenized:
            if x in file_tokenized:
                intersection_size += 1  ## count each morpheme shared with the input sentence
        if len(doc) <= len(line):
            short = doc_tokenized_size
        else:
            short = file_tokenized_size
        ## short = morpheme count of the shorter of the input sentence and this line
        similarity = float(intersection_size) / float(
            short)  ## shared morphemes / shorter sentence's morphemes = similarity
        list.append([line, similarity * 100
                     ])  ## store [sentence, similarity %] in the list
sorted_list = sorted(
    list,
# Benchmark: POS-tag the same sentence 10,000 times with each konlpy
# analyzer (and khaiii) and report the wall-clock time per engine.
sentence = u'내년도 최저임금을 기존 방식대로 전체 업종에 동일하게 적용하기로 결정했다.\
최저임금의 업종별 차등 적용을 요구해온 사용자위원들은 이에 반발해 전원회의에서 퇴장했다.\
최저임금위원회 사용자위원들은 이날 오후 정부세종청사에서 열린 최저임금위원회 제5차 전원회의 도중 퇴장해 기자들과 만나 \
"금일 최저임금위원회는 최저임금 고시에 월 환산액을 병기하고 2020년 최저임금을 모든 업종에 동일하게 적용하기로 결정했다"고 밝혔다.'
sentences = [sentence] * 10000

import time
from konlpy.tag import Hannanum, Kkma, Komoran, Okt, Mecab
from khaiii import KhaiiiApi

api = KhaiiiApi()

# (display name, analyzer) pairs for the konlpy engines
morphs_processors = [('Hannanum', Hannanum()),
                     ('Kkma', Kkma()),
                     ('Komoran', Komoran()),
                     ('Okt', Okt()),
                     ('mecab', Mecab())]

for engine_name, engine in morphs_processors:
    started = time.time()
    morphs = [engine.pos(text) for text in sentences]
    took = time.time() - started
    print('morphs_processor name = %20s, %.5f secs' % (engine_name, took))

# khaiii exposes a different API, so it is timed separately
started = time.time()
morphs = [api.analyze(text) for text in sentences]
took = time.time() - started
print('morphs_processor name = %20s, %.5f secs' % ('khaiii', took))