def view_post(request, pk):
    """Render a post detail page with its comments and a space-joined
    Mecab POS tagging of the post body; a POST request first stores a
    new comment on the post."""
    post = get_object_or_404(Post, pk=pk)
    comments = Comment.objects.filter(post=post)

    # Tag the body text and flatten the (word, tag) pairs into one string.
    tagged_pairs = Mecab().pos(post.content)
    morph_text = ' '.join(str(pair) for pair in tagged_pairs)

    if request.method == 'POST':
        # Persist the submitted comment before rendering the page.
        comment = Comment()
        comment.content = request.POST.get('content')
        comment.post = post
        comment.save()

    context = {
        'post': post,
        'comments': comments,
        'morph': morph_text,
    }
    return render(request, 'view_post.html', context)
def pre_process(self, json, istrain):
    """Convert raw article dicts into a POS-tagged pandas DataFrame.

    Each article's HTML body is stripped with BeautifulSoup, POS-tagged
    with Mecab, and stored both as a token list and a joined string.

    :param json: iterable of article dicts with "text" and "pk" keys
                 (and "is_troll" when *istrain* is truthy)
    :param istrain: truthy when building training data — copies the
                    "is_troll" flag into the row as "istroll"
    :return: DataFrame indexed by "pk"
    """
    mecab = Mecab()
    data = []
    for cnt, article in enumerate(json):
        if cnt % 10000 == 0:
            print(cnt)  # coarse progress marker for large corpora
        # Strip HTML markup before tagging.
        text = bs(article["text"], "html.parser").text
        text_pos = ["%s_%s" % (word, tag) for word, tag in mecab.pos(text)]
        row = {
            "text": article["text"],
            "text_pos": text_pos,
            "text_pos_sentences": " ".join(text_pos),
            "pk": article["pk"],
        }
        if istrain:  # fixed: was `istrain == True`
            # Build the row completely before appending instead of
            # back-patching data[cnt] afterwards.
            row["istroll"] = article["is_troll"]
        data.append(row)
    return pd.DataFrame.from_dict(data).set_index('pk')
def main():
    # Python 2 CLI entry point: POS-tag the first command-line argument
    # with Mecab and print the surface forms as JSON on stdout.
    mecab = Mecab()
    if len(sys.argv) < 2:
        # No input given: emit a sentinel document and exit cleanly.
        result = {'result':'none'}
        print json.dumps(result)
        sys.exit(0)
    # argv entries are byte strings under Python 2 — decode before tagging.
    morphem_list = mecab.pos(sys.argv[1].decode('utf-8'))
    result_dict = {}
    # Keep only the surface form (x[0]) of each (surface, tag) pair,
    # re-encoded to UTF-8 bytes for JSON output.
    result_dict['result'] = [x[0].encode('utf-8') for x in morphem_list]
    print json.dumps(result_dict)
def _mecab_parse(self, str_arr, tag_combine=True):
    """POS-tag every entry of *str_arr* with Mecab and flatten the results.

    :param str_arr: iterable of values; each is str()-converted before tagging
    :param tag_combine: forwarded to self._flat (whether to merge word + tag)
    :return: single flat list of tokens accumulated over all inputs
    """
    mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    return_arr = []
    for data in str_arr:
        # extend() in place of `return_arr = return_arr + ...`, which
        # rebuilt the whole list on every iteration (O(n^2)).
        return_arr.extend(self._flat(mecab.pos(str(data)), tag_combine=tag_combine))
    return return_arr
def _pos_raw_data(self, lt):
    """POS-tag each raw string in *lt* and flatten the results.

    :param lt: list type value
    :return: list of "word/tag" strings for every token in every line
    """
    tagger = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    # Nested comprehension replaces the explicit accumulate loop.
    return [
        "{0}/{1}".format(word, tag)
        for raw in lt
        for word, tag in tagger.pos(raw)
    ]
def _pos_tag_predict_data(self, x_input, word_len):
    """POS-tag a prediction input, pad it, and render "word/tag" tokens.

    :param x_input: raw input sentence
    :param word_len: target length forwarded to self._pad_predict_input
    :return: list of "word/tag" strings ("word" alone when the tag is empty)
    """
    tagger = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
    padded = self._pad_predict_input(tagger.pos(x_input), word_len)
    word_list = []
    for word_tuple in padded:
        # Attach the tag only when one is present.
        token = (''.join([word_tuple[0], "/", word_tuple[1]])
                 if len(word_tuple[1]) > 0
                 else word_tuple[0])
        word_list.append(token)
    return word_list
def _conv_type_b(self, idx):
    """Augment intent-format data: for every (decode, encode) row of the
    pattern CSV, tokenise the encode sentence, expand dictionary entity
    slots, and emit the augmented sentences via _intent_formatter.

    :param idx: worker/thread index used for output-file naming and logging
    :return:
    """
    df_csv_read = pd.read_csv(self.pattern_data_path,
                              skipinitialspace=True,
                              engine="python",
                              encoding='utf-8-sig')
    i = 0
    for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values) :
        words = []
        if (self.use_mecab):
            # NOTE(review): a fresh Mecab instance is built per row — looks
            # hoistable out of the loop; confirm before changing.
            self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
            pos = self.mecab.pos(line)
            for word, tag in pos:
                words.append(word)
        else:
            # Plain whitespace tokenisation when Mecab is disabled.
            words = str(line).split(' ')
        match_keys = self._check_all_match(words)
        aug_data = self._aug_sent(match_keys, words, [])
        self._intent_formatter(aug_data, key, idx)
        if(i%100 == 0) :
            # Progress log ("Therad" typo preserved — it is a runtime string).
            print("====Therad{0} : {1} line job done".format(idx, i))
        i = i + 1
def __init__( self ):
    # Browser driver used to load and scroll dynamic search pages.
    self.driver = webdriver.Firefox()
    # Sentiment classifier applied to crawled posts.
    self.classifier = cf.classifier()
    self.URLs = []      # collected post URLs
    self.contexts = []  # crawled entries (source, title, text)
    # Sentiment word dictionary — presumably (positive, negative) word
    # bags; confirm against utils.load_dictionary.
    self.bag = utils.load_dictionary()
    # Korean POS tagger.
    self.tagger = Mecab()
def __init__(self):
    # initialize Mecab tagger
    self.tagger = Mecab()
    # initialize regular expression used to keep content-bearing POS tags
    self.exp = re.compile(self.POS, re.IGNORECASE)
    # load sentiment dictionary
    self.bag = utils.load_dictionary()
    # load model if exist
    # NOTE(review): open() raises FileNotFoundError when the model file is
    # missing despite the "if exist" wording — confirm intended behaviour.
    with open("../Resources/models/model", "rb") as model_file:
        self.model = pickle.load(model_file)
def learning(request, pk):
    """Label a post's sentiment.

    On the first POST (no Sentiword rows yet) store the submitted
    sentiment on the post and persist one Sentiword per morpheme, then
    redirect to the post view. Subsequent POSTs (and any non-GET method)
    just redirect. GET renders the labelling page.
    """
    the_post = get_object_or_404(Post, pk=pk)
    mecab = Mecab()
    morph = mecab.pos(the_post.content)

    if request.method == "POST" and not the_post.sentiword_set.exists():
        # Save the post's sentiment once — the original re-assigned and
        # re-saved the post on every loop iteration.
        the_post.senti = request.POST.get('senti')
        the_post.save()
        # One Sentiword row per (word, tag) pair.
        for pair in morph:
            the_word = Sentiword()
            the_word.word = str(pair)
            the_word.post = the_post
            the_word.save()
        return redirect('view_post', pk=pk)

    if request.method != "GET":
        # Already-labelled POSTs and any other method fall back to the view.
        return redirect('view_post', pk=pk)

    return render(request, 'learning.html', {
        'post': the_post,
    })
def parse(self, data_path = "data"):
    """Load every article from the JSON dumps under *data_path*, POS-tag
    title/author/text with Mecab, and return a DataFrame indexed by "pk".

    :param data_path: directory containing *.json dumps with an "articles" list
    :return: pandas DataFrame, one row per article
    """
    file_list = glob.glob("%s/*.json" % data_path)
    json_list=[]
    # Shuffle file order so article order is randomised across dumps.
    shuffle(file_list)
    for json_file_name in file_list:
        json_file = json.loads(open(json_file_name).read())
        json_list += json_file["articles"]
    mecab = Mecab()
    dataframe = []
    for article in json_list:
        # Strip HTML markup from the body before tagging.
        text = bs(article["text"], "html.parser").text
        title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
        author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
        text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]
        dataframe.append({
            "title_pos": title_pos,
            "title_pos_sentences" : " ".join(title_pos),
            "author_pos": author_pos,
            "author_pos_sentences" : " ".join(author_pos),
            "text":article["text"],
            "text_pos": text_pos,
            "text_pos_sentences" : " ".join(text_pos),
            "forumid": article["forumid"],
            "istroll": article["is_troll"],
            "pk": article["pk"]
        })
    dataframe = pd.DataFrame.from_dict(dataframe)
    dataframe = dataframe.set_index("pk")
    return dataframe
def __init__( self, date, news_limit = 5, net_limit = 50 ):
    # Crawl sections — one directory per section under the news dump.
    self.section = util.load_file("section.txt")
    self.date = date                # dump date (directory name component)
    self.news_limit = news_limit    # max number of top news kept
    self.net_limit = net_limit      # max size of the relative-word network
    self.refer = 0                  # number of matching news documents seen
    self.mecab = Mecab()
    # Content-bearing POS classes kept during analysis.
    self.exp = re.compile("NN|XR|VA|VV|MAG|VX")
    self.temp_net = {}              # working dict for relative words
    self.temp_list = {}             # working dict for all words
    self.word_net = [] # relative word and its frequency
    self.word_list = [] # total word and its frequency (using for PMI)
    self.news = [] # top # of news
    self.sentiment = [0, 0] # [neg, pos]
    # 16 slots — presumably [source x (neg, pos)] pairs for the news site
    # and seven communities; confirm against the _traverse_* methods.
    self.counter = [ 0 for i in range(16) ]
def _conv_type_a(self, idx):
    """Augment plain- or iob-format data: for every encode sentence in the
    pattern CSV, tokenise it, expand dictionary entity slots, and emit the
    augmented sentences via the formatter matching self.out_format_type.

    :param idx: worker/thread index used for output-file naming and logging
    :return:
    :raises Exception: when out_format_type is neither 'plain' nor 'iob'
    """
    df_csv_read = pd.read_csv(self.pattern_data_path,
                              skipinitialspace=True,
                              engine="python",
                              encoding='utf-8-sig')
    i = 0
    for line in df_csv_read['encode'].values:
        words = []
        if(self.use_mecab) :
            # NOTE(review): a fresh Mecab instance per row — looks hoistable.
            self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
            pos = self.mecab.pos(line)
            for word, tag in pos:
                words.append(word)
        else :
            words = str(line).split(' ')
        match_keys = self._check_all_match(words)
        if(self.out_format_type == 'plain') :
            aug_data = self._aug_sent(match_keys, words, [])
            self._plain_formatter(aug_data,idx)
        elif(self.out_format_type == 'iob') :
            aug_data = self._aug_sent(match_keys, words, [])
            self._iob_formatter(aug_data,idx)
        else :
            raise Exception (' '.join(['not', 'plain', 'or iob']))
        if (i % 100 == 0):
            print("====Therad{0} : {1} line job done".format(idx, i))
        i = i + 1

# Example configuration kept from the original source:
# da = DataAugmentation({
#     "use_mecab": True,
#     "max_file_size": 100000000,
#     "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#     "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#     "dict_path": "/hoya_model_root/aug/dict.csv",
#     "out_format_type": "iob",
#     "dict_sample_size" : 3,
#     "dict_sample_iter" : 500,
#     "thread_num" : 8
# })
# da.run()
class SearchCluster:
    """Map a free-text task to a word2vec-based cluster label and fetch
    articles from the same cluster for a given user."""

    def __init__(self, app):
        self.app = app          # application object exposing query_pool2
        self.mecab = Mecab()    # Korean POS tagger
        self.load_models()

    def load_models(self):
        # NOTE(review): Word2Vec.load_word2vec_format is the pre-1.0 gensim
        # API; newer releases moved it to KeyedVectors.load_word2vec_format.
        self.word2vec = gensim.models.Word2Vec.load_word2vec_format(WORD2VEC_MODEL, binary=True)
        self.cluster_pipe = joblib.load(PIPE_DUMPING)

    def __task_to_vector(self, task):
        # Keep surface forms only; POS tags are discarded.
        words = [key for key, pos in self.mecab.pos(task)]
        # aggregation word vectors: mean of the in-vocabulary word vectors
        vector = np.mean(np.array([self.word2vec[word] for word in words if word in self.word2vec]), axis=0)
        return vector

    def __predict_label(self, task):
        vector = self.__task_to_vector(task)
        return self.cluster_pipe.predict(vector)[0]

    def get_articles(self, user_id, task, topn=3):
        # Resolve the task's cluster, then look up same-cluster articles.
        label = self.__predict_label(task)
        article_id_list = list(self.app.query_pool2.get_same_cluster_articles(user_id, label, topn))
        return list(self.app.query_pool2.get_article_list_by_id(article_id_list))
import re from konlpy.tag import Mecab from typing import List split_morphs = Mecab().morphs def split_jamos(string: str) -> List[str]: # 유니코드 한글 시작 : 44032, 끝 : 55199 _base_code = 44032 _chosung = 588 _jungsung = 28 # 초성 리스트. 00 ~ 18 _chosung_list = [ 'ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ' ] # 중성 리스트. 00 ~ 20 _jungsung_list = [ 'ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ' ] # 종성 리스트. 00 ~ 27 + 1(1개 없음) _jongsung_list = [ ' ', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ' ] def split(sequence): split_string = list(sequence) list_of_tokens = []
if __name__ == "__main__":
    # Build fixed-size (40 x 16) article matrices from DB rows plus one-hot
    # labels (snippet appears truncated here — only the first label branch
    # is visible).
    data = []
    label = []
    count = {}
    # NOTE(review): credentials are masked placeholders in this snippet.
    conn = pymysql.connect(host='192.168.1.10', user='******', password='******', charset='utf8', db='crolls')
    cursor = conn.cursor()
    sql = "SELECT title, content, etc FROM data_set2"
    cursor.execute(sql)
    res = cursor.fetchall()
    mecab = Mecab()
    for one in tqdm(res):
        # get_vector is defined elsewhere in this project — presumably maps
        # text to a list of 16-dimensional rows; TODO confirm.
        article = get_vector(one[0])
        article += get_vector(one[1])
        word_count = len(article)
        if word_count > 40:
            # Truncate long articles to the first 40 rows.
            data.append(article[0:40])
        elif word_count < 40:
            # Pad short articles with 16-dim zero rows up to 40 entries.
            for i in range(0, 40 - word_count):
                article.append([0 for j in range(0, 16)])
            data.append(article)
        else:
            data.append(article)
        # First token of the "etc" column decides the one-hot class.
        if one[2].split(',')[0] == "장애인":
            label.append([1, 0, 0, 0, 0])
# -*- coding: utf-8 -*- import csv from konlpy.tag import Mecab import gensim from collections import namedtuple import time j=1 tmp_list=[] doc_list=[] main_str = str(0) words = str(0) csv_file = "C:/Users/int_sub05/.spyder-py3/sample/2017_01_0{}.csv" csv_file2 = "C:/Users/int_sub05/.spyder-py3/sample/2017_01_{}.csv" mecab = Mecab(dicpath="C:\mecab\mecab-ko-dic") doc_vectorizer = gensim.models.Doc2Vec( dm=0, dbow_words=1, window=8, size=300, alpha=0.025, seed=1234, min_count=20, min_alpha=0.025, hs=1, negative=10) for i in range(1,31): if i<=9: f = open(csv_file.format(i), 'r', encoding='utf-8') rdr = csv.reader(f)
class stopwordFilter:
    """Filter recipe-ingredient strings against a stopword list, with a
    Mecab-based noun extractor and a typo-normalisation table."""

    def __init__(self, myDB):
        self.stopword = set()   # loaded lazily by initStopword()
        self.myDB = myDB        # DB accessor used by makeIngredientToText()
        self.tagger = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
        # self.typoList = list()
        # self.initTypoChanger()

    # To verify that stopword removal works, dump the DB ingredients to
    # ingredient.txt, write the filtered ingredients to
    # ingredientListElimStopword.txt, and inspect the result.
    # (Only once this looks right is the DB actually updated.)
    def eliminateStopwordFromIngredient(self):
        self.initStopword()
        # self.makeIngredientToText()
        rf = open('textFile/ingredientList.txt', mode='rt', encoding='utf-8')
        wf = open('textFile/ingredientListElimStopword.txt', mode='wt', encoding='utf-8')
        for line in rf:
            writeStr = self.linePreprocess2(line)
            if writeStr != str():
                writeStr = writeStr.lstrip(' ') + '\n'
                wf.write(writeStr)
            if not line:
                break

    def initStopword(self):
        # De-duplicate the stopword file first, then load it into the set.
        self.deDuplicationStopword()
        f = open('textFile/stopwordList.txt', mode='rt', encoding='utf-8')
        for line in f:
            self.stopword.add(line.rstrip('\n'))
            if not line:
                break
        f.close()

    def linePreprocess(self, line):
        # Whitespace/symbol cleanup, then drop stopword tokens.
        line = re.sub(pattern=patternBlank, repl='', string=line)
        line = re.sub(pattern=patternSymbol, repl='', string=line)
        line = line.rstrip('\n')
        ingredientArr = line.split(' ')
        writeStr = str()
        for ingredient in ingredientArr:
            if ingredient not in self.stopword:
                # writeStr += (' ' + ingredient)
                writeStr += (ingredient)
        return writeStr

    def linePreprocess2(self, line):
        # Variant that keeps only Mecab nouns, then drops stopwords.
        line = re.sub(pattern=patternBlank, repl='', string=line)
        line = re.sub(pattern=patternOR, repl='', string=line)
        nouns = self.tagger.nouns(line)
        writeStr = str()
        for noun in nouns:
            if noun not in self.stopword:
                # writeStr += (' ' + ingredient)
                writeStr += noun
        return writeStr

    def makeIngredientToText(self):
        # Dump all ingredient names from the DB to a text file.
        ingredientList = self.myDB.select_ingredient_iname()
        f = open('textFile/ingredientList.txt', mode='wt', encoding='utf-8')
        for ingredient in ingredientList:
            f.write(ingredient['iname'] + '\n')
        f.close()

    def deDuplicationStopword(self):
        # Rewrite the stopword file with duplicates removed.
        f = open('textFile/stopwordList.txt', mode='rt', encoding='utf-8')
        mySet = set()
        for line in f:
            mySet.add(line.rstrip('\n'))
            if not line:
                break
        f.close()
        f = open('textFile/stopwordList.txt', mode='wt', encoding='utf-8')
        for ingredient in mySet:
            f.write(ingredient + '\n')
        f.close()

    def morphemeAnalysis(self, line):
        # Full morpheme split of a line.
        return list(self.tagger.morphs(line))
        # print(self.tagger.nouns(line))
        # print(self.tagger.pos(line))

    def initTypoChanger(self):
        # Typo-normalisation table: each entry maps common misspellings
        # ('typos') to a canonical form ('wrong'), skipped when any
        # 'except' substring is present in the line.
        self.typoList.append({
            'typos': ["머스타드", "머스터드", '허니머스트', '머스타트'],
            'except': [],
            'wrong': '머스타드'
        })
        self.typoList.append({'typos': ["양파"], 'except': [], 'wrong': '양파'})
        self.typoList.append({'typos': ["카레"], 'except': [], 'wrong': '카레'})
        self.typoList.append({
            'typos': ["쌀국수"],
            'except': ['소스', '스톡'],
            'wrong': '쌀국수'
        })
        self.typoList.append({
            'typos': ["파프리카"],
            'except': [],
            'wrong': '파프리카'
        })
        self.typoList.append({'typos': ["베이컨"], 'except': [], 'wrong': '베이컨'})
        self.typoList.append({'typos': ["베이컨"], 'except': [], 'wrong': '베이컨'})
        self.typoList.append({'typos': ["우동면"], 'except': [], 'wrong': '우동면'})
        self.typoList.append({'typos': ["오트밀"], 'except': [], 'wrong': '오트밀'})
        self.typoList.append({
            'typos': ["케찹", '케첩', '캐찹', '캐첩'],
            'except': [],
            'wrong': '케첩'
        })
        self.typoList.append({
            'typos': ["소시지", "소세지"],
            'except': [],
            'wrong': '소세지'
        })
        self.typoList.append({'typos': ["경기미"], 'except': [], 'wrong': '백미'})
        self.typoList.append({'typos': ["액젓"], 'except': [], 'wrong': '액젓'})
        self.typoList.append({
            'typos': ["후추", "후춧"],
            'except': [],
            'wrong': '후추'
        })
        self.typoList.append({'typos': ["식초"], 'except': [], 'wrong': '식초'})
        self.typoList.append({
            'typos': ["칼국수"],
            'except': ['스프'],
            'wrong': '칼국수'
        })
        self.typoList.append({'typos': ["지단"], 'except': [], 'wrong': '지단'})
        self.typoList.append({
            'typos': ["어묵", '오뎅'],
            'except': ['어묵'],
            'wrong': '어묵'
        })
        self.typoList.append({
            'typos': ['와사비'],
            'except': ['마요'],
            'wrong': '와사비'
        })
        self.typoList.append({
            'typos': ['후리카케', '후리가깨', '후리가캐', '후리가께', '후리가'],
            'except': [],
            'wrong': '후리카케'
        })
        self.typoList.append({
            'typos': ['파슬리', '파아슬리'],
            'except': [],
            'wrong': '파슬리'
        })

    def typoChanger(self, line):
        # Return the canonical form of the first table entry whose typo
        # appears in *line* (unless an 'except' substring also appears);
        # otherwise return the line unchanged.
        for typo in self.typoList:
            aFlag = False
            tFlag = False
            for e in typo['except']:
                if line.find(e) != -1:
                    aFlag = True
            for t in typo['typos']:
                if line.find(t) != -1:
                    tFlag = True
            if aFlag is False and tFlag is True:
                return typo['wrong']
        return line
# Jupyter-notebook installation/verification cell: the leading "!" lines
# are IPython shell escapes, not Python — this only runs inside a notebook.
! pip install mecab_python-0.996_ko_0.9.2_msvc-cp36-cp36m-win_amd64.whl
import MeCab
# Sanity-check the raw MeCab binding.
m = MeCab.Tagger()
OUTPUT = m.parse('Mecab 설치를 확인합니다.')
print(OUTPUT)
!pip install JPype1-1.0.2-cp36-cp36m-win_amd64.whl
! pip install konlpy
from konlpy.tag import Kkma
# Sanity-check the konlpy Kkma tagger.
K = Kkma()
out = K.nouns('코엔엘파이 설치를 확인합니다')
print(out)
from konlpy.tag import Mecab
# Sanity-check konlpy's Mecab wrapper.
m = Mecab()
m.nouns('메켑이 설치되었는지 확인')
class keyword_anaylze():
    """Aggregate crawled news and community posts for a keyword: counts
    per-source sentiment, keeps the top news, and builds a relative-word
    network with frequencies and a PMI-like score."""

    def __init__( self, date, news_limit = 5, net_limit = 50 ):
        # Crawl sections — one directory per section under the news dump.
        self.section = util.load_file("section.txt")
        self.date = date
        self.news_limit = news_limit   # max number of top news kept
        self.net_limit = net_limit     # max size of the relative-word network
        self.refer = 0
        self.mecab = Mecab()
        # Content-bearing POS classes.
        self.exp = re.compile("NN|XR|VA|VV|MAG|VX")
        self.temp_net = {}
        self.temp_list = {}
        self.word_net = [] # relative word and its frequency
        self.word_list = [] # total word and its frequency (using for PMI)
        self.news = [] # top # of news
        self.sentiment = [0, 0] # [neg, pos]
        # 16 slots — presumably [source x (neg, pos)]; see _traverse_*.
        self.counter = [ 0 for i in range(16) ]

    def _add_news( self, context, url, title ):
        # Keep the news_limit longest articles, sorted by length ascending
        # so index 0 is always the shortest (the one to evict).
        if len(self.news) < self.news_limit:
            self.news.append([len(context), url, title])
            self.news.sort()
        else:
            self.news[0] = [len(context), url, title]
            self.news.sort()

    def _add_word( self, words, word_list, senti ):
        # Tally each word (length >= 2) as [total, neg, pos] counts.
        for w in words:
            if len(w) < 2:
                continue
            if w in word_list:
                word_list[w][0] += 1
                word_list[w][int(senti)+1] += 1
            else:
                word_list[w] = [1, 0, 0]
                word_list[w][int(senti)+1] += 1

    def _make_morp( self, context ):
        # Strip quotes, split into word-ish tokens, then remove any
        # character span whose POS is NOT content-bearing.
        context = re.sub(r"(\"|\')", "", context)
        words = re.findall(r"[\w']+", context)
        for i, v in enumerate(words):
            pos = self.mecab.pos(v)
            w = [ p[0] for p in pos if not re.search("NN|XR|VA|VV|MAG|VX|SL|SN", p[1]) ]
            for x in w:
                words[i] = words[i].replace(x, "")
        # remove '' in words
        return [ w for w in words if not w == "" ]

    def _arrange_word_list( self, dictionary ):
        # Sort by count descending and keep only words whose leading
        # morpheme is a noun/root (NN|XR).
        words = sorted(dictionary.items(), key=itemgetter(1), reverse=True)
        word_list = []
        for w in words:
            pos = self.mecab.pos(w[0])
            if re.search("NN|XR", pos[0][1]):
                word_list.append(w)
        return word_list

    def _traverse_news( self, keyword ):
        # Walk every numbered file under news_loc/<date>/<section>/ — each
        # file holds: sentiment line, url line, title line, then body.
        global news_loc
        keyword_list = keyword.split(" ")
        for s in self.section:
            idx = 0
            loc = news_loc+self.date+"/"+s
            print(loc+"/")
            while os.path.isfile(loc+"/"+str(idx)):
                f = open(loc+"/"+str(idx), "r")
                senti = f.readline().replace("\n", "")
                url = f.readline().replace("\n", "")
                title = f.readline().replace("\n", "")
                context = f.read().replace("\n", "")
                words = self._make_morp(context)
                f.close()
                self._add_word(words, self.temp_list, senti)
                # The article matches only if every keyword token appears
                # as a substring of some extracted word.
                is_key = True
                for key in keyword_list:
                    have_word = False
                    for w in words:
                        if key in w:
                            have_word = True
                    if not have_word:
                        is_key = False
                if is_key:
                    self.counter[0+int(senti)] += 1
                    self.refer += 1
                    self.sentiment[int(senti)] += 1
                    self._add_news(context, url, title)
                    self._add_word(words, self.temp_net, senti)
                idx += 1

    def _traverse_community( self, keyword ):
        # Walk numbered files under community_loc/<keyword>/ — each file
        # holds: sentiment line, community name line, title line, body.
        global community_loc
        base_loc = community_loc+keyword+"/"
        idx = 0
        print(base_loc)
        while True:
            loc = base_loc+str(idx)
            idx += 1
            if not os.path.isfile(loc):
                break
            f = open(loc, "r")
            senti = f.readline().replace("\n", "")
            comm = f.readline().replace("\n", "")
            title = f.readline().replace("\n", "")
            context = f.read().replace("\n", "")
            words = self._make_morp(context)
            f.close()
            self.sentiment[int(senti)] += 1
            self._add_word(words, self.temp_list, senti)
            self._add_word(words, self.temp_net, senti)
            # determine community
            if comm == "dcinside":
                self.counter[2+int(senti)] += 1
            elif comm == "todayhumor":
                self.counter[4+int(senti)] += 1
            elif comm == "twitter":
                self.counter[6+int(senti)] += 1
            elif comm == "fomos":
                self.counter[8+int(senti)] += 1
            elif comm == "inven":
                self.counter[10+int(senti)] += 1
            elif comm == "instiz":
                self.counter[12+int(senti)] += 1
            elif comm == "ppomppu":
                self.counter[14+int(senti)] += 1

    def _make_word_net( self ):
        # Join word_net against word_list (frequency > 10) and compute a
        # positivity ratio plus a PMI-like co-occurrence ratio per word.
        network = []
        words = []
        count = []
        for v in self.word_net:
            words.append(v[0])
            count.append(v[1][0])
        for i, v in enumerate(self.word_list):
            for j, w in enumerate(words):
                if v[0] == w and v[1][0] > 10:
                    senti = v[1][2] / v[1][0]
                    pmi = count[j] / v[1][0]
                    network.append([w, senti, v[1][0], pmi])
        return network

    def anaylze( self, keyword ):
        # Main entry point: traverse the crawled data, then assemble the
        # outputs. (Name typo "anaylze" kept — callers use it.)
        self._traverse_news(keyword)
        self._traverse_community(keyword)
        # sort word_net
        self.word_net = self._arrange_word_list(self.temp_net)
        if len(self.word_net) > self.net_limit:
            self.word_net = [ self.word_net[i] for i in range(self.net_limit) ]
        # sort word_list
        self.word_list = self._arrange_word_list(self.temp_list)
        # network = [ [word, senti, frequency, PMI] .. ]
        network = self._make_word_net()
        return self.sentiment, self.news, network, self.counter
import pandas as pd
import numpy as np
import tensorflow as tf
from bs4 import BeautifulSoup
import ipdb
from konlpy.tag import Mecab
from gensim.models import Word2Vec

# Word2Vec training preparation over a Korean wiki dump.
# NOTE(review): this is Python 2 code — map()/filter() must return lists
# for the len() call below to work; do not port blindly to Python 3.
mecab = Mecab()
learning_rate = 0.001
dim_embed = 200
n_epochs = 20
window_size = 5
min_count = 3
wiki_file = '../text/wiki_all'
with open( wiki_file ) as f:
    wiki_contents = f.read()
# Each <doc> element becomes the list of its non-empty lines.
wiki_docs = map(lambda x: filter(lambda y: y != '', x.text.split('\n')), BeautifulSoup( wiki_contents ).find_all('doc'))
wiki_paragraphs = [item for sublist in wiki_docs for item in sublist]
paragraph_list = []
for wiki_paragraph in wiki_paragraphs:
    # Encode each token as "surface^/TAG"; keep paragraphs with > 2 tokens.
    wiki_paragraph_pos = map(lambda x: x[0] + '^/'+ x[1], mecab.pos( wiki_paragraph ))
    if len(wiki_paragraph_pos) > 2:
        paragraph_list.append( wiki_paragraph_pos )
# Free the intermediate list before training allocations.
del wiki_paragraphs
class crawl_community():
    """Crawl community posts and tweets from a zum.com realtime search,
    filter out posts containing no sentiment-dictionary words, and attach
    a predicted sentiment to each crawled entry."""

    def __init__( self ):
        self.driver = webdriver.Firefox()      # browser for dynamic pages
        self.classifier = cf.classifier()      # sentiment classifier
        self.URLs = []                         # collected post URLs
        self.contexts = []                     # crawled [source, title, text]
        self.bag = utils.load_dictionary()     # sentiment word dictionary
        self.tagger = Mecab()                  # Korean POS tagger

    def __del__( self ):
        # Always release the browser when the crawler is collected.
        self.driver.quit()

    def _crawl_URL( self ):
        titles = []
        # dynamic scrolling: click the "more" button or scroll to the
        # bottom until self.scroll expansions have happened.
        more_count = 0
        while True:
            time.sleep(0.5)
            more = self.driver.find_element_by_id("real_more_page")
            if more.is_displayed():
                if more.text == "더보기":
                    more.click()
                    more_count += 1
                else:
                    break
            else:
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            if more_count >= self.scroll:
                break
        # get html source
        html = self.driver.page_source
        soup = BeautifulSoup(html)
        # crawl URL
        for c in soup.find_all("li"):
            # if items are from community
            if c.get("class") == ['realtimeitem', 'community']:
                href = c.find("a")["href"]
                self.URLs.append(href)
                title = c.find("a").get_text().strip()
                titles.append(title)
            # if items are from twitter
            elif c.get("class") == ['realtimeitem', 'twitter']:
                for s in c.find_all("span"):
                    if s.get("class") == ['text', 'snsbody']:
                        href = s['href']
                        self.URLs.append(href)
                        titles.append("twitter")
        return titles

    def _exclude_short( self, text ):
        # True when the text contains NO word from either sentiment bag
        # (such posts are dropped by the _crawl_* methods).
        pos = self.tagger.pos(text)
        words = [ p[0] for p in pos ]
        is_in = False
        for b in self.bag[0]:
            if b[0] in words:
                is_in = True
        for b in self.bag[1]:
            if b[0] in words:
                is_in = True
        return not is_in

    def _crawl_dcinside( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["s_write"]:
                text = c.find_all("td")[0].get_text()
                text = text.strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["dcinside", title, text])

    # Disabled crawler kept as a string literal in the original source.
    """ def _crawl_mlbpark( self, url, title ): ret = requests.get(url) soup = BeautifulSoup(ret.text) for c in soup.find_all("td"): if c.get("class") == ["G13"] and c.find_all("div"): div = c.find_all("div")[0] text = div.get_text() text = text.strip().replace("\n", " ") exclude = self._exclude_short(text) if not exclude: self.contexts.append(["mlbpark", title, text]) break """

    def _crawl_twitter( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("p"):
            tag = c.get("class")
            if tag and "tweet-text" in tag:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude :
                    self.contexts.append(["twitter", title, text])

    def _crawl_todayhumor( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["viewContent"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["todayhumor", title, text])

    # Disabled crawlers kept as a string literal in the original source.
    """ def _crawl_clien( self, url, title ): ret = requests.get(url) soup = BeautifulSoup(ret.text) c = soup.find(id="writeContents") if c: text = c.get_text().strip().replace("\n", " ") if self._exclude_short: self.contexts.append(["clien", title, text]) def _crawl_bobaedream( self, url, title ): ret = requests.get(url) soup = BeautifulSoup(ret.text) for c in soup.find_all("div"): if c.get("class") == ["bodyCont"]: text = c.get_text().strip().replace("\n", " ") if self._exclude_short: self.contexts.append(["bobaedream", title, text]) """

    def _crawl_fomos( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["view_text"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["fomos", title, text])
                break

    def _crawl_inven( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("div"):
            if c.get("class") == ["powerbbsContent"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["inven", title, text])

    def _crawl_instiz( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        c = soup.find(id="memo_content_1")
        if c:
            text = c.get_text().strip().replace("\n", " ")
            exclude = self._exclude_short(text)
            if not exclude:
                self.contexts.append(["instiz", title, text])

    def _crawl_ppomppu( self, url, title ):
        ret = requests.get(url)
        soup = BeautifulSoup(ret.text)
        for c in soup.find_all("td"):
            if c.get("class") == ["han"]:
                text = c.get_text().strip().replace("\n", " ")
                exclude = self._exclude_short(text)
                if not exclude:
                    self.contexts.append(["ppomppu", title, text])

    # determine which URL comes from
    def _crawl_context( self, titles ):
        for i, url in enumerate(self.URLs):
            if "dcinside" in url: self._crawl_dcinside(url, titles[i])
            #elif "mlbpark" in url: self._crawl_mlbpark(url, titles[i])
            elif "todayhumor" in url: self._crawl_todayhumor(url, titles[i])
            #elif "clien" in url: self._crawl_clien(url, titles[i])
            elif "twitter" in url: self._crawl_twitter(url, titles[i])
            #elif "bobaedream" in url: self._crawl_bobaedream(url, titles[i])
            elif "fomos" in url: self._crawl_fomos(url, titles[i])
            elif "inven" in url: self._crawl_inven(url, titles[i])
            elif "instiz" in url: self._crawl_instiz(url, titles[i])
            elif "ppomppu" in url: self._crawl_ppomppu(url, titles[i])
            else: print(url)
        # classify sentiment: prepend the predicted label to each entry.
        for i, v in enumerate(self.contexts):
            vector = self.classifier.features(v[1]+v[2])
            predict = self.classifier.predict(vector).tolist()[0]
            self.contexts[i].insert(0, predict)

    def crawl( self, query, scroll = 5 ):
        # Public entry point: load the realtime search for *query*, expand
        # results *scroll* times, crawl each hit, and return the contexts.
        self.scroll = scroll
        self.query = query
        self.url = "http://search.zum.com/search.zum?method=realtime&option=accu&query="+query+"&cm=more"
        self.driver.get(self.url)
        titles = self._crawl_URL()
        self._crawl_context(titles)
        return self.contexts
class DataAugmentation :
    """
    Data Augmentation Class for nlp
    mainly for create iob data with pattern and dict
    test = DataAugmentation()
    test.load_dict()
    test.convert_data()
    """

    class ThreadCls(threading.Thread) :
        # Worker thread: repeatedly re-samples the dictionary and converts
        # the pattern data, dict_sample_iter times.
        def __init__(self, obj, idx):
            threading.Thread.__init__(self)
            self.obj = obj   # owning DataAugmentation instance
            self.idx = idx   # worker index (used in output file names)

        def run(self):
            for _ in range(self.obj.dict_sample_iter):
                self.obj.load_dict()
                self.obj.convert_data(self.idx)

        def join(self):
            threading.Thread.join(self)
            return True

    def __init__(self, conf):
        """
        init parms
        need to mange teses parms on db
        """
        self.aug_file_cnt = 0
        self.use_mecab = conf.get("use_mecab")
        self.max_file_size = conf.get("max_file_size") #10M
        self.pattern_data_path = conf.get("pattern_data_path")
        self.augmented_out_path = conf.get("augmented_out_path")
        self.dict_path = conf.get("dict_path")
        self.out_format_type = conf.get("out_format_type")
        self.ner_dicts = {}
        self.gpu_use = True
        self.dict_sample_size = int(conf.get("dict_sample_size"))
        self.dict_sample_iter = int(conf.get("dict_sample_iter"))
        self.thread_num = int(conf.get("thread_num"))

    def run(self):
        """
        Start thread_num worker threads and wait for all of them.
        :return:
        """
        job_list = []
        for idx, _ in enumerate(range(self.thread_num)) :
            job_list.append(self.ThreadCls(self, idx))
        for job in job_list:
            job.start()
        for job in job_list:
            job.join()

    def load_dict(self):
        """
        load dict list from csv file
        (takes a random sample of dict_sample_size rows each call)
        :return:
        """
        self.ner_dicts = {}
        df_csv_read = pd.read_csv(self.dict_path, skipinitialspace=True, engine="python", encoding='utf-8-sig')
        df_csv_read = df_csv_read.sample(n=self.dict_sample_size)
        for col in df_csv_read.keys() :
            self.ner_dicts[col] = []
            for val in list(set(df_csv_read[col])) :
                # `val == val` filters out NaN (NaN != NaN).
                if (val == val and val != None) :
                    self.ner_dicts[col].append(val)

    def _check_all_match(self, words) :
        """
        check all matcing dict keys in ohter word entity keys
        :param words: sentence str
        :return: list contain keys
        """
        match_keys = []
        for word in words :
            word = word.replace('\n', '')
            if(word in list(self.ner_dicts.keys())) :
                match_keys.append(word)
        return match_keys

    #@autojit
    def _aug_sent(self, keys, pattern, return_aug_sent=[]) :
        """
        function which actually augment sentences with given pattern and keys
        (recursive: consumes one entity key per call)
        NOTE(review): mutable default argument — callers always pass [] so
        the shared-default pitfall is not hit today; confirm before relying.
        :param keys: entity keys
        :param pattern: sentence pattern
        :return: list of augmented sentence
        """
        try :
            if (len(keys) > 0):
                key = keys[0]
                del keys[0]
            else :
                # No keys left: recursion bottom, return accumulated result.
                return return_aug_sent
            if (len(return_aug_sent) == 0):
                # First key: expand the pattern once per dictionary word.
                for word in self.ner_dicts[key] :
                    line = []
                    for slot in pattern:
                        for rep in ['\n', 'NaN'] :
                            slot = slot.replace(rep, '')
                        if(key in slot) :
                            # Entity slot: emit one (morph, key) per morpheme.
                            for wd in self.mecab.morphs(word):
                                wd = wd.replace(' ', '')
                                line.append((wd, key))
                        else :
                            line.append((slot, 'O'))
                    return_aug_sent.append(line)
            else :
                # Subsequent keys: fan out every existing line over the
                # dictionary words for this key, then drop the originals.
                del_idx = []
                for i, line in enumerate(return_aug_sent):
                    for j, slot in enumerate(line):
                        if (slot[0] == key):
                            for word in self.ner_dicts[key]:
                                line = return_aug_sent[i].copy()
                                for z, slot in enumerate(line):
                                    if(slot[0] == key) :
                                        # Join the word's morphemes with spaces.
                                        buffer = ""
                                        for wd in self.mecab.morphs(word) :
                                            wd = wd.replace(' ', '')
                                            if(len(buffer) > 0 ) :
                                                buffer = ''.join([buffer,' ', wd])
                                            else :
                                                buffer = wd
                                        if (len(buffer) > 1 ):
                                            line[z] = (buffer, key)
                                return_aug_sent.append(line)
                            del_idx.append(i)
                for _ in del_idx:
                    del return_aug_sent[0]
            return self._aug_sent(keys, pattern, return_aug_sent)
        except Exception as e :
            # NOTE(review): swallowed exception returns None to the caller.
            print("error on nlp data augmentation :{0}".format(e))

    def _iob_formatter(self, aug_data, idx) :
        """
        save aug list as iob file format
        :param aug_data: augmented list of sentence
        :return: None
        """
        # NOTE(review): `if aug_data == None: pass` does not guard anything —
        # a None aug_data still reaches the write loop below; confirm intent.
        if aug_data == None :
            pass
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test' , str(self.aug_file_cnt) , '.iob'])
        if(os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size) :
            # Append until the file reaches max_file_size.
            with open(path, "a") as f :
                for line in aug_data :
                    for word in line :
                        related_words = word[0].split(' ')
                        for tocken in related_words :
                            f.write(''.join([tocken, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')
        else :
            # Roll over to a new numbered output file.
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.iob'])
            with open(path, "w") as f :
                for line in aug_data :
                    for word in line :
                        related_words = word[0].split(' ')
                        for tocken in related_words :
                            f.write(''.join([tocken, ' ', word[1]]))
                            f.write('\n')
                    f.write('\n')

    def _plain_formatter(self, aug_data, idx) :
        """
        save aug list as plain text file format (one sentence per line)
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data == None :
            pass
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.out'])
        if (os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size):
            with open(path, "a") as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')
        else :
            # Roll over to a new numbered output file.
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.out'])
            with open(path, "w") as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write('\n')

    def _intent_formatter(self, aug_data, key, idx) :
        """
        save aug list as "encode,decode" CSV rows (sentence, intent key)
        :param aug_data: augmented list of sentence
        :return: None
        """
        if aug_data == None :
            pass
        path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.csv'])
        if (os.path.exists(path) == False) :
            # New file: write the CSV header first.
            with open(path, "w") as f :
                f.write('encode,decode\n')
        if (os.path.exists(path) == False or os.path.getsize(path) < self.max_file_size):
            with open(path, "a") as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')
        else :
            # Roll over to a new numbered output file.
            self.aug_file_cnt = self.aug_file_cnt + 1
            path = ''.join([self.augmented_out_path, '/'+str(idx),'Test', str(self.aug_file_cnt), '.csv'])
            with open(path, "a") as f :
                for line in aug_data :
                    for word in line :
                        f.write(''.join([word[0], ' ']))
                    f.write(',')
                    f.write(str(key))
                    f.write('\n')

    def convert_data(self, idx) :
        """
        augment data with entity list and pattern
        :return: Nones
        """
        try :
            if (self.out_format_type == 'intent'):
                self._conv_type_b(idx)
            else :
                self._conv_type_a(idx)
        except Exception as e :
            print("error log : {0}".format(e))

    def _conv_type_b(self, idx):
        """Intent-format conversion: one (decode, encode) CSV row at a time.
        :return:
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for key, line in zip(df_csv_read['decode'].values, df_csv_read['encode'].values) :
            words = []
            if (self.use_mecab):
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else:
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            aug_data = self._aug_sent(match_keys, words, [])
            self._intent_formatter(aug_data, key, idx)
            if(i%100 == 0) :
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

    def _conv_type_a(self, idx):
        """Plain/iob-format conversion over the encode column.
        :return:
        """
        df_csv_read = pd.read_csv(self.pattern_data_path,
                                  skipinitialspace=True,
                                  engine="python",
                                  encoding='utf-8-sig')
        i = 0
        for line in df_csv_read['encode'].values:
            words = []
            if(self.use_mecab) :
                self.mecab = Mecab('/usr/local/lib/mecab/dic/mecab-ko-dic')
                pos = self.mecab.pos(line)
                for word, tag in pos:
                    words.append(word)
            else :
                words = str(line).split(' ')
            match_keys = self._check_all_match(words)
            if(self.out_format_type == 'plain') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._plain_formatter(aug_data,idx)
            elif(self.out_format_type == 'iob') :
                aug_data = self._aug_sent(match_keys, words, [])
                self._iob_formatter(aug_data,idx)
            else :
                raise Exception (' '.join(['not', 'plain', 'or iob']))
            if (i % 100 == 0):
                print("====Therad{0} : {1} line job done".format(idx, i))
            i = i + 1

# Example configuration kept from the original source:
# da = DataAugmentation({
#     "use_mecab": True,
#     "max_file_size": 100000000,
#     "pattern_data_path": "/hoya_model_root/aug/pattern.csv",
#     "augmented_out_path": "/hoya_model_root/aug/aug_0810/",
#     "dict_path": "/hoya_model_root/aug/dict.csv",
#     "out_format_type": "iob",
#     "dict_sample_size" : 3,
#     "dict_sample_iter" : 500,
#     "thread_num" : 8
# })
# da.run()
class classifier():
    """Korean sentiment classifier.

    Tags an article with Mecab, counts positive/negative sentiment words per
    POS class (with negation handling), scales the 8-dim feature vector and
    feeds it to a pickled model.
    """

    # include POS, MAG, VX to handle negation
    POS = "NN|XR|VA|VV|MAG|VX"
    # Order fixes the feature-vector slots: [NN, VA, VV, XR] (+4 for negative).
    POS_IDX = ["NN", "VA", "VV", "XR"]
    # "못" is handled separately (see handle_negation).
    NEG_PREV = [("아니하", "VX"), ("않", "VX"), ("없", "VA"), ("없이", "MAG")]
    NEG_NEXT = [("안", "MAG")]

    def __init__(self):
        # initalize Mecab tagger
        self.tagger = Mecab()
        # initalize regular expression (POS filter used in features())
        self.exp = re.compile(self.POS, re.IGNORECASE)
        # load sentiment dictionary; bag[0] = positive, bag[1] = negative
        # (inferred from usage below -- TODO confirm against utils.load_dictionary)
        self.bag = utils.load_dictionary()
        # load model if exist
        with open("../Resources/models/model", "rb") as model_file:
            self.model = pickle.load(model_file)

    def handle_negation(self, words, counter):
        """Flip sentiment counts for words that fall under a negation marker.

        :param words: list of (surface, pos) tuples
        :param counter: 8-slot feature counter (mutated and returned)
        """
        # construct index to negate word except "못"
        neg_idx = []
        for neg in self.NEG_PREV:
            find = utils.find_dup_idx(words, neg)
            for item in find:
                if item-1 > -1:
                    neg_idx.append(item-1)
        for neg in self.NEG_NEXT:
            find = utils.find_dup_idx(words, neg)
            for item in find:
                if item+1 < len(words):
                    neg_idx.append(item+1)
        # handle "못~"
        for w in words:
            loc = w[0].find("못")
            # NOTE(review): str.find returns 0 when the match is at position 0,
            # which is falsy -- `w[1].find("VX")` is True for any tag where
            # "VX" does NOT start at index 0. Verify this is intended.
            if loc > 0 and w[1].find("VX"):
                neg_idx.append(loc-1)
        # handle "못"
        for w in words:
            loc = w[0].find("못")
            if loc > -1 and w[1].find("MAG"):
                # long negation (못햇다, 못 했다..)
                # NOTE(review): `loc` is a character offset inside w[0] but is
                # used here to index `words` -- looks suspicious, confirm.
                if loc > 1 and words[loc-1][1].find("VV"):
                    neg_idx.append(loc-1)
                # short negation
                elif loc < len(words)-1:
                    neg_idx.append(loc+1)
                # limitation: cases like "못 생겼다" come out strangely
        # negate word: move each hit between the positive (0..3) and
        # negative (4..7) halves of the counter.
        for i in neg_idx:
            if words[i] in self.bag[0]:
                try:
                    idx = self.POS_IDX.index(words[i][1])
                except ValueError:
                    pass
                else:
                    counter[idx] -= 1
                    counter[idx+4] += 1
            elif words[i] in self.bag[1]:
                try:
                    idx = self.POS_IDX.index(words[i][1])
                except ValueError:
                    pass
                else:
                    counter[idx] += 1
                    counter[idx+4] -= 1
        return counter

    def make_features(self, sentence, words):
        """Count sentiment-dictionary hits per POS class.

        feature vector:
        [ pos_noun, pos_adj, pos_verb, pos_root,
          neg_noun, neg_adj, neg_verb, neg_root ]
        """
        counter = [0, 0, 0, 0, 0, 0, 0, 0]
        if not words:
            return counter
        for i, w in enumerate(words):
            # replace POS to sentiment dictionary type
            # (normalizes detailed Mecab tags, e.g. NNG/NNP -> NN)
            words[i] = list(words[i])
            if words[i][1].find("NN") >= 0:
                words[i][1] = "NN"
            elif words[i][1].find("VA") >= 0:
                words[i][1] = "VA"
            elif words[i][1].find("VV") >= 0:
                words[i][1] = "VV"
            elif words[i][1].find("XR") >= 0:
                words[i][1] = "XR"
            elif words[i][1].find("VX") >= 0:
                words[i][1] = "VX"
            elif words[i][1].find("MAG") >= 0:
                words[i][1] = "MAG"
            words[i] = tuple(words[i])
            # count frequency of sentiment words
            if words[i] in self.bag[0]:
                # positive
                try:
                    idx = self.POS_IDX.index(words[i][1])
                    counter[idx] += 1
                except ValueError:
                    pass
            elif words[i] in self.bag[1]:
                # negative
                try:
                    idx = self.POS_IDX.index(words[i][1])
                    counter[idx+4] += 1
                except ValueError:
                    pass
        counter = self.handle_negation(words, counter)
        return counter

    def features(self, article):
        """Tag *article* and return the scaled 8-dim feature vector."""
        # tagging article
        pos = self.tagger.pos(article)
        words = [ p for p in pos if self.exp.search(p[1]) ]
        # construct data sets
        data = self.make_features(article, words)
        # normalize features
        arr = np.array(data, dtype=float)
        scaled = preprocessing.scale(arr).tolist()
        data = scaled
        return data

    def predict(self, vector):
        # Delegate to the unpickled model (sklearn-style predict interface).
        return self.model.predict(vector)
def preprocess(args):
    """
    Description

    Return
    - word2idx: Sequence of word index. It is 2-dim like [# of laws, # of words in each law].
    - word_dict: Word to index mapping table. { word: idx } (Only contain VOCA_SIZE words)
    - word_inv_dict: Inverted version of word_dict. { idx: word } (Only contain VOCA_SIZE words)
    - word_count: Word counter of each laws. Only contain VOCA_SIZE words.

    Note: despite the docstring above, only (word2idx, word_inv_dict) are
    actually returned; the other tables are dropped at the end.
    """
    tagger = Mecab()
    with open(args.input, "r") as reader:
        data = reader.read()

    # Sequence of words in each law. [num_laws, num_words]
    word_list = list()
    # Sequence of idx. [num_laws, num_words]
    word2idx = list()
    # Mapping table of word - idx.
    word_dict = dict()
    # Inversed mapping table of word - idx (for fast access).
    word_inv_dict = dict()
    # Word counter.
    word_count = list()

    """ Tag part-of-speech and remove unimportant words (like josa..). """
    # Split each laws by <END> symbol.
    law_list = data.split("<END>")
    for law in law_list:
        # Eliminate special chars
        law = re.sub("[^a-zA-Z0-9가-힣 \n]", " ", law)
        # 1. Eliminate newline, tab and strange char.
        # 2. Split words by space.
        word_list.append(law.replace("\n", " ").replace("\t", " ").replace("\xa0" ,"").split(" "))

    for i, v in enumerate(word_list):
        for j, word in enumerate(v):
            # Tag laws using Mecab tagger. and exclude some tags.
            tag = tagger.pos(word)
            # Keep only noun-ish tags; everything else is stripped below.
            excluded = [ t[0] for t in tag if not re.search("NN|XR", t[1]) ]
            # Exclude word if it contain number (ex. 제1조, 제1항의 경우 해당 단어 삭제).
            for t in tag:
                if t[1] == "SN":
                    word_list[i][j] = ""
            # Reconstruct word_list by using excluded tag list.
            for e in excluded:
                word_list[i][j] = word_list[i][j].replace(e, "")
        # Drop 1-char leftovers, except the domain word "법".
        word_list[i] = [ w for w in word_list[i] if len(w) > 1 or w == "법" ]

    # If last element of word_list is empty, remove it.
    if not word_list[-1]:
        word_list.pop()

    # Construct word counter. 1st element in counter is UNKOWN_WORD (simply UNK).
    word_count.append(["UNK", 0])
    merged = list(itertools.chain.from_iterable(word_list))
    word_count.extend(collections.Counter(merged).most_common(args.voca_size-1))

    # Construct word mapping table (UNK gets index 0).
    word_dict = { v[0] : i for v, i in zip(word_count, itertools.count(0)) }
    word_inv_dict = { i : v for v, i in word_dict.items() }

    # Make sequence of word-idx.
    for v in word_list:
        row = list()
        for word in v:
            idx = word_dict.get(word)
            if idx != None:
                row.append(idx)
            else:
                # Out-of-vocabulary word: map to UNK and count it.
                row.append(word_dict.get("UNK"))
                word_count[0][1] += 1
        word2idx.append(row)

    word_list = None  # dont use anymore
    word_dict = None  # dont use anymore
    word_count = None  # dont use anympre

    return np.array(word2idx), word_inv_dict
num_keysents=5, scaling=lambda x: 1, verbose=True) print(list(keywords.items())[:10]) print('====================') for i, s in enumerate(sents): print(i, s) print('====================') wordrank_extractor = KRWordRank( min_count=3, # 단어의 최소 출현 빈도수 (그래프 생성 시) max_length=20, # 단어의 최대 길이 verbose=True) beta = 0.85 # PageRank의 decaying factor beta max_iter = 10 keywords, rank, graph = wordrank_extractor.extract(text, beta, max_iter, num_keywords=100) vocab_score = make_vocab_score(keywords, stopwords, scaling=lambda x: 1) tokenizer = MaxScoreTokenizer(vocab_score) tokenizer2 = Mecab() sents2 = keysentence(vocab_score, text, tokenizer2.nouns, diversity=0.7, topk=5) for i, s in enumerate(sents2): print(i, s)
train_yy = DataFrame(train['smishing'], columns=['smishing']) train_yyy = train_yy.iloc[train_smishing + train_nsmishing, :].reset_index(drop=True) test[ 'smishing'] = 2 #train data와 동일한 형태 생성을 위해 임의의 숫자를 추가 #이후 스미싱 여부 확률 값으로 덮어 씌워짐 test_xx = DataFrame(test['text']) test_yyy = DataFrame(test['smishing']) train_xx.shape, train_yyy.shape, test_xx.shape, test_yyy.shape # 토크나이즈 단계 import konlpy from konlpy.tag import Mecab tokenizer = Mecab() train_doc = [(tokenizer.nouns(x), y) for x, y in tqdm(zip(train_xx['text'], train_yyy['smishing'])) ] # Mecab를 활용하여 text를 토큰화 시킴 test_doc = [(tokenizer.nouns(x), y) for x, y in tqdm(zip(test_xx['text'], test_yyy['smishing']))] # 불용어처리 단계 stopwords = [ '은행', '광고', '상품', '대출', '사장', '무료', '수신', '거부', '수수료', '안내', '영업부', '년', '정부', '지원', '이자', '상담', '기록', '님', '고객', '고객님', '리브', 'Liiv', '최대', '카톡', '친구', '여신', '금리', '거부', '어플', '다운', '거부' ]
def tokenize(sentence): tagger = Mecab() logger.debug(sentence) s = " ".join(tagger.morphs(sentence)) logger.debug("tokenized:" + s) return s
def __init__(self): super(MecabTokenizer, self).__init__(Mecab())
import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import numpy as np import torchtext import nltk from konlpy.tag import Mecab from torchtext.data import Field, BucketIterator, TabularDataset, Dataset import os from cnn_model import CNNClassifier from rnn_model import RNN DATA_PATH = './data' #os.environ['DATA_PATH'] tagger = Mecab() USE_CUDA = torch.cuda.is_available() DEVICE = 'cuda' if USE_CUDA else 'cpu' def pad_under_five(toknized): """ 모델에서 5-gram 단위 필터를 사용하기 때문에 5-gram이 안되는 문장에 <pad>로 채워준다 """ if len(toknized) < 5: toknized.extend(["<pad>"]*(5-len(toknized))) return toknized TEXT = Field(tokenize=tagger.morphs,lower=True,include_lengths=False,batch_first=True,preprocessing=pad_under_five) LABEL = Field(sequential=False,use_vocab=True,unk_token=None)
# selected_kkma = [] # for sentence1 in kkma_morphs: # for word, tag in sentence1: # if tag in ['Noun','Adjective', 'Verb']: # selected_kkma.append(word) # komoran = Komoran() # kom_morphs = komoran.morphs(lines) # print("komoran: ", kom_morphs) # selected_kom = [] # for sentence1 in kom_morphs: # for word, tag in sentence1: # if tag in ['Noun','Adjective', 'Verb']: # selected_kom.append(word) mecab = Mecab() sentences_tag = [] for sentence in sentences: morph = mecab.pos(sentence) sentences_tag.append(morph) # print("mec: ", mec_morphs) selected_mec = [] n_sentence = 0 nouns_tag = [] for sentence in sentences: morph = mecab.nouns(sentence) nouns_tag.append(morph) # print("mec: ", mec_morphs) for sentence1 in sentences_tag:
'ADJ': counter['A-c'] + counter['A-dp'] + counter['J-c'] + counter['J-tari'] + counter['J-xs'] + counter['R'], 'ADV': counter['F'], 'CC': counter['C'] - len(subordinating_conjunctions), 'CS': len(subordinating_conjunctions), 'ET': counter['E'], 'I': counter['I-c'], 'NC': counter['N-n'] + counter['N-nc'], 'NP': counter['N-pn'], 'PREF': counter['P'], 'PRO': counter['D'], 'V': counter['V-c'] + counter['V-dp'] + counter['X'], 'PUNC': counter['M-aa'] + counter['M-cp'] + counter['M-op'] + counter['M-p'], } mecab_tagger = Mecab() twitter_tagger = Okt() def _analyze_ko(text): mecab_tags = mecab_tagger.pos(text) twitter_tags = twitter_tagger.pos(text) mecab_counter = collections.Counter([x[1] for x in mecab_tags]) twitter_counter = collections.Counter([x[1] for x in twitter_tags]) return { # we need to map the Japanese tagset to a subset of the French tagset, so that we can compare the two 'ADJ': twitter_counter['Adjective'], 'ADV': twitter_counter['Adverb'], 'CC': twitter_counter['Conjunction'], 'CS': mecab_counter['MAJ'], 'ET': twitter_counter['Foreign'], 'I': max(twitter_counter['Exclamation'], mecab_counter['IC']), 'NC': max(0, twitter_counter['Noun'] - mecab_counter['NNP'] - mecab_counter['NP']),
#daumNews = DaumNewsCrawling.DaumNewsCrawling(rowCnt) #daumNews.execute() #daumFinancing = DaumFinacingCrawling.DaumFinacingCrawling() #daumFinancing.execute() testArticle = DBStorage.DBStorage.instance().GetTableData(DaumNewsCrawling.NewsData(), "Index", 1165, "article"); kkma = Kkma() print(kkma.nouns(testArticle)) okt = Okt() print(okt.nouns((testArticle))) mecab = Mecab() print(mecab.nouns(testArticle)) hannanum = Hannanum() print(hannanum.nouns(testArticle)) komoran = Komoran() print(komoran.nouns(testArticle)) # class TestMPProcess: # # _max = 0 # _onSucess = None # def __init__(self, n, onsucess): # self._max = n # self._onSucess = onsucess
class embd_answer: def __init__(self): self.mecab = Mecab() self.load_data() def pre_phrase(self, phrase): for how in HOW: phrase = phrase.replace(how, HOW_TOKEN) for why in WHY: phrase = phrase.replace(why, WHY_TOKEN) for d in DEL: phrase = phrase.replace(d, '') return phrase def load_data(self): self.sentence = [] with open('./data/training_data.txt', encoding='cp949') as f: lines = f.readlines() for line in lines: line = line.split('\t') line[0] = self.pre_phrase(line[0]) self.sentence.append((int(line[1].replace('\n','')), self.mecab.morphs(line[0].replace('\n','')))) # training_set = [x[1] for x in self.sentence] # self.model = FastText(training_set, size=32, window=5, min_count=1, iter=10000, workers = 8) # self.model.save('./data/model') # print('training finish') self.model = FastText.load('./data/model') self.l = [] for index, word in self.sentence: avg = 0 for j in word: avg += self.model[j] avg = avg / len(word) self.l.append((index, avg)) def infer(self, phrase): phrase = self.pre_phrase(phrase) phrase = self.mecab.morphs(phrase) qv=0 for i in phrase: try: qv += self.model[i] except: qv += np.zeros((32)) pass qv = qv / len(phrase) max_ = 0 index = 0 for i, refer in self.l: tmp = cosine_similarity(refer.reshape(1,-1), qv.reshape(1,-1)) if tmp > max_: max_ = tmp index = i # print(index + 1) return index def infer_file(self, path = './data/training_data.txt'): test = [] with open(path, encoding='cp949') as f: lines = f.readlines() for line in lines: test.append(line.replace('\n','')) for q in test: print(self.infer(q))
import re import json import math, struct, sys import os.path #from konlpy.tag import Kkma #_analyzer= Kkma() from konlpy.tag import Mecab _analyzer= Mecab() def xplit(value): return re.split('\r\n|\n', value) def parse_nouns(did, text, dic_terms, f): candidates = xplit(text.strip()) for candidate in candidates: if len(candidate): nouns = _analyzer.nouns(candidate) for noun in nouns: value = dic_terms.get(noun, 0) dic_terms[noun] = value + 1 #terms_list = list(dic_terms.keys()).sort() def forward_indexing():
def __init__(self): self.mecab = Mecab() self.load_data()
import json from konlpy.tag import Mecab from konlpy.tag import Hannanum from konlpy.tag import Kkma from konlpy.tag import Komoran from konlpy.tag import Twitter import time useclass = Mecab() FILEPATH = "./data.json" DATA = {} def readjson(fn): f = open(fn, 'r') js = json.loads(f.read()) f.close() return js def main(): start_time = time.time() global FILEPATH global DATA DATA = readjson(FILEPATH) i = 0 for data in DATA: i += 1 no = data['no']
from tokenizers import BertWordPieceTokenizer # Initialize a tokenizer tokenizer = BertWordPieceTokenizer() # Then train it! tokenizer.train(["./sample.csv"]) # Now, let's use it: encoded = tokenizer.encode( "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다." ) print("WPM --------------") print(encoded.tokens) from konlpy.tag import Mecab print("Mecab --------------") mecab = Mecab() print( mecab.morphs( "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕 괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다." )) # And finally save it somewhere tokenizer.save(".", name="WPM")
def __init__(self, myDB): self.stopword = set() self.myDB = myDB self.tagger = Mecab(dicpath=r"C:\mecab\mecab-ko-dic")
class TextRank:
    """TextRank keyword extraction and extractive summarization for Korean.

    Builds a co-occurrence graph over POS-filtered tokens (keywords) or a
    sentence-similarity graph (summarize) and ranks nodes with PageRank.
    """

    def __init__(self, tokenizer=None, exceptional_stop_pos=[]):
        # POS tags discarded during tokenization (josa, endings, symbols...).
        self.stop_pos = [
            'IC', 'JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JC', 'JX',
            'XR', 'SF', 'SE', 'SSO', 'SSC', 'SC', 'SY', 'EC', 'EF', 'ETN',
            'ETM', 'XSV', 'XSA', 'XSN', 'XPN'
        ]
        if not tokenizer:
            self.tokenizer = Mecab()
        else:
            self.tokenizer = tokenizer
        # NOTE(review): this filter only runs when exceptional_stop_pos is
        # EMPTY, in which case it is a no-op -- the condition looks inverted.
        if not exceptional_stop_pos:
            self.stop_pos = [ x for x in self.stop_pos if x not in exceptional_stop_pos ]

    def pos_tagging(self, content, category="정치"):
        """Sentence-split *content* and POS-tag each word, applying
        category-specific merge dictionaries loaded from pickle files."""
        def subtokenize(pos_list, dct):
            # Rewrites token tuples by string substitution on the repr and
            # re-parses with eval.
            # NOTE(review): eval/pickle on file-derived strings -- unsafe if
            # the pickle files or content are not trusted.
            pos_str = "[" + ", ".join([str(t) for t in pos_list]) + "]"
            for pattern in dct:
                src = ", ".join([str(i) for i in pattern[1]])
                tgt = str(pattern[0])
                pos_str = pos_str.replace(src, tgt)
            tokenized_text = eval(pos_str)
            return tokenized_text

        def group_by_pos(tokens, join_char=''):
            # Merge a run of tokens into one; tag it NNP when it mixes more
            # than two distinct tags, otherwise keep the last tag.
            return (join_char.join([ t[0] for t in tokens ]),
                    'NNP' if len(set([t[1] for t in tokens])) > 2 else tokens[-1][1])

        def gen_tokens(tokens, join_char='',
                       group_by_pos_li=['NNP', 'NNG', 'SN', 'SH', 'SL', 'NNBC'],
                       stop_pos=[]):
            #last_token = tokens[0]
            ret = []
            li = []
            for t in tokens:
                if t[1] in group_by_pos_li:
                    # Accumulate a run of groupable tags.
                    li.append(t)
                else:
                    if len(li) > 0:
                        ret.append(group_by_pos(li, join_char))
                    if t[1] not in stop_pos:
                        ret.append(t)
                    li = []
            if len(li) > 0:
                ret.append(group_by_pos(li, join_char))
            return ret

        morphs_dict = pickle.load(open('morphs_dict.pickle', "rb"))
        comp_dict = pickle.load(open('comps_dict.pickle', "rb"))
        ret = []
        for s in sent_tokenize(content):
            #ret.append((s, sent_li))
            sent_li = []
            sent_li.append([(w, subtokenize(
                subtokenize(
                    gen_tokens(self.tokenizer.pos(w), stop_pos=self.stop_pos),
                    morphs_dict[category]),
                comp_dict[category])) for w in s.split()])
            #for w in s.split() : sent_li.append((w, gen_tokens(tokenizer.pos(w))))
            ret.append((s, sent_li))
            #ret.append((s, [t for t in gen_tokens(sent_li, join_char= ' ') if t[1] not in stop_pos]))
        return ret

    def keywords(self, text, n=10):
        """Return the top *n* (token, score) pairs by PageRank."""
        tokens = self.pos_tagging(text)
        tokens = [t for s in tokens for w in s[1] for t in w]
        # Graph nodes: nouns/verbs longer than one character.
        nodes = [
            k
            for t in tokens for k in t[1]
            if (k[1][0] in ['N', 'V']) & (len(k[0]) > 1)
        ]
        tokens = [k for t in tokens for k in t[1]]

        def connect(nodes, tokens):
            # Edge between two nodes that co-occur inside a sliding window.
            window_size = 5  # window size used to decide co-occurrence
            edges = []
            for window_start in range(0, (len(tokens) - window_size + 1)):
                window = tokens[window_start:window_start + window_size]
                #edges.append([(window[i], window[j]) for i in range(window_size) for j in range(window_size) if ( (i > j) & (window[i] in nodes) & (window[j] in nodes))])
                for i in range(window_size):
                    for j in range(window_size):
                        if (i > j) & (window[i] in nodes) & (window[j] in nodes):
                            edges.append((window[i], window[j]))
            return edges

        graph = nx.diamond_graph()
        graph.clear()  # clear leftover garbage nodes from the fresh graph
        graph.add_nodes_from(list(set(nodes)))  # register nodes
        graph.add_edges_from(connect(nodes, tokens))  # connect edges
        scores = nx.pagerank(graph)  # compute PageRank
        rank = sorted(scores.items(), key=lambda x: x[1], reverse=True)  # sort by score, descending
        return rank[:n]

    def print_keywords(self, text, n=10):
        """Pretty-print the top *n* keywords of *text*."""
        print("Keyword : ")
        for k in self.keywords(text, n):
            print("{} - {}".format(k[0][0], k[1]))

    def summarize(self, text, n=3):
        """Return the *n* highest-ranked sentences joined into one string."""
        tokens = self.pos_tagging(text)

        # Jaccard similarity between two token sets.
        def jaccard_similarity(query, document):
            intersection = set(query).intersection(set(document))
            union = set(query).union(set(document))
            return len(intersection) / len(union)

        # Sentence-to-sentence similarity over their morpheme lists.
        def sentence_similarity(sentence1, sentence2):
            sentence1 = self.tokenizer.morphs(
                sentence1[0]
            )  #[t[0] for s in sentence1[1][0] for t in s[1] if t[1][0] in ['N','V'] ]
            sentence2 = self.tokenizer.morphs(
                sentence2[0]
            )  #.split()#[t[0] for s in sentence2[1][0] for t in s[1] if t[1][0] in ['N','V'] ]
            #print(sentence1)
            return jaccard_similarity(sentence1, sentence2)

        def sentences(doc):
            return [s[0].strip() for s in doc]

        def connect(doc):
            # Weighted edges between every ordered pair of distinct sentences.
            return [(start[0].strip(), end[0].strip(),
                     sentence_similarity(start, end))
                    for start in doc for end in doc if start is not end]

        graph = nx.diamond_graph()
        graph.clear()  # clear leftover garbage nodes from the fresh graph
        graph.add_nodes_from(sentences(tokens))  # register nodes
        graph.add_weighted_edges_from(connect(tokens))  # connect edges
        scores = nx.pagerank(graph)  # compute PageRank
        #print(scores)
        rank = sorted(scores.items(), key=lambda x: x[1], reverse=True)  # sort by score, descending
        ssum = rank[:n]
        ranks = []
        for s in ssum:
            ranks.append(s[0])
        return ' '.join(ranks)
def calc_cfd(doc): # Calculate conditional frequency distribution of bigrams words = [w for w, t in Mecab().pos(doc)] bigrams = nltk.bigrams(words) return nltk.ConditionalFreqDist(bigrams)
def mecab_instance(): from konlpy.tag import Mecab m = Mecab() return m
from konlpy.tag import Mecab tokenizer = Mecab() txt_file = open( "/Users/angeonhui/Bert-abstractive-text-summarization/data/dataset/for_vocab/all_text_0216.txt", 'r') text_data = txt_file.read() txt_file.close() def whitespace_tokenize(data): data = data.strip() # 문자열의 맨앞, 맨끝 공백 지움 if not data: return [] tokens = data.split() # 문자열을 스페이스,탭,엔터 단위로 분리하여 배열에 집어넣음 return tokens output_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] for wst in whitespace_tokenize(text_data): # wst : 공백,탭,엔터 기준 문자열 하나 count = 0 for token in tokenizer.morphs(wst): # token : wst를 형태소 분석한 토큰 하나 tk = token if count > 0: tk = "##" + tk if tk in output_tokens: # 토큰이 중복되면 저장하지 않음 continue output_tokens.append(tk) else: # count==0
def __init__(self, vocab_file, do_lower_case=True): self.vocab = load_vocab(vocab_file) self.inv_vocab = {v: k for k, v in self.vocab.items()} self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) self.mecab_tokenizer = Mecab('../mecab-ko-dic-2.1.1-20180720')
from gensim.models import Word2Vec import pandas as pd from konlpy.tag import Mecab from tqdm import tqdm import pdb data = pd.read_csv("../data/train.txt", header=None, sep='\t') tokenizer = Mecab().morphs sentences = data[1] print("tokenize start") tokenized_texts = [] for sent in tqdm(sentences): try: tokenized_texts.append(tokenizer(sent)) except: pass print("tokenize end") pdb.set_trace() model = Word2Vec(tokenized_texts, size=200, window=3, min_count=1, workers=4) print("train model") model.save("mid.model") model.intersect_word2vec_format('./ko.bin') model.save("w2v.model") # model = Word2Vec.load('word2vec.model')
## 불용어 제거 import pandas as pd from tqdm import tqdm from konlpy.tag import Mecab mecab = Mecab(dicpath=r"C:/mecab/mecab-ko-dic") df = pd.read_excel('./data/작업표준 목록_조선외.xlsx') target = df["표준서명"] targets = [] results = [] stop_word = "전 난 일 걸 뭐 줄 만 건 작업 분 위 개 끝 송 잼 이거 부 동 번 중 듯 차 때 게 내 말 나 수 거 점 것 등 측 의 급 후 간 단 시 곳" stop_word = stop_word.split(' ') # print(stop_word) ########## for sentence in tqdm(target): result = [] for noun in mecab.nouns(sentence): if noun not in stop_word: result.append(noun) targets.append(sentence) results.append(result) summary = [targets, results] result_df = pd.DataFrame(summary) result_df = result_df.T print(result_df.head(20))
import os import json from konlpy.tag import Mecab from konlpy.tag import Hannanum from konlpy.tag import Kkma from konlpy.tag import Komoran from konlpy.tag import Twitter import pymysql import math import time import operator import sys FILEPATH="./result.json" DATA={} cls=list() cls.append(Mecab()) cls.append(Komoran()) cls.append(Twitter()) maxfreq=dict() site=sys.argv[1] site=site.strip() def TF(nouns): allsize=len(nouns) ret=dict() for noun in nouns: if(len(noun)<2): continue ret[noun]=1 return ret def TFIDF(allword, tf):
"test_han_20181106-20181113", "test_oh_20181106-20181113"]''' name_set = [ "동아일보", "경향신문", "[조중동]", "[한경오]", "조선일보", "중앙일보", "한겨례", "오마이뉴스" ] porgress_set = [False, True, False, True, False, False, True, True] for k in range(0, len(test_set)): news = name_set[k] test = test_set[k] date = test.split("_")[2] db_date = date.split("-")[1] df = pd.read_table( "train_20180503-20181119") # training set (이전 기사들, 여러 신문) pp = pd.read_table(test) # test set (최근 기사, 하나의 신문) mecab = Mecab() # reviews : title이 tokenizee된 list가 원소 # labels : reviews와 같은 index의 title의 label # _p : test set # all_tokens : training set 통해 test set의 판단 하려면 test set 의 토큰들이 같이 indexing 되어야함 # unique_tokens : token의 개수 체크 reviews = [] reviews_p = [] labels = [] labels_p = [] all_tokens = [] unique_tokens = dict() # training set tokenize for i in range(len(df)):
def language_processing(input_data):
    """Extract yes/no evidence for known nouns from a Korean sentence.

    POS-tags *input_data* with Mecab and, for each existence predicate
    ('있'/'없'/...), assigns its polarity to the nouns preceding it.
    Returns a dict mapping each InputLayer neuron name to 1 (present),
    -1 (absent) or 0 (unknown).
    """
    mecab = Mecab()

    # Per-noun yes/no data; e.g. if wings are present, check_data['날개'] == 1.
    check_data = dict()
    for name in [input_neuron.name for input_neuron in InputLayer.all_neuron]:
        # First initialize every entry to the "unknown" state.
        check_data[name] = 0

    # [*range(3)] is same with [0, 1, 2]
    word_list, pos_list = zip(*[(word, pos)
                                for word, pos in mecab.pos(input_data)
                                if pos in ['VV', 'VA', 'NNG', 'JC', 'SC', 'MAG', 'VX']])

    # Convert to mutable lists so already-processed words can later be
    # replaced with False (zip yields tuples).
    word_list = list(word_list)
    # Same reason.
    pos_list = list(pos_list)

    # Rewrite adjectives carrying a negative constituent adverb:
    # "날개가 안 보인다" --> "날개가 없다".
    yn_dict = {
        '있': 1,
        '들리': 1,
        '보이': 1,
        '없': -1,
        '모르': 0
    }
    """
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':
            # 성분 부사 이면서 부정 부사 일 경우
            word_list[index] = '없'
            # 부정으로 치환
            for i in range(len(pos_list[index:])):
                # 부정 부사 뒷 부분 탐색
                if pos_list[i] in ['VV', 'VA']:
                    # '있', '없' 등의 데이터가 나올 경우
                    try:
                        word_list[i] = yn_change[word_list[i]]
                        # yn_change 를 이용해 반전시킨다
                    except KeyError:
                        word_list
                        pass
    """
    # Strategy: find the adjectives first, then group the surrounding nouns.
    # Scan the POS data for modifiers such as '있'/'없'.
    for index in range(len(pos_list)):
        if pos_list[index] == 'MAG' and word_list[index] == '안':
            # Constituent adverb that is also a negative adverb:
            word_list[index] = '없'  # replace with the negative predicate
            pos_list[index] = 'VA'   # keep the POS data consistent
        if pos_list[index] in ['VA', 'VV']:
            # if pos is yn data: feed the statement about the preceding
            # nouns into the InputLayer neurons.
            try:
                yn = yn_dict[word_list[index]]
            except KeyError:
                yn = 0
            finally:
                # A negative auxiliary verb after the predicate flips the
                # polarity, e.g. "~하지 '않'는다"; search from the next index.
                tmp_index = index + 1
                while tmp_index < len(pos_list):
                    if pos_list[tmp_index] == 'VX':
                        if word_list[tmp_index] == '않':
                            yn *= -1
                        break
                    elif pos_list[tmp_index] == 'NNG':
                        break  # stop at the next noun
                    tmp_index += 1
                # Store the yn value for every noun seen so far.
                for nng in [word_list[i] for i in range(index) if pos_list[i] == 'NNG']:
                    # Skip words that were already consumed.
                    if nng is False:
                        continue
                    else:
                        try:
                            check_data[nng]
                        except KeyError:
                            pass
                        else:
                            check_data[nng] = yn
                # Replace the consumed words with False.
                word_list[:index] = ([False] * index)
    return check_data
import sys import codecs import re from sys import stdin from konlpy.tag import Mecab from konlpy.tag import Kkma # MeCab installation needed mecab = Mecab() UTF8Reader = codecs.getreader('utf8') sys.stdin = UTF8Reader(sys.stdin) jpatt = re.compile('J.*') spatt = re.compile('XSN') fpatt = re.compile('SF') upatt = re.compile('UNKNOWN') vpatt = re.compile('V.*') xpatt = re.compile('XSV') npatt = re.compile('N.*') x2patt = re.compile('XSA') ms_reg = r'\/{1,}' ms_reg2 = r'\s{2,}' log_epch = 10000 f_size = 448453 dbg_line=None testmod=None
def __init__(self, app): self.app = app self.mecab = Mecab() self.load_models()
#!/usr/bin/python3 from konlpy.tag import Mecab import sys import argparse if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("file", nargs="*", default=sys.stdin, type=argparse.FileType('r', encoding='utf-8')) args = parser.parse_args() mecab = Mecab() files = args.file if type(args.file) is list else [args.file] for f in files: for line in f.readlines(): print(' '.join(mecab.morphs(line.strip())))
class WordTokenizer(Tokenizer):
    """
    Word Tokenizer

    * Args:
        name: tokenizer name [treebank_en|spacy_en|mecab_ko|bert_basic]

    * Kwargs:
        flatten: return type as flatten list
        split_with_regex: post split action. Split tokens that the tokenizer cannot split.
    """

    def __init__(self, name, sent_tokenizer, config={}, split_with_regex=True):
        super(WordTokenizer, self).__init__(name, f"word-{name}+{sent_tokenizer.cache_name}")
        self.config = config
        self.sent_tokenizer = sent_tokenizer
        # Backend tokenizer is created lazily on first use (see _treebank_en
        # and friends below).
        self.word_tokenizer = None
        self.split_with_regex = split_with_regex
        if split_with_regex:
            self.extra_split_chars_re = self.make_split_regex_expression()

    def make_split_regex_expression(self):
        """
        Apply a small amount of extra splitting to the given tokens, this is in particular to avoid UNK tokens
        due to contraction, quotation, or other forms of puncutation. I haven't really done tests to see
        if/how much difference this makes, but it does avoid some common UNKs I noticed in SQuAD/TriviaQA
        """
        extra_split_chars = (
            "-", "£", "€", "¥", "¢", "₹", "*", "\u2212", "\u2014", "\u2013",
            "/", "~", '"', "'", "\ud01C", "\u2019", "\u201D", "\u2018",
            "\u00B0", ".", ":",
        )
        extra_split_tokens = (
            "``",
            "(?<=[^_])_(?=[^_])",  # dashes w/o a preceeding or following dash, so __wow___ -> ___ wow ___
            "''",
            "[" + "".join(extra_split_chars) + "]",
        )
        return re.compile("(" + "|".join(extra_split_tokens) + ")")

    @overrides
    def _tokenize(self, text, unit="text"):
        """ Text -> word tokens """
        if type(text) != str:
            raise ValueError(f"text type is must be str. not {type(text)}")
        if unit == "sentence":
            tokens = getattr(self, f"_{self.name}")(text)
        else:
            # Split into sentences first, then tokenize each sentence.
            sentences = self.sent_tokenizer.tokenize(text)
            tokens = [
                getattr(self, f"_{self.name}")(sentence)
                for sentence in sentences
            ]
            # NOTE(review): indentation of this post-split step was ambiguous
            # in the source; it is placed inside the per-sentence branch since
            # _split_with_regex expects a list of sentences -- confirm.
            if self.split_with_regex and self.name != "spacy_en":
                tokens = self._split_with_regex(tokens)
        return list(common_utils.flatten(tokens))

    def _split_with_regex(self, sentences):
        # Re-split every token of every sentence with the extra-chars regex.
        for i, sentence in enumerate(sentences):
            sentences[i] = [
                token for token in self._post_split_tokens(sentence)
            ]
        return sentences

    def _post_split_tokens(self, tokens):
        # Each token becomes a list of its non-empty regex fragments.
        return [[x for x in self.extra_split_chars_re.split(token) if x != ""]
                for token in tokens]

    """ Tokenizers """

    def _space_all(self, text):
        # Split on any whitespace character (incl. narrow no-break space).
        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F:
                return True
            return False

        prev_is_whitespace = True
        tokens = []
        for char in text:
            if is_whitespace(char):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    tokens.append(char)
                else:
                    tokens[-1] += char
                prev_is_whitespace = False
        return tokens

    def _treebank_en(self, text):
        # Lazily create the NLTK backend on first call.
        if self.word_tokenizer is None:
            import nltk
            self.word_tokenizer = nltk.TreebankWordTokenizer()
        return [
            token.replace("''", '"').replace("``", '"')
            for token in self.word_tokenizer.tokenize(text)
        ]

    def _spacy_en(self, text):
        # Lazily create the spaCy backend on first call.
        if self.word_tokenizer is None:
            from claf.tokens.tokenizer.utils import load_spacy_model_for_tokenizer
            self.word_tokenizer = load_spacy_model_for_tokenizer(
                self.extra_split_chars_re)

        def _remove_spaces(tokens):
            return [token.text for token in tokens if not token.is_space]

        return _remove_spaces(self.word_tokenizer(text))

    def _bert_basic(self, text):
        # Lazily create the transformers BasicTokenizer on first call.
        if self.word_tokenizer is None:
            from transformers import BasicTokenizer
            self.word_tokenizer = BasicTokenizer(**self.config)
        return self.word_tokenizer.tokenize(text)

    def _mecab_ko(self, text):
        # Lazily create the Mecab backend on first call.
        if self.word_tokenizer is None:
            from konlpy.tag import Mecab
            self.word_tokenizer = Mecab()
        return self.word_tokenizer.morphs(text)
def analyzing_morphem(content_list): mecab = Mecab() for idx, doc in enumerate(content_list): if idx % 5000 == 0 : print 'Morphem Analysis on %d' % idx yield ' '.join([part for part, pos in mecab.pos(doc.decode('utf-8'))]).encode('utf-8')