def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    # stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(
        lambda x: [t[0] for t in okt.pos(x['content'], stem=True)
                   if t[1] in ['Noun', 'Verb', 'Adjective']
                   and t[0] not in stopwords
                   and len(t[0]) != 1],
        axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: [t[0] for t in okt.pos(x['title'], stem=True)
                   if t[1] in ['Noun', 'Verb', 'Adjective']
                   and t[0] not in stopwords
                   and len(t[0]) != 1],
        axis=1)
    return df
# Refactored variant of the function above: the shared filtering logic is
# factored into a text_tokenize helper (sketched below).
def tokenize_okt_noscreen(df):
    okt = Twitter()
    okt.add_dictionary(call_userword(), 'Noun')
    stopwords = load_wordset('./tokenizer/korean_stopword.txt')
    # stopwords = stopwords | load_wordset('./tokenizer/korean_screen.txt')
    stopwords = list(stopwords)
    df['content_token'] = df.progress_apply(
        lambda x: text_tokenize(x['content'], okt, stopwords), axis=1)
    df['title_token'] = df.progress_apply(
        lambda x: text_tokenize(x['title'], okt, stopwords), axis=1)
    return df
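# text_tokenize is not defined in these snippets; a minimal sketch that is
# consistent with the inline filter in the first variant above (an assumption,
# not the original helper):
def text_tokenize(text, okt, stopwords):
    # Keep nouns, verbs, and adjectives; drop stopwords and single-character tokens.
    return [t[0] for t in okt.pos(text, stem=True)
            if t[1] in ['Noun', 'Verb', 'Adjective']
            and t[0] not in stopwords
            and len(t[0]) != 1]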
def Tokenizer(data):
    import pandas as pd
    from ckonlpy.tag import Twitter

    twitter = Twitter()
    # Add user-dictionary entries
    txt = pd.read_csv('사용자 사전.txt', sep='\n')
    txt = txt['<사용자 사전>']
    for line in txt:
        twitter.add_dictionary(line, 'Noun')  # bug fix: add each word, not the whole column

    # Tokenize the hashtags
    new_hashtags = data.hashtags.copy()
    for i in range(len(new_hashtags)):
        new_hashtags[i] = ' '.join(new_hashtags[i])
    tokenized = []
    for sentence in new_hashtags:
        tokenized.append(twitter.morphs(sentence))

    # Collapse consecutive duplicate tokens
    new_tokenized = []
    for tokens in tokenized:
        # bug fix: the original range(len(tokens) - 1) loop always dropped the last token
        temp = [t for i, t in enumerate(tokens) if i == 0 or t != tokens[i - 1]]
        new_tokenized.append(temp)
    return new_tokenized
class RawTagger:
    def __init__(self, textIter, tagger=None):
        # Preprocessing step 1: register words the morphological analyzer
        # fails to recognize as single tokens.
        with open('형태소 보완.txt') as f:
            extra_nouns = f.read().split('\n')
        if tagger:
            self.tagger = tagger
        else:
            self.tagger = Twitter()
        self.tagger.add_dictionary(extra_nouns, 'Noun')
        if isinstance(textIter, str):
            self.textIter = textIter.split('\n')
        else:
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')

    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s:
                    continue
                yield self.tagger.pos(s)
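# A minimal usage sketch for RawTagger (assumes a '형태소 보완.txt' word list
# exists next to the script): iterating over the object yields one POS-tagged
# sentence at a time.
raw = RawTagger('첫 번째 문장입니다. 두 번째 문장입니다!')
for tagged in raw:
    print(tagged)  # list of (morpheme, tag) tuples for one sentence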
# Morphological analysis
import os
import json
# from konlpy.tag import Okt
from ckonlpy.tag import Twitter

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# bug fix: os.path.join was being passed a single pre-concatenated string
with open(os.path.join(BASE_DIR, 't05', 'news1.txt'), 'r', encoding='UTF8') as file:
    text = file.read()

# okt = Okt()
twitter = Twitter()
twitter.add_dictionary('K리그', 'Noun')
content = twitter.morphs(text)

# Build a {index: morpheme} vocabulary, numbering from 1
voca_dict = {num: word for num, word in enumerate(content, start=1)}

with open(os.path.join(BASE_DIR, 't06', 'vocab.json'), 'w+', encoding='UTF-8-sig') as json_file:
    json.dump(voca_dict, json_file, ensure_ascii=False)
def twitter(self):
    cr_name = 'twitter'
    # Ensure the image save directory exists
    save_path = os.path.join(self.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(self.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(self.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists
    text_save_path = os.path.join(self.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(self.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(self.text_path)
        os.mkdir(text_save_path)

    keyword = self.scan_name

    # if self.platform == 'linux':
    #     print('System platform : Linux')
    #     self.driver_path = './static/lib/webDriver/chromedriver_lnx'
    #     from pyvirtualdisplay import Display
    #     self.display = Display(visible=0, size=(800, 600))
    #     self.display.start()

    # Browser setup (headless Chrome; virtual display on Linux)
    if self.platform == 'linux':
        display = Display(visible=0, size=(1024, 768))
        display.start()
        options = Options()
        options.binary_location = "/usr/bin/google-chrome"
        options.headless = True
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        chrome = webdriver.Chrome(executable_path=self.driver_path, options=options)
    else:
        chrome = self.generate_chrome(driver_path=self.driver_path,
                                      headless=self.headless,
                                      download_path=self.DOWNLOAD_DIR)

    # Open the Twitter search page for the keyword
    print("Twitter 접속중")
    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    chrome.implicitly_wait(30)
    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector(
        '#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div'
    )
    # Scroll the page and collect tweet text
    result = []
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in text2:
            result.append(re.sub('\n', '', ttt.text))
        print(result)
        time.sleep(1)
    if self.platform == 'linux':
        chrome.close()
        display.stop()

    # Register the custom dictionary, then extract nouns
    t = Twitter()
    t.add_dictionary(self.sajun(), 'Noun')
    print('단어사전 추출완료')
    tokens_ko = []
    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    # Flatten the per-tweet noun lists
    final = []
    for q in tokens_ko:
        final.extend(q)
    print('형태소분석 완료!')
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    # Save the collected tweets to a text file
    file = open(text_save_path + '/twitter{}.txt'.format(self.date), 'w', encoding='utf-8')
    for review in result:
        file.write(review + '\n')
    file.close()

    # Word cloud from the noun frequencies
    tmp_data = dict(data)
    wordcloud = WordCloud(
        font_path=self.fontPath,
        background_color='white',
        max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/twitter_{}.png".format(self.date),
                bbox_inches='tight', dpi=400, pad_inches=0)  # bug fix: bbox_inces -> bbox_inches
import requests
from bs4 import BeautifulSoup
from konlpy.tag import Okt
from nltk.corpus import stopwords
from ckonlpy.tag import Twitter, Postprocessor
from ckonlpy.utils import load_wordset, load_ngram

# nltk.download('punkt')
# nltk.download('stopwords')

okt = Okt()
twitter = Twitter()
# 'ANSI' is not a valid Python codec name; cp949 (the Windows-Korean ANSI code
# page) is assumed here for the stopword files.
stopwordsKR = load_wordset('cleansing_data/korean_stopwords.txt', encoding='cp949')
customStopwordsEN = load_wordset('cleansing_data/english_stopwords.txt', encoding='cp949')
stopwordsEN = customStopwordsEN.union(set(stopwords.words('english')))
ngrams = load_ngram('cleansing_data/korean_ngram.txt')
userdicts = load_wordset('cleansing_data/korean_user_dict.txt')
twitter.add_dictionary(list(userdicts), 'Noun', force=True)


def getJobGroups():
    res = requests.get(
        'https://www.wanted.co.kr/wdlist/518?country=kr&job_sort=job.latest_order&years=-1&locations=all'
    )
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    jobGroups = []
    for elements in soup.find("div", class_="_2h5Qtv_8mK2LOH-yR3FTRs").find_all("li"):
        href = elements.find("a")["href"]
        span = elements.find("span")
        jobGroup = {
def token(self, title, ccontent, creplies):
    memory = psutil.Process(os.getpid())
    T_OR_title = []
    T_title = []
    T_OR_ccontent = []
    T_ccontent = []
    T_OR_creplies = []
    T_creplies = []
    # bug fix: use ckonlpy's Twitter so custom dictionary entries can be
    # registered (konlpy's Okt has no add_dictionary)
    twitter = Twitter()
    twitter.add_dictionary('백래시', 'Noun')
    twitter.add_dictionary('문재앙', 'Noun')

    for i in range(len(title)):
        # Tokenize the title; drop Punctuation tuples
        a = twitter.pos(title[i])
        b = [t for t in a if t[1] != 'Punctuation']
        T_OR_title.append(b)
        T_title.append(twitter.morphs(title[i]))

        # Tokenize the article content
        try:
            c = twitter.pos(str(ccontent[i]))
            d = [t for t in c if t[1] != 'Punctuation']
            T_OR_ccontent.append(d)
            T_ccontent.append(twitter.morphs(str(ccontent[i])))
        except RuntimeError:
            T_OR_ccontent.append('')
            T_ccontent.append(twitter.morphs(''))

        # Tokenize the replies; drop Punctuation and KoreanParticle tuples.
        # bug fix: the original if/elif/else-break chain never actually
        # filtered anything, and T_OR_creplies was appended twice.
        if type(creplies[i]) == str:
            a = [creplies[i]]  # wrap a single string reply in a list
            e = twitter.pos(str(a))
            f = [t for t in e if t[1] not in ('Punctuation', 'KoreanParticle')]
            T_OR_creplies.append(f)
            T_creplies.append(twitter.morphs(str(a)))
        else:
            temp = []
            temp2 = []
            for reply in creplies[i]:  # replies arriving as a list
                h = twitter.pos(reply)
                x = [t for t in h if t[1] not in ('Punctuation', 'KoreanParticle')]
                temp.append(x)
                temp2.append(twitter.morphs(reply))
            T_OR_creplies.append(temp)
            T_creplies.append(temp2)
    return T_OR_title, T_title, T_OR_ccontent, T_ccontent, T_OR_creplies, T_creplies
from ckonlpy.tag import Twitter
from konlpy.tag import Hannanum, Kkma, Komoran, Okt
from eunjeon import Mecab

test_text = "확진자와 접촉자는 다중이용시설 이용을 삼가하고, 사회적 거리두기 운동에 동참하며, 진료소와 마스크 착용을 자제해주시기 바랍니다."

# Customized KoNLPy: register domain words so they survive as single tokens
twitter = Twitter()
twitter.add_dictionary(["확진자", "접촉자", "다중이용시설", "사회적", "거리두기", "진료소"], "Noun")
twitter.add_dictionary(["드립니다", "하시기", "해주시고", "해주시기", "지켜주십시오"], "Verb")
print(f"Customized Konlpy : {twitter.nouns(test_text)}")

# Hannanum
hannanum = Hannanum()
print(f"Hannanum : {hannanum.nouns(test_text)}")

# Kkma
kkma = Kkma()
print(f"Kkma : {kkma.nouns(test_text)}")

# Komoran
komoran = Komoran()
print(f"Komoran : {komoran.nouns(test_text)}")

# Okt
okt = Okt()
print(f"Okt : {okt.nouns(test_text)}")

# Mecab
mecab = Mecab()
print(f"Mecab : {mecab.nouns(test_text)}")
import re
from ckonlpy.tag import Twitter

with open('bitcoin_news.txt', 'r', encoding='utf8') as f:
    content = f.read()

# Strip unwanted punctuation and symbols from the article body
filtered_content = content.replace('.', '').replace(',', '').replace(
    "'", "").replace('·', ' ').replace('=', '').replace('"', '')
filtered_content = re.sub(r'▶.*', '', filtered_content)
filtered_content = re.sub(r'[^\.\?\!\w\d\s]', '', filtered_content)
print(filtered_content)

twitter = Twitter()
# Register out-of-vocabulary words such as 가상통화 and 아시아경제
twitter.add_dictionary('가상통화', 'Noun')
twitter.add_dictionary('아시아경제', 'Noun')
twitter.add_dictionary('한동안', 'Noun')
twitter.add_dictionary('블리클리', 'Noun')
twitter.add_dictionary('공동창립자', 'Noun')
twitter.add_dictionary('부크바', 'Noun')

# Morphological analysis
twitter_morphs = twitter.pos(filtered_content)

# Extract only the nouns
Noun_words = []
for word, pos in twitter_morphs:
    if pos == 'Noun':
        Noun_words.append(word)
print(Noun_words)
from ckonlpy.utils import load_replace_wordpair, load_ngram

replace = load_replace_wordpair('postprocess/replace.txt')
ngrams = load_ngram('postprocess/ngrams.txt')

okt = Okt()  # bug fix: the original `Okt = Okt()` shadowed the class with an instance
twitter = Twitter()

# Load user-defined nouns
new_nouns = []
with open('preprocess/dictionary.txt', encoding='utf8') as fd:
    for line in fd:
        new_nouns.append(line.strip('\n'))
twitter.add_dictionary(new_nouns, 'Noun')

passtags = {'Noun'}
postprocessor = Postprocessor(
    base_tagger=twitter,
    stopwords=stopwords,  # expected to be defined before this point
    # passwords=passwords,
    passtags=passtags,
    replace=replace,
    ngrams=ngrams)

token = []
nouns = []
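# `stopwords` is referenced above but never defined in this snippet; a
# plausible (assumed) definition using the same ckonlpy helper as the other
# scripts, which would need to run before the Postprocessor is constructed:
from ckonlpy.utils import load_wordset
stopwords = load_wordset('postprocess/stopwords.txt')  # hypothetical path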
# Register brand names as nouns
brand_set = np.array(
    [re.sub('[^가-힣]', '', x) for x in product_DB.brandname.unique()])
brand_set = pd.Series([x for x in brand_set if x != ''])

# Add domain-specific cosmetics vocabulary
brand_set = brand_set.append(pd.Series([
    '비씨데이션', '파데', '다크닝', '지속력', '밀착력', '피부톤', '커버력', '쿨톤', '웜톤', '결보정', '코끼임'
]), ignore_index=True)

# Load the customized Twitter tokenizer and register the dictionary
twitter = Twitter()
twitter.add_dictionary(brand_set, 'Noun')

# Load the original (unmodified) tokenizer as well
ori_twitter = original_Twitter()

#### emoji2text
preprocessed_text = emoji2text(review_DB.text)

#### Uppercase, then map tokens to products
preprocessed_text = pd.Series(preprocessed_text).apply(lambda x: x.upper())
bow = preprocessed_text.apply(lambda x: '◈'.join([
    token + '╹' + pos if pos in ['Noun']
    else ori_twitter.pos(token, stem=True, norm=True)[0][0] + '╹' +
         ori_twitter.pos(token, stem=True, norm=True)[0][1]
    for token, pos in twitter.pos(x)
]))

### Token normalizing
p_normal = {
class Social_analysis():
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def DB_to_table(self, DBname='intake', keyword='intake'):
        import pymssql
        import pandas.io.sql as pdsql
        self.query = """
        SELECT user_id, created_at, main_text, hashtags, comments, likes, current_url
        FROM instaPost
        WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433",
            "gh", "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#', ' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.values  # .as_matrix() was removed in recent pandas

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        hashtags_list = []
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    temp.append(j.translate(self.non_bmp_map))
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp, n_temp, adj_temp, verb_temp, nav_temp = [], [], [], [], []
            for i in parsed:
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in exception_list):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Verb':
                            # bug fix: verbs went to the adjective list (and vice versa)
                            verb_temp.append(i[0])
                            nav_temp.append(i[0])
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])
                            nav_temp.append(i[0])
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def pos_extractor(self, parsed, exception_list=['맛', '밥', '물', '몸']):
        # exception_list was referenced but never defined; the default mirrors morph_pos
        noun_list = []
        adj_list = []
        verb_list = []
        nav_list = []
        for i in parsed:
            n_temp, adj_temp, verb_temp, nav_temp = [], [], [], []
            if self.isHangul(i[0]):
                if (len(i[0]) > 1) or (i[0] in exception_list):
                    if i[1] == 'Noun':
                        n_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Verb':
                        verb_temp.append(i[0])
                        nav_temp.append(i[0])
                    elif i[1] == 'Adjective':
                        adj_temp.append(i[0])
                        nav_temp.append(i[0])
                else:
                    print('{} 제외'.format(i[0]))
            else:
                print('{} 한글이 아님.'.format(i[0]))
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
            nav_list.append(nav_temp)
        return nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for i in untokenized_list:  # bug fix: the parameter was never used
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def join_underbar(self, morph_list):
        all_list = []
        post_list = []
        for i in morph_list:
            for j in i:
                post_list.append(j[0] + '_' + j[1])
            all_list.append([(' , ').join(post_list)])
            post_list = []
        return np.array(all_list)

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', text))
        return hanCount > 0

    def convert_list(self, *tokenized_list):
        input_length = len(tokenized_list)
        lists = [[] for i in range(input_length)]
        for idx, li in enumerate(tokenized_list):
            for j in li:
                lists[idx].append(['/'.join(j)])
        converted_array = np.array(lists[0])
        for idx in range(input_length):
            try:
                converted_array = np.concatenate(
                    (converted_array, lists[idx + 1]), axis=1)
            except Exception as e:
                print(e, '끝')
        return converted_array

    def make_df(self, converted_array):
        # bug fix: use self.raw_data rather than the external `intake` instance
        df = pd.DataFrame(np.hstack(
            (self.raw_data[:, :3], converted_array, self.raw_data[:, 3:])),
            index=None)
        return df

    # True if the text contains at least one of the keywords
    def word_check_or(self, text, keywords):
        return any(word in text for word in keywords)

    # True if the text contains all of the keywords
    def word_check_and(self, text, keywords):
        return all(word in text for word in keywords)

    def word_check(self, method='and', keywords=[], df=None,
                   column_name=None, filter_TF=True):
        if method == 'and':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_and(x, keywords))
            return df[df.flag == filter_TF]
        if method == 'or':
            df['flag'] = df[column_name].apply(
                lambda x: self.word_check_or(x, keywords))
            return df[df.flag == filter_TF]
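# A minimal usage sketch for Social_analysis (the pickle path and keyword list
# are hypothetical; morph_pos returns per-document token lists):
sa = Social_analysis()
sa.pickle_to_table('intake_posts.pkl')  # hypothetical dump produced by a crawler
sa.add_keyword_dic(['단백질쉐이크', ('거리두기', 'Noun')])
morphs, navs, nouns, adjs, verbs = sa.morph_pos(sa.raw_data[:, 2])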
def Daum(self):
    cr_name = 'daum'
    # Ensure the image save directory exists
    save_path = os.path.join(self.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(self.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(self.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists
    text_save_path = os.path.join(self.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(self.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(self.text_path)
        os.mkdir(text_save_path)

    # Scrape the Daum news headlines
    http = []
    print('Daum 접속 중')
    httz = 'https://media.daum.net/ranking/popular/?regDate={}'.format(self.date)
    res = requests.get(httz)
    soup = BeautifulSoup(res.content, 'html.parser')
    body = soup.select('#mArticle > div.rank_news > ul.list_news2')
    body = body[0].find_all('a')
    for i in range(len(body)):
        http.append(body[i].get('href'))
    # Remove duplicate links
    http = list(set(http))

    files = pd.DataFrame()
    for i in range(len(http)):
        res = requests.get(http[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('.article_view')[0]
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={'class': 'head_view'}).h3.text,
                    'Contents': " ".join(p.get_text() for p in body.find_all('p')),
                    'link': http[i]
                },
                index=[i]))
    text2 = files.Contents

    # Save the articles as CSV
    files.to_csv(text_save_path + '/다음뉴스종합_{}.csv'.format(self.date),
                 index=False, encoding='utf-8')
    print('다음 텍스트 저장완료!')

    # Register the custom dictionary, then extract nouns
    t = Twitter()
    t.add_dictionary(self.sajun(), 'Noun')
    print('형태소 사전 업로드 완료!!')
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for q in tokens_ko:
        final.extend(q)
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    print('nltk 완료')

    # Daum scrapes about 50 pages, so single-character tokens are dropped;
    # anything important should be added to the dictionary and re-tagged.
    data_1 = [d for d in data if len(d[0]) >= 2]
    tmp_data = dict(data_1)

    print('wordcloud 실행')
    wordcloud = WordCloud(
        font_path=self.fontPath,
        background_color='white',
        max_words=230).generate_from_frequencies(tmp_data)
    print('wordcloud 실행!!!')
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/daum_{}.png".format(self.date),
                bbox_inches='tight', dpi=400, pad_inches=0)  # bug fix: bbox_inces -> bbox_inches
def Naver(self):
    cr_name = 'naver'
    # Ensure the image save directory exists
    save_path = os.path.join(self.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(self.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(self.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists
    text_save_path = os.path.join(self.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(self.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(self.text_path)
        os.mkdir(text_save_path)

    # Scrape the Naver headline links
    result = []
    res = []
    # Browser setup (headless Chrome; virtual display on Linux)
    if self.platform == 'linux':
        display = Display(visible=0, size=(800, 600))
        display.start()
        options = Options()
        options.binary_location = "/usr/bin/google-chrome"
        options.headless = True
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        chrome = webdriver.Chrome(executable_path=self.driver_path, options=options)
    else:
        chrome = self.generate_chrome(driver_path=self.driver_path,
                                      headless=self.headless,
                                      download_path=self.DOWNLOAD_DIR)

    print("Naver 접속중")
    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(self.date)
    chrome.get(url)
    chrome.implicitly_wait(30)

    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
    for i, q in enumerate(result):
        for e in q:
            res.append(e.get_attribute('href'))
    http = list(set(res))
    # Drop the ranking-page links themselves
    https = [link for link in http if link.find('popularDay') < 0]

    files = pd.DataFrame()
    if self.platform == 'linux':
        chrome.close()
        display.stop()
    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={'class': 'article_info'}).h3.text,
                    # The nested substitutions strip tabs and what appear to be
                    # non-breaking spaces in the scraped article body.
                    'Contents': re.sub(
                        ' ', '',
                        re.sub(
                            ' ', '',
                            re.sub('\t', '',
                                   self.cleanText(body[0].text)[
                                       (self.cleanText(body[0].text)).find('{}') + 2:]))),
                    'link': https[i]
                },
                index=[i]))
    text2 = files.Contents

    # Save the articles as CSV
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(self.date),
                 index=False, encoding='utf-8')

    # Build the dictionary, then extract nouns
    t = Twitter()
    t.add_dictionary(self.sajun(), 'Noun')
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for q in tokens_ko:
        final.extend(q)
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    # Drop single-character tokens
    data_1 = [d for d in data if len(d[0]) >= 2]
    tmp_data = dict(data_1)

    wordcloud = WordCloud(
        font_path=self.fontPath,
        background_color='white',
        max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/naver_{}.png".format(self.date),
                bbox_inches='tight', dpi=400, pad_inches=0)  # bug fix: bbox_inces -> bbox_inches
class PreprocessingText:
    def help(self):
        print("******PreprocessingText******")
        print("1) make_content_re(df['컬럼이름'](Series)) : 입력받은 열을 전처리 후 시리즈로 반환")
        print("2) add_noun_dict('list') : 명사 사전에 단어 추가")
        print("3) add_stopwords('list') : 불용어 사전에 단어 추가")
        print("4) tokenize(df['컬럼이름'](Series)) : 입력받은 열을 토큰화한 후 시리즈로 반환")
        print("5) change_similar_words(토큰화된 문서(Series), 유의어 사전(dictionary)) : 유의어 사전을 기반으로 문서 내 유의어를 대표어로 변환하고, 변환된 문서를 시리즈로 반환한다.")
        print("*****************************")

    def __init__(self):
        self.reg_reporter = re.compile('[가-힣]+\s[가-힣]*기자')  # reporter bylines
        self.reg_email = re.compile(
            '[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')  # email addresses
        self.reg_eng = re.compile('[a-z]+')  # lowercase letters (email leftovers); uppercase is kept
        self.reg_chi = re.compile("[\u4e00-\u9fff]+")  # Chinese characters
        self.reg_sc = re.compile(
            "·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+")  # special characters
        self.reg_date = re.compile(
            '\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+')  # dates, times, numbers
        self.twitter_obj = Twitter()
        self.stopwords = []
        self.noun_list = []

    def preprocessing(self, doc):
        tmp = re.sub(self.reg_reporter, '', doc)
        tmp = re.sub(self.reg_email, '', tmp)
        tmp = re.sub(self.reg_eng, '', tmp)
        tmp = re.sub(self.reg_chi, '', tmp)
        tmp = re.sub(self.reg_sc, ' ', tmp)
        tmp = re.sub(self.reg_date, '', tmp)
        return tmp

    def make_content_re(self, data):
        return data.apply(self.preprocessing)

    def add_noun_dict(self, noun_list):
        self.twitter_obj.add_dictionary(noun_list, 'Noun')
        self.noun_list.extend(noun_list)
        print("추가한 명사")
        print(noun_list)

    def add_stopwords(self, stopword_list):
        self.stopwords.extend(stopword_list)
        print("추가한 불용어")
        print(stopword_list)

    def change_similar_words(self, tokenized_docs, similar_words_dict):
        # Replace each synonym with its representative word
        changed_docs = []
        for doc in tokenized_docs:
            changed_doc = []
            for word in doc:
                if word in similar_words_dict.keys():
                    changed_doc.append(similar_words_dict[word])
                else:
                    changed_doc.append(word)
            changed_docs.append(changed_doc)
        return changed_docs

    def tokenize(self, data):
        print('추가한 명사:', self.noun_list)
        print('불용어: ', self.stopwords)
        tokenized_doc = data.apply(lambda x: self.twitter_obj.nouns(x))
        tokenized_doc_without_stopwords = tokenized_doc.apply(
            lambda x: [item.lower() for item in x if item not in self.stopwords])
        return pd.Series(tokenized_doc_without_stopwords)
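# A minimal usage sketch for the pipeline above (the DataFrame, column name,
# and word lists are hypothetical):
pt = PreprocessingText()
pt.add_noun_dict(['거리두기'])
pt.add_stopwords(['기자', '뉴스'])
clean = pt.make_content_re(df['content'])  # assumes a DataFrame `df` with a text column
tokens = pt.tokenize(clean)
tokens = pd.Series(pt.change_similar_words(tokens, {'코로나바이러스': '코로나'}))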
def naver():
    from selenium import webdriver
    from selenium.webdriver.common.keys import Keys
    from ckonlpy.tag import Twitter
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    import nltk
    import re
    import time

    cr_name = 'naver'
    # Ensure the image save directory exists
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    # Scrape the Naver headline links
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    result = []
    res = []
    # Browser setup
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)
    print("Naver 접속중")
    url = 'https://news.naver.com/main/ranking/popularDay.nhn?rankingType=popular_day&date={}'.format(date)
    chrome.get(url)
    time.sleep(2)
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
    for i, q in enumerate(result):
        for e in q:
            res.append(e.get_attribute('href'))
    http = list(set(res))
    # Drop the ranking-page links themselves
    https = [link for link in http if link.find('popularDay') < 0]

    files = pd.DataFrame()
    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={'class': 'article_info'}).h3.text,
                    'Contents': re.sub(
                        ' ', '',
                        re.sub(
                            ' ', '',
                            re.sub('\t', '',
                                   cleanText(body[0].text)[
                                       (cleanText(body[0].text)).find('{}') + 2:]))),
                    'link': https[i]
                },
                index=[i]))
    text2 = files.Contents

    # Save the articles as CSV
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False, encoding='utf-8')

    # Build the dictionary, then extract nouns
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for q in tokens_ko:
        final.extend(q)
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)
    # Drop single-character tokens
    data_1 = [d for d in data if len(d[0]) >= 2]

    tmp_data = dict(data_1)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)  # bug fix: bbox_inces -> bbox_inches
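# The following word-cloud snippet assumes `keyword`, `add_noun` (extra nouns
# for the dictionary), and `no_word` (words to exclude) are defined earlier;
# hypothetical example bindings:
keyword = '비트코인'              # hypothetical search keyword
add_noun = ['가상통화', 'K리그']   # hypothetical dictionary additions
no_word = ['기자', '뉴스']        # hypothetical exclusion list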
import nltk
import numpy as np
import matplotlib.pyplot as plt
from ckonlpy.tag import Twitter
from wordcloud import WordCloud
from matplotlib import font_manager, rc
from PIL import Image

font_name = font_manager.FontProperties(
    fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

tager = Twitter()
for add in add_noun:
    tager.add_dictionary(add, 'Noun')

text = open(keyword + '.txt', encoding='utf-8-sig').read()
tokens = tager.pos(text)

# Keep nouns that are not in the exclusion list
wordlist = []
for word in tokens:
    if word[1] in ['Noun'] and word[0] not in no_word:
        wordlist.append(word[0])

words = nltk.Text(wordlist, name=keyword)
top_words = words.vocab().most_common(1000)
words_dic = dict(top_words)
def kor_preprocessing(q, q3, df):
    data = df.copy().reset_index(drop=True)
    temp = []
    data = data.str.join('').str.replace(r"\n", "")
    data = data.str.replace(pat=r'[^\w]', repl=r'', regex=True)
    for i in range(len(data)):
        okt = Okt()
        new = okt.normalize(data[i])  # normalization
        new = only_hangle(new)
        new = emoticon_normalize(new, num_repeats=2)  # ㅋㅋㅋㅋㅋㅋ -> ㅋㅋ, ㅠㅠㅠㅠ -> ㅠㅠ
        data[i] = data[i].replace(" ", '')
        spacing = Spacing()
        new = spacing(data[i])  # apply spacing correction
        try:
            new = spell_checker.check(new).checked  # spelling correction
        except Exception:
            print(new)
        temp.append(new)
    data = pd.Series(temp)

    # Register slang and neologisms in the custom dictionary
    token = Twitter()
    adding_noun = [
        '식후감', '존맛', '개존맛', '꿀맛', '짱맛', '요기요', 'ㅈㅁㅌ', 'ㅃㄲ', '소확행', '민초',
        '치밥', '소맥', '넘사벽', '순삭', '빛삭', '광삭', '반반무', '반반무마니', '솔까말', '스압',
        '썸남', '썸녀', 'jmt', 'jmtg', 'jmtgr', 'JMT', 'JMTG', 'JMTGR', '배불띠', '돈쭐',
        '쿨타임', '닥추', '강추', '유튜버', '홧팅', '팟팅', '단짠단짠', '단짠', '맵단', '맵달',
        '맛도리', '부조캐', '밍밍쓰', '노맛', '존노맛', '최애', '차애', '섭스', '서빗', '프레젠또',
        '존맛탱', '개존맛탱', '존맛탱구리', '킹맛', '댕맛', '뿌링클', '로제', '오레오', '로투스',
        '사장님', '싸장님', '사장뉨', '소소한',  # bug fix: a missing comma silently concatenated two entries
        '프라프치노', ' 프라푸치노', '갓성비', '커엽', '굳잡', '굿잡', '굳굳', '이벵트', '이벵'
    ]
    for i in adding_noun:
        token.add_dictionary(i, 'Noun')  # registered as nouns
    adding_verb = ['맛나', '마이쩡', '마이쪙', '마시땅', '마시쩡', '마시쪙']
    for i in adding_verb:
        token.add_dictionary(i, 'Noun')  # colloquial verb forms, also registered as nouns here
    token.add_dictionary('잘', 'Noun')
    # NOTE: this rebinding replaces the customized tagger with a plain Okt,
    # so the dictionary entries above are not used by the loop below.
    token = Okt()

    # Stopword list
    with open('stop.txt', 'rt', encoding='UTF8') as f:
        stopwords = f.read().replace('\n', ' ')
    stopwords = stopwords.split(' ')

    result = []
    for i in range(len(data)):
        review = data[i]
        temp = token.morphs(review, norm=True, stem=True)
        # Remove stopwords
        stopwords_removed_sentence = [word for word in temp if word not in stopwords]
        sentence = ''
        for s in stopwords_removed_sentence:
            sentence = sentence + ' ' + s
        result.append(sentence)
    q.put(result)
    q3.put(df)
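# A minimal driver for the function above (hypothetical data; q and q3 are
# queues for handing results back from a worker process, which is how the
# q.put(...) calls at the end are consumed):
from multiprocessing import Process, Queue

q, q3 = Queue(), Queue()
reviews = pd.Series([['맛있어요 ㅋㅋㅋㅋ'], ['별로였음 ㅠㅠㅠㅠ']])  # Series of token lists, as .str.join('') expects
p = Process(target=kor_preprocessing, args=(q, q3, reviews))
p.start()
cleaned = q.get()   # preprocessed sentences
original = q3.get() # untouched input, echoed back
p.join()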
def twitter():
    import time
    import nltk

    cr_name = 'twitter'
    # Ensure the image save directory exists
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' 이미지 경로 확인 완료')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)
    # Ensure the text save directory exists
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' 텍스트 경로 확인 완료')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    keyword = Main.text()
    # Browser setup
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)
    print("Twitter 접속중")
    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)

    # text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div > main > div > div > div > div > div > div:nth-child(2) > div')
    # for i in range(15):
    #     for q in range(3):
    #         body = chrome.find_element_by_css_selector('body')
    #         body.send_keys(Keys.PAGE_DOWN)
    #         time.sleep(1)
    #     for ttt in tqdm(text2):
    #         result.append(ttt.text)
    #     time.sleep(1)
    # result2 = [result[i] for i in range(len(result)) if i % 2 == 0]
    # result3 = [cleanText(r) for r in result2]

    # Scroll the page and collect tweet text
    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector(
        '#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')
    result = []  # bug fix: result was never initialized
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in tqdm(text2):
            result.append(re.sub('\n', '', ttt.text))

    # Register the custom dictionary, then extract nouns
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    tokens_ko = []
    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    final = []
    for q in tokens_ko:
        final.extend(q)
    ko = nltk.Text(final, name="첫번째")
    data = ko.vocab().most_common(1000)

    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    # Save the collected tweets to a text file
    file = open(text_save_path + '/twitter{}.txt'.format(date2), 'w', encoding='utf-8')
    for review in result:
        file.write(review + '\n')
    file.close()

    tmp_data = dict(data)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/twitter_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)  # bug fix: bbox_inces -> bbox_inches
class Social_analysis():
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def hashtags_split(self, hashtags):
        hashtags_split = []
        for i in hashtags:
            hashtags_split.append(i.split('/'))
        hashtags_list = []
        for i in hashtags_split:
            temp = []
            for j in i:
                if self.isHangul(j):
                    temp.append(j.translate(self.non_bmp_map))
            hashtags_list.append(temp)
        self.hashtags_list = hashtags_list
        return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp, n_temp, adj_temp, verb_temp = [], [], [], []
            for i in parsed:
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in exception_list):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])  # bug fix: was appended to adj_temp
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])   # bug fix: was appended to verb_temp
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
        nav_list = noun_list + adj_list + verb_list
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for i in untokenized_list:  # bug fix: the parameter was never used
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', text))
        return hanCount > 0
class Social_analysis():
    non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)

    def __init__(self):
        self.twitter = Twitter()

    def pickle_to_table(self, filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        data = data[1:]
        for idx, i in enumerate(data):
            data[idx][2] = i[2].replace('#', ' ').translate(self.non_bmp_map)
            data[idx][3] = '/'.join(i[3])
            data[idx][4] = '/'.join(i[4])
        self.raw_data = np.array(data)

    def DB_to_table(self, DBname='intake', keyword='intake'):
        self.query = """
        SELECT keyword, created_at, post_name, main_text, current_url
        FROM NaverBlogReview
        WHERE keyword = '{}'
        """.format(keyword)
        conn = pymssql.connect(
            "intakedb.c63elkxbiwfc.us-east-2.rds.amazonaws.com:1433",
            "gh", "ghintake", DBname)
        df = pdsql.read_sql_query(self.query, con=conn)
        # df['main_text'] = df.main_text.apply(lambda x: x.replace('#', ' ').translate(self.non_bmp_map))
        # df['created_at'] = df.created_at.apply(lambda x: x.strftime("%Y-%m-%d %H:%M:%S"))
        conn.close()
        self.raw_data = df.values  # .as_matrix() was removed in recent pandas

    # def hashtags_split(self, hashtags):
    #     hashtags_split = []
    #     for i in hashtags:
    #         hashtags_split.append(i.split('/'))
    #     hashtags_list = []
    #     for i in hashtags_split:
    #         temp = []
    #         for j in i:
    #             if self.isHangul(j):
    #                 temp.append(j.translate(self.non_bmp_map))
    #         hashtags_list.append(temp)
    #     self.hashtags_list = hashtags_list
    #     return hashtags_list

    def add_keyword_dic(self, keyword_list, tag='Noun'):
        for i in keyword_list:
            if type(i) == tuple:
                self.twitter.add_dictionary(i[0], i[1])
            else:
                self.twitter.add_dictionary(i, tag)

    def morph_pos(self, text_list, exception_list=['맛', '밥', '물', '몸']):
        morph_list = []
        noun_list = []
        adj_list = []
        verb_list = []
        for j in text_list:
            parsed = self.twitter.pos(j)
            temp, n_temp, adj_temp, verb_temp = [], [], [], []
            for i in parsed:
                if self.isHangul(i[0]):
                    if (len(i[0]) > 1) or (i[0] in exception_list):
                        temp.append(i)
                        if i[1] == 'Noun':
                            n_temp.append(i[0])
                        elif i[1] == 'Verb':
                            verb_temp.append(i[0])  # bug fix: was appended to adj_temp
                        elif i[1] == 'Adjective':
                            adj_temp.append(i[0])   # bug fix: was appended to verb_temp
                    else:
                        print('{} 제외'.format(i[0]))
                else:
                    print('{} 한글이 아님.'.format(i[0]))
            morph_list.append(temp)
            noun_list.append(n_temp)
            adj_list.append(adj_temp)
            verb_list.append(verb_temp)
        nav_list = noun_list + adj_list + verb_list
        return morph_list, nav_list, noun_list, adj_list, verb_list

    def merge_list(self, tokenized_list):
        return [j for i in tokenized_list for j in i]

    def join_list(self, tokenized_list):
        joined_list = []
        for idx, i in enumerate(tokenized_list):
            joined_list.append(" ".join(i))
        return joined_list

    def split_list(self, untokenized_list):
        hashtag_splited = []
        for i in untokenized_list:  # bug fix: the parameter was never used
            hashtag_splited.append(i.split('/'))
        return hashtag_splited

    def word_substitute(self, dataset, sublist):
        dataset = copy.deepcopy(dataset)
        sub_book = dict()
        for i in sublist:
            for j in i['sub_words']:
                sub_book[j] = i['main']
        gc.collect()
        for n, i in enumerate(dataset):
            dataset[n] = [sub_book.get(item, item) for item in i]
        del sub_book
        gc.collect()
        return dataset

    def word_delete(self, dataset, del_list):
        dataset = copy.deepcopy(dataset)
        for n, line in enumerate(dataset):
            dataset[n] = [i for i in line if i not in del_list]
        return dataset

    def isHangul(self, text):
        hanCount = len(re.findall(u'[\u3130-\u318F\uAC00-\uD7A3]+', text))
        return hanCount > 0
# Inspect one training example (notebook-style expressions)
train_stories[3572]
train_questions[3572]
train_answers[3572]

twitter = Twitter()
# Without dictionary entries, the given names may be split into several morphemes
print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
print(twitter.morphs('수종이는 사무실로 갔습니다.'))
print(twitter.morphs('은경이는 침실로 갔습니다.'))

# Register the names as nouns, then tokenize the same sentences again
twitter.add_dictionary('은경이', 'Noun')
twitter.add_dictionary('경임이', 'Noun')
twitter.add_dictionary('수종이', 'Noun')
print(twitter.morphs('은경이는 화장실로 이동했습니다.'))
print(twitter.morphs('경임이는 정원으로 가버렸습니다.'))
print(twitter.morphs('수종이는 복도로 뛰어갔습니다.'))
print(twitter.morphs('필웅이는 부엌으로 복귀했습니다.'))
print(twitter.morphs('수종이는 사무실로 갔습니다.'))
print(twitter.morphs('은경이는 침실로 갔습니다.'))


def preprocess_data(train_data, test_data):
    counter = FreqDist()
    # Merge the two sentences of each story into one
    flatten = lambda data: reduce(lambda x, y: x + y, data)
from konlpy.tag import Komoran, Okt
from ckonlpy.tag import Twitter

okt = Okt()
twitter = Twitter()
sentence = 'IBK기업은행 '
sentences = '소은지국민은행계좌로30만원이체해줘'

komoran = Komoran()
twitter.add_dictionary('이체해줘', 'Noun')
twitter.add_dictionary('KB 국민은행', 'Noun')
komoran = Komoran(userdic="C:/Users/ADMIN/Desktop/dic.txt")

print(twitter.pos(sentence, stem=True))
print(twitter.pos(sentences, stem=True))
print(komoran.pos(sentence))
print(komoran.pos(sentences))

arr = komoran.pos(sentence)
for word, tag in arr:
    if tag == 'VV':
        print("|||||||")
        print(word, tag)
    if tag == 'JKO' or tag == 'JKB' or tag == 'JKS':
        print("|||||||")

brr = komoran.pos(sentences)
for word, tag in brr:
    if tag == 'VV' or tag == 'XSV':