import time

from ckonlpy.tag import Twitter


def run_twitter(news):
    twitter = Twitter()
    start_time = time.time()
    print('twitter start')
    # twitter_morphs = twitter.morphs(news)
    twitter_nouns = twitter.nouns(news)
    # twitter_pos = twitter.pos(news)
    end_time = time.time()
    # print(twitter_pos)
    print('twitter done - %s s' % str(end_time - start_time))
    with open('twitter_noun.txt', 'w', encoding='utf-8') as fstream:
        # fstream.write('twitter time : %s s\n' % str(end_time - start_time))
        # fstream.write('twitter_morphs\n')
        # write_list(twitter_morphs, fstream)
        # fstream.write('\n\n')
        # fstream.write('twitter_nouns\n')
        write_list(twitter_nouns, fstream)
        fstream.write('\n\n')
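# `write_list` is called above but not defined in this snippet. A minimal
# sketch of what it is assumed to do (one token per line to an already-open
# stream); this is an assumption, not the original helper:
def write_list(items, fstream):
    for item in items:
        fstream.write(str(item) + '\n')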
import time
import datetime
import csv

from ckonlpy.tag import Twitter
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Chrome("c:/Users/yooat/Downloads/chromedriver/chromedriver")
driver.get('http://www.cheonan.go.kr/covid19/sub02_01.do')
time.sleep(1)

twitter = Twitter()
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
name = soup.find_all("dl", class_="item")

# Dump every item to a text file, skipping entries whose text contains "14일이"
f1 = open('corona.txt', 'w+t')
for test in name:
    if "14일이" in test.get_text():
        continue
    f1.write(test.get_text() + "\n")
f1.close()

# Re-read the dump and write the extracted nouns of each line as a CSV row
f1 = open('corona.txt', 'r')
nowDate = datetime.datetime.now()
c = csv.writer(open(nowDate.strftime("result_" + "%Y-%m-%d_%H-%M-%S") + ".csv",
                    "w", encoding="cp949"))
for l in f1:
    c.writerow(twitter.nouns(l))
time.sleep(3)
import re

import pandas as pd
from ckonlpy.tag import Twitter


class PreprocessingText:

    def help(self):
        print("******PreprocessingText******")
        print("1) make_content_re(df['column'] (Series)): preprocesses the given column and returns it as a Series")
        print("2) add_noun_dict(list): adds words to the noun dictionary")
        print("3) add_stopwords(list): adds words to the stopword list")
        print("4) tokenize(df['column'] (Series)): tokenizes the given column and returns it as a Series")
        print("5) change_similar_words(tokenized docs (Series), synonym dict (dictionary)): "
              "replaces each synonym in the documents with its representative word, "
              "based on the synonym dictionary, and returns the converted documents")
        print("*****************************")

    def __init__(self):
        self.reg_reporter = re.compile(r'[가-힣]+\s[가-힣]*기자')  # reporter by-lines
        self.reg_email = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')  # e-mail addresses
        self.reg_eng = re.compile(r'[a-z]+')  # lowercase alphabet (e-mail leftovers); uppercase is kept
        self.reg_chi = re.compile("[\u4e00-\u9fff]+")  # Chinese characters
        self.reg_sc = re.compile(r"·|…|◆+|◇+|▶+|●+|▲+|“|”|‘|’|\"|\'|\(|\)|\W+")  # special characters
        self.reg_date = re.compile(r'\d+일|\d+월|\d+년|\d+시|\d+분|\(현지시간\)|\(현지시각\)|\d+')  # dates, times, numbers
        self.twitter_obj = Twitter()
        self.stopwords = []
        self.noun_list = []

    def preprocessing(self, doc):
        tmp = re.sub(self.reg_reporter, '', doc)
        tmp = re.sub(self.reg_email, '', tmp)
        tmp = re.sub(self.reg_eng, '', tmp)
        tmp = re.sub(self.reg_chi, '', tmp)
        tmp = re.sub(self.reg_sc, ' ', tmp)
        tmp = re.sub(self.reg_date, '', tmp)
        return tmp

    def make_content_re(self, data):
        pp_data = data.apply(self.preprocessing)
        return pp_data

    def add_noun_dict(self, noun_list):
        self.twitter_obj.add_dictionary(noun_list, 'Noun')
        self.noun_list.extend(noun_list)
        print("Added nouns:")
        print(noun_list)

    def add_stopwords(self, stopword_list):
        self.stopwords.extend(stopword_list)
        print("Added stopwords:")
        print(stopword_list)

    def change_similar_words(self, tokenized_docs, similar_words_dict):
        changed_docs = []
        for doc in tokenized_docs:
            changed_doc = []
            for word in doc:
                if word in similar_words_dict:
                    changed_doc.append(similar_words_dict[word])
                else:
                    changed_doc.append(word)
            changed_docs.append(changed_doc)
        return changed_docs

    def tokenize(self, data):
        print('Added nouns:', self.noun_list)
        print('Stopwords:', self.stopwords)
        tokenized_doc = data.apply(lambda x: self.twitter_obj.nouns(x))
        tokenized_doc_without_stopwords = tokenized_doc.apply(
            lambda x: [item.lower() for item in x if item not in self.stopwords])
        return pd.Series(tokenized_doc_without_stopwords)
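# A minimal usage sketch. The sample sentence, column name 'content', and the
# synonym mapping below are made up for illustration:
if __name__ == '__main__':
    df = pd.DataFrame({'content': ['홍길동 기자 1일 서울광장 행사 개최']})
    pp = PreprocessingText()
    pp.add_noun_dict(['서울광장'])      # register a domain noun before tokenizing
    pp.add_stopwords(['개최'])          # drop words we never want as tokens
    cleaned = pp.make_content_re(df['content'])
    tokens = pp.tokenize(cleaned)
    tokens = pp.change_similar_words(tokens, {'행사': '이벤트'})
    print(tokens)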
def naver():
    from selenium import webdriver
    import re
    from selenium.webdriver.common.keys import Keys
    import time

    cr_name = 'naver'

    # Make sure the image output directory exists
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' image path verified')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    # Make sure the text output directory exists
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' text path verified')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    # Scrape the Naver daily headline ranking
    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))
    result = []
    res = []

    # Browser setup
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)

    # Open the ranking page
    print("Connecting to Naver")
    url = ('https://news.naver.com/main/ranking/popularDay.nhn'
           '?rankingType=popular_day&date={}'.format(date))
    chrome.get(url)
    time.sleep(2)

    # Collect the <a> elements of each ranking block
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
    for i, q in enumerate(result):
        for e in q:
            res.append(e.get_attribute('href'))

    # Deduplicate and drop links that point back to the ranking page
    http = list(set(res))
    https = []
    for idx in range(len(http)):
        if http[idx].find('popularDay') >= 0:
            continue
        https.append(http[idx])

    # Fetch each article and collect title, body, and link
    files = pd.DataFrame()
    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={'class': 'article_info'}).h3.text,
                    'Contents': re.sub(' ', '', re.sub(' ', '', re.sub(
                        '\t', '',
                        cleanText(body[0].text)[(cleanText(body[0].text)).find('{}') + 2:]))),
                    'link': https[i]
                },
                index=[i]))
    text2 = files.Contents

    # Save the articles as CSV
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(date2),
                 index=False, encoding='utf-8')

    # Build the noun dictionary and tokenize
    from ckonlpy.tag import Twitter
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')

    import nltk
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    ko = nltk.Text(final, name="first")
    data = ko.vocab().most_common(1000)

    # Keep only tokens of two or more characters
    data_1 = []
    for i in range(len(data)):
        if len(data[i][0]) >= 2:
            data_1.append(data[i])

    # Render and save the word cloud
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    tmp_data = dict(data_1)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/naver_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)
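# Note: the isdir/mkdir ladders above recur in every crawler function in this
# section. A sketch of an equivalent one-liner (os.makedirs creates missing
# parent directories and, with exist_ok=True, tolerates an existing one):
#
#     save_path = os.path.join(Main.img_path, cr_name)
#     os.makedirs(save_path, exist_ok=True)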
# Module-level names assumed by this function: re, pickle, MongoClient
# (pymongo), and a ckonlpy `twitter` tagger instance.
def comm_date(comm_name, dates_array):
    for dates in dates_array:
        # Credentials are elided in the source ('*****:*****'), and the query
        # that builds `idate_with_all` was lost; the loop below reconstructs
        # only its assumed shape: iterate posts, clean, tokenize, accumulate.
        client = MongoClient('mongodb://*****:*****@')
        tokened_texts = []
        for i, text in enumerate(idate_with_all):
            # Strip special characters and whitespace artifacts
            text = re.sub('[\·\"\"\%\,\(\)\&]+', ' ', text)
            text = re.sub('[\n\xa0\r]+', ' ', text)
            # Tokenize: keep nouns only
            token = twitter.nouns(text)
            if token != []:
                tokened_texts.extend(token)
            print(dates, i, '/', len(idate_with_all))
        pickle_name = str(comm_name) + str(dates)
        with open(pickle_name, "wb") as fw:
            pickle.dump(tokened_texts, fw)
        print('save complete')
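# Reading a dump back (sketch): the file name follows comm_date's
# str(comm_name) + str(dates) convention; 'dcinside20200301' is hypothetical.
if __name__ == '__main__':
    import pickle
    with open('dcinside20200301', 'rb') as fr:
        tokens = pickle.load(fr)
    print(len(tokens), tokens[:10])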
def twitter():
    cr_name = 'twitter'

    # Make sure the image output directory exists
    save_path = os.path.join(Main.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' image path verified')
    elif os.path.isdir(Main.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(Main.img_path)
        os.mkdir(save_path)

    # Make sure the text output directory exists
    text_save_path = os.path.join(Main.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' text path verified')
    elif os.path.isdir(Main.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(Main.text_path)
        os.mkdir(text_save_path)

    import time
    import nltk

    keyword = Main.text()

    # Browser setup
    chrome = chromedriver.generate_chrome(driver_path=Main.driver_path,
                                          headless=Main.headless,
                                          download_path=Main.DOWNLOAD_DIR)

    # Open the Twitter search page
    print("Connecting to Twitter")
    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    time.sleep(3)

    # (superseded scraping variant, kept for reference)
    # text2 = chrome.find_elements_by_css_selector('#react-root > div > div > div > main > div > div > div > div > div > div:nth-child(2) > div')
    # for i in range(15):
    #     for q in range(3):
    #         body = chrome.find_element_by_css_selector('body')
    #         body.send_keys(Keys.PAGE_DOWN)
    #         time.sleep(1)
    #     for ttt in tqdm(text2):
    #         result.append(ttt.text)
    #         time.sleep(1)
    #
    # result2 = []
    # for i in range(len(result)):
    #     if i % 2 == 0:
    #         result2.append(result[i])
    # print(len(result2))
    #
    # result3 = []
    # for i in range(len(result2)):
    #     result3.append(cleanText(result2[i]))

    # Scroll and collect tweet texts
    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector(
        '#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')
    result = []  # was missing in the original; holds the collected tweet texts
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in tqdm(text2):
            result.append(re.sub('\n', '', ttt.text))

    # Tokenize with the custom noun dictionary
    t = Twitter()
    t.add_dictionary(Main.sajun(), 'Noun')
    tokens_ko = []
    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    ko = nltk.Text(final, name="first")
    data = ko.vocab().most_common(1000)

    date = time.strftime('%Y%m%d', time.localtime(time.time()))
    date2 = time.strftime('%Y%m%d_%H%M', time.localtime(time.time()))

    # Save the raw tweets to a text file
    file = open(text_save_path + '/twitter{}.txt'.format(date2), 'w',
                encoding='utf-8')
    for review in result:
        file.write(review + '\n')
    file.close()

    # Render and save the word cloud
    tmp_data = dict(data)
    wordcloud = WordCloud(font_path='/Library/Fonts/NanumMyeongjo.ttf',
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/twitter_{}.png".format(date),
                bbox_inches='tight', dpi=400, pad_inches=0)
def Daum(self):
    cr_name = 'daum'

    # Make sure the image output directory exists
    save_path = os.path.join(self.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' image path verified')
    elif os.path.isdir(self.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(self.img_path)
        os.mkdir(save_path)

    # Make sure the text output directory exists
    text_save_path = os.path.join(self.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' text path verified')
    elif os.path.isdir(self.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(self.text_path)
        os.mkdir(text_save_path)

    # Scrape the Daum popular-news headlines
    http = []
    print('Connecting to Daum')
    httz = 'https://media.daum.net/ranking/popular/?regDate={}'.format(self.date)
    res = requests.get(httz)
    soup = BeautifulSoup(res.content, 'html.parser')
    body = soup.select('#mArticle > div.rank_news > ul.list_news2')
    body = body[0].find_all('a')
    for i in range(len(body)):
        t = body[i].get('href')
        http.append(t)

    # Deduplicate
    http = list(set(http))

    # Fetch each article and collect title, body, and link
    files = pd.DataFrame()
    for i in range(len(http)):
        res = requests.get(http[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('.article_view')[0]
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={'class': 'head_view'}).h3.text,
                    'Contents': " ".join(p.get_text() for p in body.find_all('p')),
                    'link': http[i]
                },
                index=[i]))
    text2 = files.Contents

    # Save the articles as CSV
    files.to_csv(text_save_path + '/다음뉴스종합_{}.csv'.format(self.date),
                 index=False, encoding='utf-8')
    print('Daum text saved!')

    # Tokenize with the custom noun dictionary
    t = Twitter()
    t.add_dictionary(self.sajun(), 'Noun')
    print('noun dictionary uploaded!!')
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    ko = nltk.Text(final, name="first")
    data = ko.vocab().most_common(1000)
    print('nltk done')

    # Daum news covers 50 pages, so single-character tokens are dropped here;
    # anything still needed should be added to the dictionary and re-tagged.
    data_1 = []
    for i in range(len(data)):
        if len(data[i][0]) >= 2:
            data_1.append(data[i])

    # Render and save the word cloud
    tmp_data = dict(data_1)
    print('running wordcloud')
    wordcloud = WordCloud(font_path=self.fontPath,
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    print('wordcloud generated!!!')
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/daum_{}.png".format(self.date),
                bbox_inches='tight', dpi=400, pad_inches=0)
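# Note: pandas removed DataFrame.append in 2.0. On newer pandas, the
# row-accumulation loops in these crawlers can collect per-article frames in a
# list and concatenate once. A sketch:
#
#     rows = []
#     for i in range(len(http)):
#         rows.append(pd.DataFrame({...}, index=[i]))
#     files = pd.concat(rows)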
def Naver(self):
    cr_name = 'naver'

    # Make sure the image output directory exists
    save_path = os.path.join(self.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' image path verified')
    elif os.path.isdir(self.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(self.img_path)
        os.mkdir(save_path)

    # Make sure the text output directory exists
    text_save_path = os.path.join(self.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' text path verified')
    elif os.path.isdir(self.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(self.text_path)
        os.mkdir(text_save_path)

    # Scrape the Naver daily headline ranking
    result = []
    res = []

    # Browser setup (virtual display + headless Chrome on Linux)
    if self.platform == 'linux':
        display = Display(visible=0, size=(800, 600))
        display.start()
        options = Options()
        options.binary_location = "/usr/bin/google-chrome"
        options.headless = True
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        chrome = webdriver.Chrome(executable_path=self.driver_path,
                                  options=options)
    else:
        chrome = self.generate_chrome(driver_path=self.driver_path,
                                      headless=self.headless,
                                      download_path=self.DOWNLOAD_DIR)

    # Open the ranking page
    print("Connecting to Naver")
    url = ('https://news.naver.com/main/ranking/popularDay.nhn'
           '?rankingType=popular_day&date={}'.format(self.date))
    chrome.get(url)
    chrome.implicitly_wait(30)

    # Collect the <a> elements of each ranking block
    for sun in range(4, 10):
        pr = chrome.find_elements_by_xpath(
            '//*[@id="wrap"]/table/tbody/tr/td[2]/div/div[{}]'.format(sun))
        for p in pr:
            result.append(p.find_elements_by_tag_name('a'))
    for i, q in enumerate(result):
        for e in q:
            res.append(e.get_attribute('href'))

    # Deduplicate and drop links that point back to the ranking page
    http = list(set(res))
    https = []
    for idx in range(len(http)):
        if http[idx].find('popularDay') >= 0:
            continue
        https.append(http[idx])

    files = pd.DataFrame()
    if self.platform == 'linux':
        chrome.close()
        display.stop()

    # Fetch each article and collect title, body, and link
    for i in range(len(https)):
        res = requests.get(https[i])
        soup = BeautifulSoup(res.content, 'html.parser')
        body = soup.select('._article_body_contents')
        files = files.append(
            pd.DataFrame(
                {
                    'Title': soup.find('div', attrs={'class': 'article_info'}).h3.text,
                    'Contents': re.sub(' ', '', re.sub(' ', '', re.sub(
                        '\t', '',
                        self.cleanText(body[0].text)[(self.cleanText(body[0].text)).find('{}') + 2:]))),
                    'link': https[i]
                },
                index=[i]))
    text2 = files.Contents

    # Save the articles as CSV
    files.to_csv(text_save_path + '/네이버종합뉴스_{}.csv'.format(self.date),
                 index=False, encoding='utf-8')

    # Tokenize with the custom noun dictionary
    t = Twitter()
    t.add_dictionary(self.sajun(), 'Noun')
    tokens_ko = []
    for i in range(len(text2)):
        tokens_ko.append(t.nouns(text2[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    ko = nltk.Text(final, name="first")
    data = ko.vocab().most_common(1000)

    # Keep only tokens of two or more characters
    data_1 = []
    for i in range(len(data)):
        if len(data[i][0]) >= 2:
            data_1.append(data[i])

    # Render and save the word cloud
    tmp_data = dict(data_1)
    wordcloud = WordCloud(font_path=self.fontPath,
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/naver_{}.png".format(self.date),
                bbox_inches='tight', dpi=400, pad_inches=0)
def twitter(self):
    cr_name = 'twitter'

    # Make sure the image output directory exists
    save_path = os.path.join(self.img_path, cr_name)
    if os.path.isdir(save_path):
        print(cr_name + ' image path verified')
    elif os.path.isdir(self.img_path):
        os.mkdir(save_path)
    else:
        os.mkdir(self.img_path)
        os.mkdir(save_path)

    # Make sure the text output directory exists
    text_save_path = os.path.join(self.text_path, cr_name)
    if os.path.isdir(text_save_path):
        print(cr_name + ' text path verified')
    elif os.path.isdir(self.text_path):
        os.mkdir(text_save_path)
    else:
        os.mkdir(self.text_path)
        os.mkdir(text_save_path)

    keyword = self.scan_name

    # Browser setup (virtual display + headless Chrome on Linux)
    if self.platform == 'linux':
        display = Display(visible=0, size=(1024, 768))
        display.start()
        options = Options()
        options.binary_location = "/usr/bin/google-chrome"
        options.headless = True
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')
        chrome = webdriver.Chrome(executable_path=self.driver_path,
                                  options=options)
    else:
        chrome = self.generate_chrome(driver_path=self.driver_path,
                                      headless=self.headless,
                                      download_path=self.DOWNLOAD_DIR)

    # Open the Twitter search page
    print("Connecting to Twitter")
    url = 'https://twitter.com/search?q={}&src=typed_query'.format(keyword)
    chrome.get(url)
    chrome.implicitly_wait(30)

    # Scroll and collect tweet texts
    body = chrome.find_element_by_css_selector('body')
    text2 = chrome.find_elements_by_css_selector(
        '#react-root > div > div > div.css-1dbjc4n.r-18u37iz.r-13qz1uu.r-417010 > main > div > div > div > div > div > div:nth-child(2) > div > div > section > div')
    result = []
    for i in range(10):
        for q in range(3):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(1)
        for ttt in text2:
            result.append(re.sub('\n', '', ttt.text))
            print(result)
            time.sleep(1)

    if self.platform == 'linux':
        chrome.close()
        display.stop()

    # Tokenize with the custom noun dictionary
    t = Twitter()
    t.add_dictionary(self.sajun(), 'Noun')
    print('noun dictionary loaded')
    tokens_ko = []
    for i in range(len(result)):
        tokens_ko.append(t.nouns(result[i]))
    final = []
    for _, q in enumerate(tokens_ko):
        for i in range(len(q)):
            final.insert(-1, q[i])
    print('tokenizing done!')
    ko = nltk.Text(final, name="first")
    data = ko.vocab().most_common(1000)

    # Save the raw tweets to a text file
    file = open(text_save_path + '/twitter{}.txt'.format(self.date), 'w',
                encoding='utf-8')
    for review in result:
        file.write(review + '\n')
    file.close()

    # Render and save the word cloud
    tmp_data = dict(data)
    wordcloud = WordCloud(font_path=self.fontPath,
                          background_color='white',
                          max_words=230).generate_from_frequencies(tmp_data)
    plt.figure(figsize=(10, 8))
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.xticks([])
    plt.yticks([])
    plt.tight_layout()
    plt.subplots_adjust(left=0, bottom=0, right=1, top=1, hspace=0, wspace=0)
    plt.savefig(save_path + "/twitter_{}.png".format(self.date),
                bbox_inches='tight', dpi=400, pad_inches=0)