def press_crawling(self, oid=215, aid=20):
    headers = {'User-Agent': 'Mozilla/5.0'}
    writer = Writer_press(category_name="aid123")
    url = 'https://sports.news.naver.com/news.nhn?'
    oid = 'oid=' + str(oid)  # the oid (press agency id) passed in is fixed here
    for i in tqdm(range(1, aid), desc="Crawling rate", mininterval=0.01):
        # print(i)
        aid = str(i)  # reusing the name aid is safe: range(1, aid) was already evaluated
        aid_length = len(aid)
        aid = '&aid=' + '0' * (10 - aid_length) + aid
        url1 = url + oid + aid
        b = requests.get(url1, headers=headers)
        # print(url1)
        document = BeautifulSoup(b.content, 'html.parser')
        tag_content = document.find_all('div', {'id': 'newsEndContents'})
        text_sentence = ''
        text_sentence = text_sentence + ArticleParser.clear_content(
            str(tag_content[0].find_all(text=True)))
        # print(text_sentence)
        headline = ''
        tag_headline = document.find_all('h4', {'class': 'title'})
        headline = headline + ArticleParser.clear_headline(
            str(tag_headline[0].find_all(text=True)))
        article_info = document.find_all('div', {'class': 'info'})
        # it would be good to clean up article_info here
        writer.wcsv.writerow([headline, text_sentence, url1])
    print()
    writer.close()
def Keyword_crawling(self, keyword):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = 'https://search.naver.com/search.naver?sm=tab_hty.top&where=news&query='
    url = url + keyword
    list_a = []
    for i in tqdm(range(0, 20), desc="append url list", mininterval=0.01):
        s_num = "&start=" + str(i) + "1"
        b = requests.get(url + s_num, headers=headers)
        document = BeautifulSoup(b.content, 'html.parser')
        writer = Writer_press(category_name='_' + keyword, text_c="Keyword_crawling")
        list_url = document.select('.list_news .info')
        for line in list_url:
            list_a.append(line.get('href'))
            # print(line.get('href'))
    print('')
    list_b = []
    list_c = []
    for line in list_a:
        if line is None:
            continue
        elif line.find('naver') != -1:
            list_b.append(line)
    for line in list_b:
        fnum = line.find('oid')
        if len(line[fnum:]) == 22:  # keep only links ending in an "oid=...&aid=..." pair
            list_c.append(line[fnum:])
    url = 'https://sports.news.naver.com/news.nhn?'
    list_num = len(list_c)
    print(list_c)
    for i in tqdm(range(0, list_num), desc="Crawling rate", mininterval=0.01):
        print('')
        url_a = url + list_c[i]
        b = requests.get(url_a, headers=headers)
        document = BeautifulSoup(b.content, 'html.parser')
        tag_content = document.find_all('div', {'id': 'newsEndContents'})
        if len(tag_content) != 0:
            text_sentence = ''
            text_sentence = text_sentence + ArticleParser.clear_content(
                str(tag_content[0].find_all(text=True)))
            headline = ''
            tag_headline = document.find_all('h4', {'class': 'title'})
            if len(tag_headline) != 0:
                headline = headline + ArticleParser.clear_headline(
                    str(tag_headline[0].find_all(text=True)))
            if headline == '':
                headline = '-'
            article_info = document.find_all('div', {'class': 'info'})
            writer.wcsv.writerow(
                [headline + '\t' + text_sentence + '\t' + url_a])
def __init__(self):
    self.parser = ArticleParser()
    self.category = {
        '정치': 100, '경제': 101, '사회': 102,
        '생활문화': 103, '세계': 104, 'IT과학': 105
    }
    self.selected_category = []
    self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}
def press_crawling(self, oid, aid, name):
    headers = {'User-Agent': 'Mozilla/5.0'}
    url = 'https://sports.news.naver.com/news.nhn?'
    writer = Writer_press(category_name=str(aid), text_c=name)
    self.writer = writer
    oid = 'oid=' + oid
    for i in tqdm(range(1, aid), desc="Crawling rate", mininterval=0.01):
        self.num += 1
        aid = str(i)
        aid_length = len(aid)
        aid = '&aid=' + '0' * (10 - aid_length) + aid
        url1 = url + oid + aid
        b = requests.get(url1, headers=headers)
        # print(url1)
        document = BeautifulSoup(b.content, 'html.parser')
        tag_content = document.find_all('div', {'id': 'newsEndContents'})
        if len(tag_content) != 0:
            # print(url1)
            text_sentence = ''
            text_sentence = text_sentence + ArticleParser.clear_content(
                str(tag_content[0].find_all(text=True)))
            # print(text_sentence)
            headline = ''
            tag_headline = document.find_all('h4', {'class': 'title'})
            headline = headline + ArticleParser.clear_headline(
                str(tag_headline[0].find_all(text=True)))
            article_info = document.find_all('div', {'class': 'info'})
            # it would be good to clean up article_info here
            # ====================================================
            # get the article publication time
            input_time = str(article_info[0])
            iTime = ""
            iTime = self.inputTime(input_time)
            # print("\n")
            # print(iTime)
            # ===================================================
            if headline == '':
                headline = '-'
            writer.wcsv.writerow([
                headline + '\t' + text_sentence + '\t' + url1 + '\t' + iTime
            ])
    print()
    writer.close()
def get_sentence_from_document(self, document):
    tag_content = document.find_all('div', {'id': 'articleBodyContents'})
    text_sentence = ''  # initialize the article body text
    text_sentence = text_sentence + ArticleParser.clear_content(
        str(tag_content[0].find_all(text=True)))
    if not text_sentence or len(text_sentence) < 500:  # exclude articles that are empty or too short
        return None
    else:
        return text_sentence
def get_headline_from_document(self, document):
    tag_headline = document.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
    text_headline = ''  # initialize the article headline
    text_headline = text_headline + ArticleParser.clear_headline(
        str(tag_headline[0].find_all(text=True)))
    if not text_headline:  # exclude articles whose headline is empty
        return None
    else:
        return text_headline
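# A minimal usage sketch (not part of the original source) showing how the two helpers
# above could be combined. It assumes requests and BeautifulSoup are imported, that the
# helpers live on a crawler instance named `crawler`, and that the article URL is supplied
# by the caller; all of those names are illustrative.
def fetch_article(crawler, article_url):
    response = requests.get(article_url, headers={'User-Agent': 'Mozilla/5.0'})
    document = BeautifulSoup(response.content, 'html.parser')
    headline = crawler.get_headline_from_document(document)
    body = crawler.get_sentence_from_document(document)
    if headline is None or body is None:
        return None  # filtered out: empty headline, or body shorter than 500 characters
    return headline, body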
def get_news_title(n_url):
    breq = requests.get(n_url)
    bsoup = BeautifulSoup(breq.content, 'html.parser')
    get_title_raw = bsoup.select(
        'h3#articleTitle')[0].text  # [0] takes only the first element matching h3#articleTitle
    get_title = ''
    get_title = get_title + ArticleParser.clear_headline(get_title_raw)
    if not get_title:
        return None
    return get_title
def make_news_page_url(category_url, year, month, day):
    made_urls = []
    if len(str(month)) == 1:
        month = "0" + str(month)
    if len(str(day)) == 1:
        day = "0" + str(day)
    url = category_url + str(year) + str(month) + str(day)
    totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")
    print(totalpage)
    for page in range(1, totalpage + 1):
        made_urls.append(url + "&page=" + str(page))
    return made_urls
def __init__(self):
    self.parser = ArticleParser()
    self.categories = {
        '야구': "kbaseball", '해외야구': "wbaseball",
        '축구': "kfootball", '해외축구': "wfootball",
        '농구': "basketball", '배구': "volleyball",
        '골프': "golf", '일반': "general", 'e스포츠': "esports"
    }
    self.selected_categories = []
    self.date = {
        'start_year': 0, 'start_month': 0, 'start_day': 0,
        'end_year': 0, 'end_month': 0, 'end_day': 0
    }
    self.user_operating_system = str(platform.system())
def make_news_page_url(category_url, start_year, end_year, start_month, end_month, start_day, end_day):
    made_urls = []
    for year in range(start_year, end_year + 1):
        if start_year == end_year:
            year_startmonth = start_month
            year_endmonth = end_month
        else:
            if year == start_year:
                year_startmonth = start_month
                year_endmonth = 12
            elif year == end_year:
                year_startmonth = 1
                year_endmonth = end_month
            else:
                year_startmonth = 1
                year_endmonth = 12
        for month in range(year_startmonth, year_endmonth + 1):
            if start_month == end_month:
                month_startday = start_day
                month_endday = end_day
            else:
                if month == start_month:
                    month_startday = start_day
                    month_endday = calendar.monthrange(year, month)[1]
                elif month == end_month:
                    month_startday = 1
                    month_endday = end_day
                else:
                    month_startday = 1
                    month_endday = calendar.monthrange(year, month)[1]
            for day in range(month_startday, month_endday + 1):
                if len(str(month)) == 1:
                    month = "0" + str(month)
                if len(str(day)) == 1:
                    day = "0" + str(day)
                # build the page url for each date
                url = category_url + str(year) + str(month) + str(day)
                # totalpage is found by requesting page=10000 and relying on Naver's page structure:
                # page=10000 does not exist, so the request is redirected to page=totalpage
                totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")
                for page in range(1, totalpage + 1):
                    made_urls.append(url + "&page=" + str(page))
    return made_urls
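# ArticleParser.find_news_totalpage() is not shown in these snippets. Based on the comments
# above, a plausible sketch is: request page=10000, let Naver redirect to the last existing
# page, and read the page number highlighted in the paging bar. The selector and the fallback
# value below are assumptions, not the confirmed implementation.
import re
import requests
from bs4 import BeautifulSoup

def find_news_totalpage_sketch(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        document = BeautifulSoup(response.content, 'html.parser')
        # assume the currently selected page (the last page after the redirect) is rendered
        # as <div class="paging">...<strong>N</strong>...</div>
        current_page = document.find('div', {'class': 'paging'}).find('strong')
        return int(re.findall(r'\d+', str(current_page))[0])
    except Exception:
        return 0  # no article list for this date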
def get_news_content(n_url):
    breq = requests.get(n_url)
    bsoup = BeautifulSoup(breq.content, 'html.parser')
    _text = bsoup.select('#articleBodyContents')[0].get_text().replace('\n', " ")
    # strip the flash-workaround script Naver injects into article bodies
    btext = _text.replace(
        "// flash 오류를 우회하기 위한 함수 추가 function _flash_removeCallback() {}", "")
    get_content = ''
    get_content = get_content + ArticleParser.clear_content(btext)
    if not get_content:
        return None
    return get_content
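# A small usage sketch (not in the original source) for get_news_title()/get_news_content().
# The URL is a placeholder built in the same oid/aid format used elsewhere in this code.
if __name__ == "__main__":
    sample_url = ("https://news.naver.com/main/read.nhn?"
                  "mode=LSD&mid=sec&sid1=105&oid=215&aid=0000918970")  # illustrative article
    title = get_news_title(sample_url)
    content = get_news_content(sample_url)
    if title and content:
        print(title)
        print(content[:200])  # first 200 characters of the cleaned body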
def make_news_page_url(category_url, start_year, end_year, start_month, end_month):
    print("here is make_news_page_url")
    made_urls = []
    for year in range(start_year, end_year + 1):
        print("start_year , end_year = " + " " + str(start_year) + " " + str(end_year))
        print("start_month , end_month = " + " " + str(start_month) + " " + str(end_month))
        if start_year == end_year:
            year_startmonth = start_month
            year_endmonth = end_month
        else:
            if year == start_year:
                year_startmonth = start_month
                year_endmonth = 12
            elif year == end_year:
                year_startmonth = 1
                year_endmonth = end_month
            else:
                year_startmonth = 1
                year_endmonth = 12
        for month in range(year_startmonth, year_endmonth + 1):
            for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
                # for month_day in range(1, 2):
                print("month, day = " + str(month) + " " + str(month_day))
                if len(str(month)) == 1:
                    month = "0" + str(month)
                if len(str(month_day)) == 1:
                    month_day = "0" + str(month_day)
                # build the page url for each date
                url = category_url + str(year) + str(month) + str(month_day)
                print("url = " + url)
                # totalpage is found by requesting page=10000 and relying on Naver's page structure:
                # page=10000 does not exist, so the request is redirected to page=totalpage
                totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")
                print("totalpage = " + str(totalpage))
                for page in range(1, totalpage + 1):
                    made_urls.append(url + "&page=" + str(page))
    return made_urls
def make_news_page_url(self, category_url, start_year, end_year, start_month, end_month):
    for year in range(start_year, end_year + 1):
        if start_year == end_year:
            year_startmonth = start_month
            year_endmonth = end_month
        else:
            if year == start_year:
                year_startmonth = start_month
                year_endmonth = 12
            elif year == end_year:
                year_startmonth = 1
                year_endmonth = end_month
            else:
                year_startmonth = 1
                year_endmonth = 12
        for month in tqdm(range(year_startmonth, year_endmonth + 1),
                          desc="MakeUrl rate", mininterval=0.01):
            print('\n')
            for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
                if len(str(month)) == 1:
                    month = "0" + str(month)
                if len(str(month_day)) == 1:
                    month_day = "0" + str(month_day)
                # build the page url for each date
                url = category_url + str(year) + str(month) + str(month_day)
                # totalpage is found by requesting page=10000 and relying on Naver's page structure:
                # page=10000 does not exist, so the request is redirected to page=totalpage
                totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")
                for page in range(1, totalpage + 1):
                    self.made_urls.append(url + "&page=" + str(page))
    print("Number of urls: " + str(len(self.made_urls)))
    return self.made_urls
logging.basicConfig(
    format=u'[%(asctime)s] # %(levelname)-8s [%(filename)s] %(message)s',
    filename="web_parser.log",
    level=logging.INFO)


def readBaseDataSetFromFile(path):
    with open(path, encoding='UTF8') as json_file:
        data = json.load(json_file)
    return data


if __name__ == "__main__":
    # from nltk.tokenize import sent_tokenize  # splits text into sentences
    # logging.info('Program started')
    article_parser = ArticleParser("mongodb://*****:*****@185.246.152.112/daryana")
    # article_parser.createDefaultSet(readBaseDataSetFromFile("meanings.json"))
    # article_parser.selfTeaching(2000, 200)
    # article_parser.resetMeanings()
    # article_parser.setMeanings()
    article_parser.classify()
    generateImgs()
    # generateImgs("first_def_set.json", "first_")
    # pikabu_urls = [
    #     "https://pikabu.ru/tag/iphone",
    #     "https://pikabu.ru/tag/apple",
    #     "https://pikabu.ru/tag/ios",
    #     "https://pikabu.ru/tag/macos",
class ArticleCrawler(object):
    def __init__(self):
        self.parser = ArticleParser()
        self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, 'IT과학': 105}
        self.selected_categories = []
        self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}

    def set_category(self, *args):
        for key in args:
            if self.categories.get(key) is None:
                raise InvalidCategory(key)
        self.selected_categories = args

    def set_date_range(self, start_year, end_year, end_month):
        args = [start_year, end_year, end_month]
        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        for key, date in zip(self.date, args):
            self.date[key] = date
        print(self.date)

    def make_news_page_url(self, category_url, start_year, last_year, start_month, last_month):
        maked_url = []
        final_startmonth = start_month
        final_lastmonth = last_month
        for year in range(start_year, last_year + 1):
            if year != last_year:
                start_month = 1
                last_month = 12
            else:
                start_month = final_startmonth
                last_month = final_lastmonth
            for month in range(start_month, last_month + 1):
                for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
                    url = category_url
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)
                    url = url + str(year) + str(month) + str(month_day)
                    final_url = url  # temporarily store the url that has date info but no page info
                    # totalpage is found by requesting page=1000 and relying on Naver's page structure:
                    # page=1000 does not exist, so the request is redirected to page=totalpage
                    totalpage = self.parser.find_news_totalpage(final_url + "&page=1000")
                    for page in range(1, totalpage + 1):
                        url = final_url  # reset url to the page-less form
                        url = url + "&page=" + str(page)
                        maked_url.append(url)
        return maked_url

    def crawling(self, category_name):
        # MultiThread PID
        print(category_name + " PID: " + str(os.getpid()))
        # CSV file that stores the articles of each category
        file = open('Article_' + category_name + '.csv', 'w', encoding='euc_kr', newline='')
        wcsv = csv.writer(file)
        # article list URL format
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.categories.get(category_name)) + "&date="
        # collect articles from January of start_year through end_month of end_year
        final_urlday = self.make_news_page_url(url, self.date['start_year'],
                                               self.date['end_year'], 1,
                                               self.date['end_month'])
        print(category_name + " Urls are generated")
        print("The crawler starts")
        for URL in final_urlday:
            regex = re.compile("date=(\d+)")
            news_date = regex.findall(URL)[0]
            request = requests.get(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            tag_document = document.find_all('dt', {'class': 'photo'})
            post = []
            for tag in tag_document:
                post.append(tag.a.get('href'))  # put the URL of every article on this page into the post list
            for content_url in post:  # article URL
                # crawling delay
                sleep(0.01)
                # fetch the article HTML
                request_content = requests.get(content_url)
                document_content = BeautifulSoup(request_content.content, 'html.parser')
                try:
                    # article headline
                    tag_headline = document_content.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                    text_headline = ''  # initialize the article headline
                    text_headline = text_headline + self.parser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # exclude articles whose headline is empty
                        continue
                    # article body
                    tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
                    text_sentence = ''  # initialize the article body text
                    text_sentence = text_sentence + self.parser.clear_content(
                        str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # exclude articles whose body is empty
                        continue
                    # press agency
                    tag_company = document_content.find_all('meta', {'property': 'me2:category1'})
                    text_company = ''  # initialize the press agency
                    text_company = text_company + str(tag_company[0].get('content'))
                    if not text_company:  # exclude articles whose press agency is empty
                        continue
                    # write CSV row
                    wcsv.writerow([news_date, category_name, text_company,
                                   text_headline, text_sentence, content_url])
                except Exception as ex:  # UnicodeEncodeError ..
                    pass
        file.close()

    def start(self):
        # start crawling with one process per selected category
        for category_name in self.selected_categories:
            proc = Process(target=self.crawling, args=(category_name,))
            proc.start()
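# A hedged usage sketch for the ArticleCrawler above (not part of the original source).
# Category names must be keys of self.categories, and set_date_range() follows the
# (start_year, end_year, end_month) signature defined above. The __main__ guard matters
# because start() spawns one Process per category.
if __name__ == "__main__":
    crawler = ArticleCrawler()
    crawler.set_category('정치', 'IT과학')   # raises InvalidCategory for unknown keys
    crawler.set_date_range(2018, 2019, 5)    # January 2018 through May 2019
    crawler.start()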
import requests
from bs4 import BeautifulSoup
from articleparser import ArticleParser

headers = {'User-Agent': 'Mozilla/5.0'}
b = requests.get(
    'https://sports.news.naver.com/news.nhn?oid=215&aid=0000918970',
    headers=headers)
document = BeautifulSoup(b.content, 'html.parser')

tag_content = document.find_all('div', {'id': 'newsEndContents'})
text_sentence = ''
text_sentence = text_sentence + ArticleParser.clear_content(
    str(tag_content[0].find_all(text=True)))

headline = ''
headline = headline + ArticleParser.clear_headline(
    str(document.find_all('h4', {'class': 'title'})))

article_info = document.find_all('div', {'class': 'info'})

# text_sentence holds the article body
# headline holds the article headline
'''
article_info holds the article metadata, stored in a form like:
[<div class="info">
<span>기사입력 2020.11.30. 오전 08:05</span>
<span><span class="bar"></span>최종수정 2020.11.30. 오전 08:07</span>
<a class="press_link" href="http://www.wowtv.co.kr/NewsCenter/News/Read?articleId=A202011300032&t=NN" target="_blank">기사원문</a>
</div>]
'''
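# The press_crawling() snippets above only note that article_info "should be cleaned up here".
# A minimal sketch of that clean-up, assuming the <div class="info"> layout shown in the
# docstring: pull the 기사입력 (published) and 최종수정 (last modified) timestamps with a regex.
# The helper name and return format are illustrative, not taken from the original code.
import re

def parse_article_info(article_info):
    info_text = str(article_info[0]) if article_info else ''
    # matches strings such as "2020.11.30. 오전 08:05"
    times = re.findall(r'\d{4}\.\d{2}\.\d{2}\.\s오[전후]\s\d{2}:\d{2}', info_text)
    published = times[0] if len(times) > 0 else ''
    modified = times[1] if len(times) > 1 else ''
    return published, modified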
def crawling(self, category_name):
    # Multi Process PID
    print(category_name + " PID: " + str(os.getpid()))

    writer = Writer(category_name=category_name, date=self.date)
    # article list URL format
    url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
        self.categories.get(category_name)) + "&date="
    # collect articles from start_month of start_year through end_month of end_year
    day_urls = self.make_news_page_url(url, self.date['start_year'],
                                       self.date['end_year'],
                                       self.date['start_month'],
                                       self.date['end_month'])
    print(category_name + " Urls are generated")
    print("The crawler starts")

    for URL in day_urls:
        regex = re.compile("date=(\d+)")
        news_date = regex.findall(URL)[0]

        request = self.get_url_data(URL)
        document = BeautifulSoup(request.content, 'html.parser')

        # html - newsflash_body - type06_headline, type06
        # collect the articles listed on each page
        post_temp = document.select('.newsflash_body .type06_headline li dl')
        post_temp.extend(document.select('.newsflash_body .type06 li dl'))

        # store the url of every article on this page
        post = []
        for line in post_temp:
            post.append(line.a.get('href'))  # put the URL of every article on this page into the post list
        del post_temp

        for content_url in post:  # article URL
            # crawling delay
            sleep(0.01)

            # fetch the article HTML
            request_content = self.get_url_data(content_url)
            try:
                document_content = BeautifulSoup(request_content.content, 'html.parser')
            except:
                continue

            try:
                # article headline
                tag_headline = document_content.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                text_headline = ''  # initialize the article headline
                text_headline = text_headline + ArticleParser.clear_headline(
                    str(tag_headline[0].find_all(text=True)))
                if not text_headline:  # exclude articles whose headline is empty
                    continue

                # article body
                tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
                text_sentence = ''  # initialize the article body text
                text_sentence = text_sentence + ArticleParser.clear_content(
                    str(tag_content[0].find_all(text=True)))
                if not text_sentence:  # exclude articles whose body is empty
                    continue

                # press agency
                tag_company = document_content.find_all('meta', {'property': 'me2:category1'})
                text_company = ''  # initialize the press agency
                text_company = text_company + str(tag_company[0].get('content'))
                if not text_company:  # exclude articles whose press agency is empty
                    continue

                # write CSV row
                wcsv = writer.get_writer_csv()
                wcsv.writerow([news_date, category_name, text_company,
                               text_headline, text_sentence, content_url])

                del text_company, text_sentence, text_headline
                del tag_company
                del tag_content, tag_headline
                del request_content, document_content
            except Exception as ex:  # UnicodeEncodeError ..
                # wcsv.writerow([ex, content_url])
                del request_content, document_content
                pass
    writer.close()
def __init__(self):
    self.parser = ArticleParser()
    self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, 'IT과학': 105,
                       'politics': 100, 'economy': 101, 'society': 102,
                       'living_culture': 103, 'IT_science': 105}
    self.selected_categories = []
    self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}
def crawling(self, category_name):
    # Multi Process PID
    print(category_name + " PID: " + str(os.getpid()))

    writer = Writer(category_name=category_name, date=self.date)
    wcsv = writer.get_writer_csv()
    wcsv.writerow(["date", "time", "category", "headline", "content"])

    # article list URL format
    url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
        self.categories.get(category_name)) + "&date="
    # collect articles for the configured date
    day_urls = self.make_news_page_url(url, self.date['year'], self.date['month'], self.date['day'])
    print(category_name + " Urls are generated")
    print("The crawler starts")

    for URL in day_urls:
        news_date = self.get_date_from_URL(URL)

        request = self.get_url_data(URL)
        document = BeautifulSoup(request.content, 'html.parser')

        # html - newsflash_body - type06_headline, type06
        # collect the articles listed on each page
        post_temp = document.select('.newsflash_body .type06_headline li dl')
        post_temp.extend(document.select('.newsflash_body .type06 li dl'))

        # store the url of every article on this page
        post = []
        for line in post_temp:
            post.append(line.a.get('href'))  # put the URL of every article on this page into the post list
        del post_temp

        for content_url in post:  # article URL
            # crawling delay
            sleep(0.01)

            # fetch the article HTML
            request_content = self.get_url_data(content_url)
            try:
                document_content = BeautifulSoup(request_content.content, 'html.parser')
            except:
                continue

            try:
                # article headline
                tag_headline = document_content.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                text_headline = ''  # initialize the article headline
                text_headline = text_headline + ArticleParser.clear_headline(
                    str(tag_headline[0].find_all(text=True)))
                if not text_headline:  # exclude articles whose headline is empty
                    continue

                # article body
                tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
                text_sentence = ''  # initialize the article body text
                text_sentence = text_sentence + ArticleParser.clear_content(
                    str(tag_content[0].find_all(text=True)))
                if not text_sentence or len(text_sentence) < 500:  # exclude articles that are empty or too short
                    continue

                # article time
                tag_time = document_content.find('span', {'class': 't11'}).text.split(" ")[1:]
                news_time = " ".join(tag_time)
                if not news_time:
                    continue

                # write CSV row
                wcsv = writer.get_writer_csv()
                wcsv.writerow([news_date, news_time, category_name, text_headline, text_sentence])

                del text_sentence, text_headline, news_time
                del tag_time
                del tag_content, tag_headline
                del request_content, document_content
                print("Done")
            except Exception as e:  # UnicodeEncodeError ..
                # wcsv.writerow([ex, content_url])
                del request_content, document_content
                print(f"ERROR : {e}")
class ArticleCrawler(object):
    def __init__(self):
        self.parser = ArticleParser()
        self.category = {
            '정치': 100, '경제': 101, '사회': 102,
            '생활문화': 103, '세계': 104, 'IT과학': 105
        }
        self.selected_category = []
        self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}

    def set_category(self, *args):
        for key in args:
            if self.category.get(key) is None:
                raise InvalidCategory(key)
            else:
                self.selected_category = args

    def set_date_range(self, start_year, end_year, end_month):
        args = [start_year, end_year, end_month]
        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        for key, date in zip(self.date, args):
            self.date[key] = date
        print(self.date)

    def make_news_page_url(self, category_url, start_year, last_year, start_month, last_month):
        maked_url = []
        final_startmonth = start_month
        final_lastmonth = last_month
        for year in range(start_year, last_year + 1):
            if year != last_year:
                start_month = 1
                last_month = 12
            else:
                start_month = final_startmonth
                last_month = final_lastmonth
            for month in range(start_month, last_month + 1):
                for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
                    url = category_url
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)
                    url = url + str(year) + str(month) + str(month_day)
                    final_url = url  # temporarily store the url that has date info but no page info
                    # totalpage is found by requesting page=1000 and relying on Naver's page structure:
                    # page=1000 does not exist, so the request is redirected to page=totalpage
                    totalpage = self.parser.find_news_totalpage(final_url + "&page=1000")
                    for page in range(1, totalpage + 1):
                        url = final_url  # reset url to the page-less form
                        url = url + "&page=" + str(page)
                        maked_url.append(url)
        return maked_url

    def crawling(self, category_name):
        # MultiThread PID
        print(category_name + " PID: " + str(os.getpid()))
        # if this doesn't work, I'll cry
        file_name = 'Article_' + str(self.category[category_name])
        conn = pymongo.MongoClient(
            'mongodb://%s:%s@%s:%s/' %
            (MONGODB_USERID, MONGODB_PASSWORD, MONGODB_HOST, MONGODB_PORT))
        print(conn)
        db = conn.get_database(MONGODB_DATABASE)
        collection = db[file_name]
        # article list URL format
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.category.get(category_name)) + "&date="
        # collect articles from January of start_year through end_month of end_year
        final_urlday = self.make_news_page_url(url, self.date['start_year'],
                                               self.date['end_year'], 1,
                                               self.date['end_month'])
        print(category_name + " Urls are generated")
        print(final_urlday)
        print(len(final_urlday))
        print("Crawling started")
        for URL in final_urlday:
            regex = re.compile("date=(\d+)")
            news_date = regex.findall(URL)[0]
            request = requests.get(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            tag_document = document.find_all('dt', {'class': 'photo'})
            post = []
            row = 0
            for tag in tag_document:
                post.append(tag.a.get('href'))  # put the URL of every article on this page into the post list
            for content_url in post:  # article URL
                # crawling delay
                sleep(0.01)
                # fetch the article HTML
                request_content = requests.get(content_url)
                document_content = BeautifulSoup(request_content.content, 'html.parser')
                try:
                    # article headline
                    tag_headline = document_content.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                    text_headline = ''  # initialize the article headline
                    text_headline = text_headline + self.parser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # exclude articles whose headline is empty
                        continue
                    # article body
                    tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
                    text_sentence = ''  # initialize the article body text
                    text_sentence = text_sentence + self.parser.clear_content(
                        str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # exclude articles whose body is empty
                        continue
                    # press agency
                    tag_company = document_content.find_all('meta', {'property': 'me2:category1'})
                    text_company = ''  # initialize the press agency
                    text_company = text_company + str(tag_company[0].get('content'))
                    if not text_company:  # exclude articles whose press agency is empty
                        continue
                    # article image
                    tag_image = document_content.find_all('span', {'class': 'end_photo_org'})
                    image_url = ''  # initialize the image url
                    image_url = image_url + str(tag_image[0].find('img')['src'])
                    image_path = "images/" + file_name + "_" + str(row) + "_" + str(news_date) + '.png'
                    urllib.request.urlretrieve(image_url, image_path)
                    row = row + 1
                    if not image_url:  # exclude articles without an image
                        continue
                    collection.insert_one({
                        "data": {
                            "headline": text_headline,
                            "content": text_sentence,
                            "company": text_company,
                            "image": image_path
                        }
                    })
                except Exception as ex:
                    pass

    def start(self):
        # start multi-process crawling
        for category_name in self.selected_category:
            proc = Process(target=self.crawling, args=(category_name,))
            proc.start()
class ArticleCrawler(object):
    def __init__(self):
        self.parser = ArticleParser()
        self.categories = {
            '야구': "kbaseball", '해외야구': "wbaseball",
            '축구': "kfootball", '해외축구': "wfootball",
            '농구': "basketball", '배구': "volleyball",
            '골프': "golf", '일반': "general", 'e스포츠': "esports"
        }
        self.selected_categories = []
        self.date = {
            'start_year': 0, 'start_month': 0, 'start_day': 0,
            'end_year': 0, 'end_month': 0, 'end_day': 0
        }
        self.user_operating_system = str(platform.system())

    def set_category(self, *args):
        for key in args:
            if self.categories.get(key) is None:
                raise InvalidCategory(key)
        self.selected_categories = args

    def set_date_range(self, start_year, start_month, start_day, end_year, end_month, end_day):
        args = [start_year, start_month, start_day, end_year, end_month, end_day]
        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if start_month < 1 or start_month > 12:
            raise InvalidMonth(start_month)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        if start_day < 1 or start_day > calendar.monthrange(start_year, start_month)[1]:
            raise InvalidDay(start_day)
        if end_day < 1 or end_day > calendar.monthrange(end_year, end_month)[1]:
            raise InvalidDay(end_day)
        for key, date in zip(self.date, args):
            self.date[key] = date
        print(self.date)

    def make_news_page_url(self, category_url, start_year, end_year, start_month, end_month, start_day, end_day):
        total_url_list = []
        for year in range(start_year, end_year + 1):
            if start_year == end_year:
                year_start_month = start_month
                year_end_month = end_month
            else:
                if year == start_year:
                    year_start_month = start_month
                    year_end_month = 12
                elif year == end_year:
                    year_start_month = 1
                    year_end_month = end_month
                else:
                    year_start_month = 1
                    year_end_month = 12
            for month in range(year_start_month, year_end_month + 1):
                if year_start_month == year_end_month:
                    start_day_tmp = start_day
                    end_day_tmp = end_day
                else:
                    if month == year_start_month:
                        start_day_tmp = start_day
                        end_day_tmp = calendar.monthrange(year, month)[1]
                    elif month == year_end_month:
                        start_day_tmp = 1
                        end_day_tmp = end_day
                    else:
                        start_day_tmp = 1
                        end_day_tmp = calendar.monthrange(year, month)[1]
                for month_day in range(start_day_tmp, end_day_tmp + 1):
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)
                    # store the url that has date info but no page info
                    url = category_url + str(year) + str(month) + str(month_day)
                    # totalpage is found by requesting page=10000 and relying on Naver's page structure:
                    # page=10000 does not exist, so the request is redirected to the last page of the
                    # article list (page=totalpage)
                    totalpage = 0
                    totalpage = self.parser.find_news_totalpage(url + "&page=10000")
                    for page in range(1, totalpage + 1):
                        if totalpage:
                            total_url_list.append(url + "&page=" + str(page))
        # pad month and day to two digits
        print_start_month = self.appendI2S(start_month)
        print_end_month = self.appendI2S(end_month)
        print_start_day = self.appendI2S(start_day)
        print_end_day = self.appendI2S(end_day)
        print('Crawling date range: ' + str(start_year) + str(print_start_month) + str(print_start_day) +
              '~' + str(end_year) + str(print_end_month) + str(print_end_day))
        return total_url_list

    def crawling(self, category_name):
        # MultiThread PID
        print(category_name + " PID: " + str(os.getpid()))

        # pad the month/day values that go into the csv file name
        save_start_month = self.appendI2S(self.date['start_month'])
        save_end_month = self.appendI2S(self.date['end_month'])
        save_start_day = self.appendI2S(self.date['start_day'])
        save_end_day = self.appendI2S(self.date['end_day'])

        # CSV file that stores the articles of each category
        # Windows uses euc-kr
        file = open(dataset_location + 'Article_' + category_name + '_' +
                    str(self.date['start_year']) + save_start_month + save_start_day + '_' +
                    str(self.date['end_year']) + save_end_month + save_end_day + '.csv',
                    'w', encoding='euc-kr', newline='')
        wcsv = csv.writer(file)
        del save_start_month, save_end_month

        # article list URL format
        url = "https://sports.news.naver.com/" + str(
            self.categories.get(category_name)) + "/news/index.nhn?isphoto=N&date="
        print(url)

        # collect articles from start date through end date
        # url_list = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
        url_list = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'],
                                           self.date['start_month'], self.date['end_month'],
                                           self.date['start_day'], self.date['end_day'])
        print(category_name + " Urls are generated")
        print("The crawler starts")
        print("=========================================")

        article_count = 0
        for URL in url_list:
            regex = re.compile("date=(\d+)")
            news_date = regex.findall(URL)[0]

            driver = webdriver.Chrome('chromedriver.exe_location')
            driver.get(URL)
            html = driver.page_source  # load the page html
            bs_obj = BeautifulSoup(html, 'html.parser')

            # collect the articles on each page via their html class
            article_url_list = bs_obj.select('.news_list .text')

            # store the url of every article on this page
            post_url_list = []
            for line in article_url_list:
                post_url_list.append("https://sports.news.naver.com" + line.a.get('href'))
            del article_url_list

            for content_url in post_url_list:  # article URL
                # crawling delay
                sleep(0.01)

                # fetch the article HTML
                request_content = requests.get(content_url)
                document_content = BeautifulSoup(request_content.content, 'html.parser')
                try:
                    # article headline
                    tag_headline = document_content.find_all('h4', {'class': 'title'})
                    text_headline = ''  # initialize the article headline
                    text_headline = text_headline + self.parser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # exclude articles whose headline is empty
                        continue

                    # article time
                    tag_time = document_content.find('div', {'class': 'info'}).find('span')
                    regex = re.compile("오[전,후]\s\d\d:\d\d")
                    match = regex.findall(str(tag_time))[0]
                    text_time = ''  # initialize the article time
                    text_time = text_time + match
                    if not text_time:  # stop if the time is empty
                        exit()

                    # article body
                    tag_content = document_content.find_all('div', {'id': 'newsEndContents'})
                    text_sentence = ''  # initialize the article body text
                    text_sentence = text_sentence + self.parser.clear_content(
                        str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # exclude articles whose body is empty
                        continue

                    # write CSV row
                    wcsv.writerow([news_date, text_time, text_headline, text_sentence, content_url])
                    article_count = article_count + 1

                    del text_sentence, text_headline, text_time
                    del tag_content, tag_headline
                    del request_content, document_content
                except Exception as ex:  # UnicodeEncodeError ..
                    # wcsv.writerow([ex, content_url])
                    del request_content, document_content
                    pass
        print("The crawler finished!!")
        print("Number of crawling articles : " + str(article_count))
        file.close()

    def appendI2S(self, input_int):
        if len(str(input_int)) == 1:
            out_string = "0" + str(input_int)
        else:
            out_string = str(input_int)
        return out_string

    def start(self):
        # start multi-process crawling
        for category_name in self.selected_categories:
            proc = Process(target=self.crawling, args=(category_name,))
            proc.start()
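# A hedged usage sketch for the sports ArticleCrawler above (not part of the original source).
# set_date_range() here takes six arguments (start y/m/d, end y/m/d); a chromedriver binary
# must be reachable at the path passed to webdriver.Chrome, and dataset_location must be
# defined in the module's configuration.
if __name__ == "__main__":
    crawler = ArticleCrawler()
    crawler.set_category('야구', '해외축구')
    crawler.set_date_range(2020, 1, 1, 2020, 3, 31)   # 2020-01-01 through 2020-03-31
    crawler.start()                                   # one Process per selected category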