Example #1
    def press_crawling(self, oid=215, aid=20):
        headers = {'User-Agent': 'Mozilla/5.0'}
        writer = Writer_press(category_name="aid123")
        url = 'https://sports.news.naver.com/news.nhn?'
        oid = 'oid=' + str(oid)
        # apply the oid (press/publisher id) passed in here
        for i in tqdm(range(1, aid), desc="Crawling rate", mininterval=0.01):
            #print(i)
            aid = str(i)
            aid_length = len(aid)
            aid = '&aid=' + '0' * (10 - aid_length) + aid
            url1 = url + oid + aid
            b = requests.get(url1, headers=headers)
            #print(url1)
            document = BeautifulSoup(b.content, 'html.parser')
            tag_content = document.find_all('div', {'id': 'newsEndContents'})

            text_sentence = ''
            text_sentence = text_sentence + ArticleParser.clear_content(
                str(tag_content[0].find_all(text=True)))
            #print(text_sentence)
            headline = ''
            tag_headline = document.find_all('h4', {'class': 'title'})
            headline = headline + ArticleParser.clear_headline(
                str(tag_headline[0].find_all(text=True)))
            article_info = document.find_all('div', {'class': 'info'})
            # it would be good to clean up article_info here
            writer.wcsv.writerow([headline, text_sentence, url1])
            print()
        writer.close()
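Example #1 (and most snippets below) calls ArticleParser.clear_content and ArticleParser.clear_headline, which are not included on this page. A minimal sketch of what such cleaners could look like, assuming they strip the stringified find_all(text=True) output of escape sequences, boilerplate phrases, and special symbols (the patterns here are illustrative guesses, not the original implementation):

import re


class ArticleParser(object):
    """Hypothetical cleaners; the real project's regexes may differ."""

    SPECIAL_SYMBOL = re.compile(r"[{}\[\]()<>|`~!@#$%^&*_+=;:'\"\\]")
    BOILERPLATE = re.compile(
        "본문 내용|TV플레이어|동영상 뉴스"
        "|flash 오류를 우회하기 위한 함수 추가|function _flash_removeCallback")

    @classmethod
    def clear_content(cls, text):
        # find_all(text=True) was passed through str(), so literal \n and \t remain.
        text = text.replace("\\n", " ").replace("\\t", " ").replace("\\r", " ")
        text = cls.BOILERPLATE.sub("", text)
        text = cls.SPECIAL_SYMBOL.sub(" ", text)
        return re.sub(r"\s+", " ", text).strip()

    @classmethod
    def clear_headline(cls, text):
        text = text.replace("\\n", " ").replace("\\t", " ")
        text = cls.SPECIAL_SYMBOL.sub("", text)
        return re.sub(r"\s+", " ", text).strip()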
Example #2
    def Keyword_crawling(self, keyword):
        headers = {'User-Agent': 'Mozilla/5.0'}
        url = 'https://search.naver.com/search.naver?sm=tab_hty.top&where=news&query='
        url = url + keyword
        list_a = []
        writer = Writer_press(category_name='_' + keyword,
                              text_c="Keyword_crawling")
        for i in tqdm(range(0, 20), desc="append url list", mininterval=0.01):
            # Naver search pages in steps of 10: start=1, 11, 21, ...
            s_num = "&start=" + str(i) + "1"
            b = requests.get(url + s_num, headers=headers)
            document = BeautifulSoup(b.content, 'html.parser')
            list_url = document.select('.list_news .info')

            for line in list_url:
                list_a.append(line.get('href'))
                #print(line.get('href'))
            print('')
        list_b = []
        list_c = []
        for line in list_a:
            if line is None:
                continue
            elif line.find('naver') != -1:
                list_b.append(line)
        for line in list_b:
            fnum = line.find('oid')
            # keep only query strings of the exact form oid=XXX&aid=XXXXXXXXXX (22 characters)
            if len(line[fnum:]) == 22:
                list_c.append(line[fnum:])
        url = 'https://sports.news.naver.com/news.nhn?'
        list_num = len(list_c)
        print(list_c)
        for i in tqdm(range(0, list_num),
                      desc="Crawling rate",
                      mininterval=0.01):
            print('')
            url_a = url + list_c[i]
            b = requests.get(url_a, headers=headers)
            document = BeautifulSoup(b.content, 'html.parser')
            tag_content = document.find_all('div', {'id': 'newsEndContents'})
            if len(tag_content) != 0:
                text_sentence = ''
                text_sentence = text_sentence + ArticleParser.clear_content(
                    str(tag_content[0].find_all(text=True)))
                headline = ''
                tag_headline = document.find_all('h4', {'class': 'title'})
                if (len(tag_headline) != 0):
                    headline = headline + ArticleParser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                if headline == '':
                    headline = '-'
                article_info = document.find_all('div', {'class': 'info'})
                writer.wcsv.writerow(
                    [headline + '\t' + text_sentence + '\t' + url_a])
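The 22-character slice above assumes every kept link ends with a query string of the exact form oid=XXX&aid=XXXXXXXXXX. A small alternative sketch (a hypothetical helper, not part of the original code) that extracts the same pair with urllib.parse instead of fixed offsets:

from urllib.parse import urlparse, parse_qs


def extract_oid_aid(link):
    """Return 'oid=...&aid=...' from a Naver news link, or None if absent."""
    query = parse_qs(urlparse(link).query)
    if 'oid' in query and 'aid' in query:
        return 'oid=' + query['oid'][0] + '&aid=' + query['aid'][0]
    return None


# extract_oid_aid('https://sports.news.naver.com/news.nhn?oid=215&aid=0000918970')
# -> 'oid=215&aid=0000918970'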
Example #3
    def __init__(self):
        self.parser = ArticleParser()
        self.category = {
            '정치': 100,
            '경제': 101,
            '사회': 102,
            '생활문화': 103,
            '세계': 104,
            'IT과학': 105
        }
        self.selected_category = []
        self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}
Example #4
    def press_crawling(self, oid, aid, name):
        headers = {'User-Agent': 'Mozilla/5.0'}

        url = 'https://sports.news.naver.com/news.nhn?'

        writer = Writer_press(category_name=str(aid), text_c=name)
        self.writer = writer

        oid = 'oid=' + oid
        for i in tqdm(range(1, aid), desc="Crawling rate", mininterval=0.01):
            self.num += 1
            aid = str(i)
            aid_length = len(aid)
            aid = '&aid=' + '0' * (10 - aid_length) + aid
            url1 = url + oid + aid
            b = requests.get(url1, headers=headers)
            #print(url1)
            document = BeautifulSoup(b.content, 'html.parser')
            tag_content = document.find_all('div', {'id': 'newsEndContents'})

            if len(tag_content) != 0:
                #print(url1)
                text_sentence = ''
                text_sentence = text_sentence + ArticleParser.clear_content(
                    str(tag_content[0].find_all(text=True)))
                #print(text_sentence)
                headline = ''
                tag_headline = document.find_all('h4', {'class': 'title'})
                headline = headline + ArticleParser.clear_headline(
                    str(tag_headline[0].find_all(text=True)))
                article_info = document.find_all('div', {'class': 'info'})
                # it would be good to clean up article_info here

                #====================================================
                # get the article publish time
                input_time = str(article_info[0])
                iTime = ""
                iTime = self.inputTime(input_time)
                #print("\n")
                #print(iTime)
                #===================================================
                if headline == '':
                    headline = '-'
                writer.wcsv.writerow([
                    headline + '\t' + text_sentence + '\t' + url1 + '\t' +
                    iTime
                ])
            print()
        writer.close()
    def get_sentence_from_document(self, document):
        tag_content = document.find_all('div', {'id': 'articleBodyContents'})
        text_sentence = ''  # initialize the article body text
        text_sentence = text_sentence + ArticleParser.clear_content(
            str(tag_content[0].find_all(text=True)))
        if not text_sentence or len(text_sentence) < 500:  # skip the article if blank or too short
            return None
        else:
            return text_sentence
    def get_headline_from_document(self, document):
        tag_headline = document.find_all('h3', {'id': 'articleTitle'},
                                         {'class': 'tts_head'})
        text_headline = ''  # initialize the article headline
        text_headline = text_headline + ArticleParser.clear_headline(
            str(tag_headline[0].find_all(text=True)))
        if not text_headline:  # skip the article if the headline is blank
            return None
        else:
            return text_headline
def get_news_title(n_url):
    breq = requests.get(n_url)
    bsoup = BeautifulSoup(breq.content, 'html.parser')

    get_title_raw = bsoup.select(
        'h3#articleTitle')[0].text  # [0] takes only the first h3#articleTitle match
    get_title = ''
    get_title = get_title + ArticleParser.clear_headline(get_title_raw)

    if not get_title:
        return None

    return get_title
    def make_news_page_url(category_url, year, month, day):
        made_urls = []
        if len(str(month)) == 1:
            month = "0" + str(month)
        if len(str(day)) == 1:
            day = "0" + str(day)
        url = category_url + str(year) + str(month) + str(day)
        totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")
        print(totalpage)
        for page in range(1, totalpage + 1):
            made_urls.append(url + "&page=" + str(page))

        return made_urls
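The manual "0" padding of month and day above can also be written with format specifiers; an equivalent illustrative helper (not from the original code):

def format_date_url(category_url, year, month, day):
    # Same result as the len(str(...)) == 1 checks above: zero-pad to two digits.
    return f"{category_url}{year}{month:02d}{day:02d}"


# format_date_url("http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100&date=", 2020, 3, 5)
# -> "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=100&date=20200305"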
Example #9
    def __init__(self):
        self.parser = ArticleParser()
        self.categories = {
            '야구': "kbaseball",
            '해외야구': "wbaseball",
            '축구': "kfootball",
            '해외축구': "wfootball",
            '농구': "basketball",
            '배구': "volleyball",
            '골프': "golf",
            '일반': "general",
            'e스포츠': "esports"
        }

        self.selected_categories = []
        self.date = {
            'start_year': 0,
            'start_month': 0,
            'start_day': 0,
            'end_year': 0,
            'end_month': 0,
            'end_day': 0
        }
        self.user_operating_system = str(platform.system())
    def make_news_page_url(category_url, start_year, end_year, start_month, end_month, start_day, end_day):
        made_urls = []
        for year in range(start_year, end_year + 1):
            if start_year == end_year:
                year_startmonth = start_month
                year_endmonth = end_month
            else:
                if year == start_year:
                    year_startmonth = start_month
                    year_endmonth = 12
                elif year == end_year:
                    year_startmonth = 1
                    year_endmonth = end_month
                else:
                    year_startmonth = 1
                    year_endmonth = 12

            for month in range(year_startmonth, year_endmonth + 1):
                if start_month == end_month:
                    month_startday = start_day
                    month_endday = end_day
                else:
                    if month == start_month:
                        month_startday = start_day
                        month_endday = calendar.monthrange(year, month)[1]
                    elif month == end_month:
                        month_startday = 1
                        month_endday = end_day
                    else:
                        month_startday = 1
                        month_endday = calendar.monthrange(year, month)[1]

                for day in range(month_startday, month_endday + 1):
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(day)) == 1:
                        day = "0" + str(day)

                    # build the page URL for each date
                    url = category_url + str(year) + str(month) + str(day)

                    # totalpage is found by exploiting Naver's page structure: request page=10000
                    # since page=10000 does not exist, Naver redirects to the last page (page=totalpage)
                    totalpage = ArticleParser.find_news_totalpage(url + "&page=10000")
                    for page in range(1, totalpage + 1):
                        made_urls.append(url + "&page=" + str(page))

        return made_urls
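The comments above describe the trick used throughout these examples: requesting page=10000 makes Naver redirect to the last existing list page. ArticleParser.find_news_totalpage itself is not shown on this page; a minimal sketch of how it might work under that assumption (the pagination markup and the 'paging' class are guesses, not the original code):

import requests
from bs4 import BeautifulSoup


def find_news_totalpage(url):
    """Guess the last list page by letting Naver redirect page=10000 to it."""
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        document = BeautifulSoup(response.content, 'html.parser')
        # After the redirect, the pagination block marks the current (last) page
        # with <strong>; the exact markup is an assumption.
        last_page_tag = document.find('div', {'class': 'paging'}).find('strong')
        return int(last_page_tag.text)
    except Exception:
        return 0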
def get_news_content(n_url):
    breq = requests.get(n_url)
    bsoup = BeautifulSoup(breq.content, 'html.parser')

    _text = bsoup.select('#articleBodyContents')[0].get_text().replace(
        '\n', " ")
    btext = _text.replace(
        "// flash 오류를 우회하기 위한 함수 추가 function _flash_removeCallback() {}", "")

    get_content = ''
    get_content = get_content + ArticleParser.clear_content(btext)

    if not get_content:
        return None

    return get_content
Example #12
    def make_news_page_url(category_url, start_year, end_year, start_month,
                           end_month):
        print("here is make_news_page_url")
        made_urls = []
        for year in range(start_year, end_year + 1):
            print("start_year , end_year = " + " " + str(start_year) + " " +
                  str(end_year))
            print("start_month , end_month = " + " " + str(start_month) + " " +
                  str(end_month))
            if start_year == end_year:
                year_startmonth = start_month
                year_endmonth = end_month
            else:
                if year == start_year:
                    year_startmonth = start_month
                    year_endmonth = 12
                elif year == end_year:
                    year_startmonth = 1
                    year_endmonth = end_month
                else:
                    year_startmonth = 1
                    year_endmonth = 12

            for month in range(year_startmonth, year_endmonth + 1):
                for month_day in range(1,
                                       calendar.monthrange(year, month)[1] +
                                       1):
                    # for month_day in range(1, 2):

                    print("month, day = " + str(month) + " " + str(month_day))
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)

                    # build the page URL for each date
                    url = category_url + str(year) + str(month) + str(
                        month_day)
                    print("url = " + url)
                    # totalpage is found by exploiting Naver's page structure: request page=10000
                    # since page=10000 does not exist, Naver redirects to the last page (page=totalpage)
                    totalpage = ArticleParser.find_news_totalpage(
                        url + "&page=10000")
                    print("totalpage = " + str(totalpage))
                    for page in range(1, totalpage + 1):
                        made_urls.append(url + "&page=" + str(page))
        return made_urls
Example #13
    def make_news_page_url(self, category_url, start_year, end_year,
                           start_month, end_month):

        for year in range(start_year, end_year + 1):

            if start_year == end_year:
                year_startmonth = start_month
                year_endmonth = end_month
            else:
                if year == start_year:
                    year_startmonth = start_month
                    year_endmonth = 12
                elif year == end_year:
                    year_startmonth = 1
                    year_endmonth = end_month
                else:
                    year_startmonth = 1
                    year_endmonth = 12

            for month in tqdm(range(year_startmonth, year_endmonth + 1),
                              desc="MakeUrl rate",
                              mininterval=0.01):
                print('\n')
                for month_day in range(1,
                                       calendar.monthrange(year, month)[1] +
                                       1):

                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)

                    # build the page URL for each date
                    url = category_url + str(year) + str(month) + str(
                        month_day)

                    # totalpage is found by exploiting Naver's page structure: request page=10000
                    # since page=10000 does not exist, Naver redirects to the last page (page=totalpage)

                    totalpage = ArticleParser.find_news_totalpage(
                        url + "&page=10000")

                    for page in range(1, totalpage + 1):
                        self.made_urls.append(url + "&page=" + str(page))
        print("url개수: " + str(len(self.made_urls)))
        return self.made_urls
Example #14
logging.basicConfig(format=u'[%(asctime)s] # %(levelname)-8s [%(filename)s] %(message)s',
                    filename="web_parser.log", level=logging.INFO)


def readBaseDataSetFromFile(path):
    with open(path, encoding='UTF8') as json_file:
        data = json.load(json_file)

    return data


if __name__ == "__main__":
    # from nltk.tokenize import sent_tokenize  # splits text into sentences

    # logging.info('Program started')
    article_parser = ArticleParser("mongodb://*****:*****@185.246.152.112/daryana")

    # article_parser.createDefaultSet(readBaseDataSetFromFile("meanings.json"))

    # article_parser.selfTeaching(2000, 200)
    # article_parser.resetMeanings()
    # article_parser.setMeanings()
    article_parser.classify()
    generateImgs()
    # generateImgs("first_def_set.json", "first_")

    # pikabu_urls = [
    #     "https://pikabu.ru/tag/iphone",
    #     "https://pikabu.ru/tag/apple",
    #     "https://pikabu.ru/tag/ios",
    #     "https://pikabu.ru/tag/macos",
Example #15
class ArticleCrawler(object):
    def __init__(self):
        self.parser = ArticleParser()
        self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, 'IT과학': 105}
        self.selected_categories = []
        self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}

    def set_category(self, *args):
        for key in args:
            if self.categories.get(key) is None:
                raise InvalidCategory(key)
        self.selected_categories = args

    def set_date_range(self, start_year, end_year, end_month):
        args = [start_year, end_year, end_month]
        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        for key, date in zip(self.date, args):
            self.date[key] = date
        print(self.date)

    def make_news_page_url(self, category_url, start_year, last_year, start_month, last_month):
        maked_url = []
        final_startmonth = start_month
        final_lastmonth = last_month
        for year in range(start_year, last_year + 1):
            if year != last_year:
                start_month = 1
                last_month = 12
            else:
                start_month = final_startmonth
                last_month = final_lastmonth
            for month in range(start_month, last_month + 1):
                for month_day in range(1, calendar.monthrange(year, month)[1] + 1):
                    url = category_url
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)
                    url = url + str(year) + str(month) + str(month_day)
                    final_url = url  # keep the url with date info but no page parameter

                    # totalpage is found by exploiting Naver's page structure: request page=1000
                    # since page=1000 does not exist, Naver redirects to the last page (page=totalpage)
                    totalpage = self.parser.find_news_totalpage(final_url + "&page=1000")
                    for page in range(1, totalpage + 1):
                        url = final_url  # reset url to the date-only base
                        url = url + "&page=" + str(page)
                        maked_url.append(url)
        return maked_url

    def crawling(self, category_name):
        # MultiThread PID
        print(category_name + " PID: " + str(os.getpid()))

        # CSV file to store articles for each category
        file = open('Article_' + category_name + '.csv', 'w', encoding='euc_kr', newline='')
        wcsv = csv.writer(file)

        # article list URL format
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.categories.get(category_name)) + "&date="
        # collect articles from January of start_year through end_month of end_year
        final_urlday = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], 1, self.date['end_month'])
        print(category_name + " Urls are generated")
        print("The crawler starts")

        for URL in final_urlday:

            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            request = requests.get(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            tag_document = document.find_all('dt', {'class': 'photo'})

            post = []
            for tag in tag_document:
                post.append(tag.a.get('href'))  # put the URL of every article on this page into the post list

            for content_url in post:  # each article URL
                # crawl delay
                sleep(0.01)
                # fetch the article HTML
                request_content = requests.get(content_url)
                document_content = BeautifulSoup(request_content.content, 'html.parser')

                try:
                    # get the article headline
                    tag_headline = document_content.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                    text_headline = ''  # initialize the headline
                    text_headline = text_headline + self.parser.clear_headline(str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # skip the article if the headline is blank
                        continue

                    # get the article body
                    tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
                    text_sentence = ''  # initialize the body text
                    text_sentence = text_sentence + self.parser.clear_content(str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # skip the article if the body is blank
                        continue

                    # get the publisher (press company)
                    tag_company = document_content.find_all('meta', {'property': 'me2:category1'})
                    text_company = ''  # initialize the publisher
                    text_company = text_company + str(tag_company[0].get('content'))
                    if not text_company:  # skip the article if the publisher is blank
                        continue
                    # write the row to CSV
                    wcsv.writerow([news_date, category_name, text_company, text_headline, text_sentence, content_url])

                except Exception as ex:  # UnicodeEncodeError ..
                    pass
        file.close()

    def start(self):
        # start crawling with one Process per category
        for category_name in self.selected_categories:
            proc = Process(target=self.crawling, args=(category_name,))
            proc.start()
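A short usage sketch for the ArticleCrawler above, based only on the methods shown (the category keys and the date range are placeholder values):

if __name__ == '__main__':
    crawler = ArticleCrawler()
    crawler.set_category('정치', 'IT과학')   # must be keys of self.categories
    crawler.set_date_range(2019, 2020, 6)    # start_year, end_year, end_month
    crawler.start()                          # spawns one Process per category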
Example #16
import requests
from bs4 import BeautifulSoup
from articleparser import ArticleParser

headers = {'User-Agent': 'Mozilla/5.0'}
b = requests.get(
    'https://sports.news.naver.com/news.nhn?oid=215&aid=0000918970',
    headers=headers)
document = BeautifulSoup(b.content, 'html.parser')
tag_content = document.find_all('div', {'id': 'newsEndContents'})
text_sentence = ''
text_sentence = text_sentence + ArticleParser.clear_content(
    str(tag_content[0].find_all(text=True)))
headline = ''
headline = headline + ArticleParser.clear_headline(
    str(document.find_all('h4', {'class': 'title'})))
article_info = document.find_all('div', {'class': 'info'})
# text_sentence holds the article body
# headline holds the article headline
'''
article_info holds the article metadata, e.g.:
[<div class="info">
<span>기사입력 2020.11.30. 오전 08:05</span>
<span><span class="bar"></span>최종수정 2020.11.30. 오전 08:07</span>
<a class="press_link" href="http://www.wowtv.co.kr/NewsCenter/News/Read?articleId=A202011300032&amp;t=NN" target="_blank">기사원문</a>
</div>]
It is stored in a format like the above.
'''
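Given the article_info markup shown in the comment above, the publish and last-modified times can be pulled out with a regular expression; a small illustrative helper (my own sketch, not part of the original snippet):

import re


def extract_article_times(article_info_html):
    """Return (publish_time, modified_time) strings from the info block, or None for a missing field."""
    text = str(article_info_html)
    published = re.search(r"기사입력\s*([\d.]+\.\s*오[전후]\s*\d{2}:\d{2})", text)
    modified = re.search(r"최종수정\s*([\d.]+\.\s*오[전후]\s*\d{2}:\d{2})", text)
    return (published.group(1) if published else None,
            modified.group(1) if modified else None)


# With the sample above: ('2020.11.30. 오전 08:05', '2020.11.30. 오전 08:07')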
Example #17
    def crawling(self, category_name):
        # Multi Process PID
        print(category_name + " PID: " + str(os.getpid()))

        writer = Writer(category_name=category_name, date=self.date)

        # article list URL format
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.categories.get(category_name)) + "&date="

        # collect articles from start_year/start_month through end_year/end_month
        day_urls = self.make_news_page_url(url, self.date['start_year'],
                                           self.date['end_year'],
                                           self.date['start_month'],
                                           self.date['end_month'])
        print(category_name + " Urls are generated")
        print("The crawler starts")

        for URL in day_urls:

            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            request = self.get_url_data(URL)

            document = BeautifulSoup(request.content, 'html.parser')

            # html - newsflash_body - type06_headline, type06
            # collect the articles on each page
            post_temp = document.select(
                '.newsflash_body .type06_headline li dl')
            post_temp.extend(document.select('.newsflash_body .type06 li dl'))

            # store the url of each article on the page
            post = []
            for line in post_temp:
                post.append(line.a.get(
                    'href'))  # put the URL of every article on this page into the post list
            del post_temp

            for content_url in post:  # each article URL
                # crawl delay
                sleep(0.01)

                # fetch the article HTML
                request_content = self.get_url_data(content_url)
                try:
                    document_content = BeautifulSoup(request_content.content,
                                                     'html.parser')
                except:
                    continue

                try:
                    # get the article headline
                    tag_headline = document_content.find_all(
                        'h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                    text_headline = ''  # initialize the headline
                    text_headline = text_headline + ArticleParser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # skip the article if the headline is blank
                        continue

                    # get the article body
                    tag_content = document_content.find_all(
                        'div', {'id': 'articleBodyContents'})
                    text_sentence = ''  # initialize the body text
                    text_sentence = text_sentence + ArticleParser.clear_content(
                        str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # skip the article if the body is blank
                        continue

                    # get the publisher (press company)
                    tag_company = document_content.find_all(
                        'meta', {'property': 'me2:category1'})
                    text_company = ''  # initialize the publisher
                    text_company = text_company + str(
                        tag_company[0].get('content'))
                    if not text_company:  # skip the article if the publisher is blank
                        continue

                    # write the row to CSV
                    wcsv = writer.get_writer_csv()
                    wcsv.writerow([
                        news_date, category_name, text_company, text_headline,
                        text_sentence, content_url
                    ])

                    del text_company, text_sentence, text_headline
                    del tag_company
                    del tag_content, tag_headline
                    del request_content, document_content

                except Exception as ex:  # UnicodeEncodeError ..
                    # wcsv.writerow([ex, content_url])
                    del request_content, document_content
                    pass
        writer.close()
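crawling() above depends on self.get_url_data, which is not included in these snippets. A plausible minimal version, shown here as a standalone function under the assumption that it simply wraps requests.get with a few retries:

import requests
from time import sleep


def get_url_data(url, max_tries=10):
    """Fetch a URL, retrying on connection errors (assumed behaviour)."""
    remaining = max_tries
    while remaining > 0:
        try:
            return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        except requests.exceptions.RequestException:
            sleep(1)
            remaining -= 1
    raise RuntimeError('could not fetch ' + url)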
Example #18
    def __init__(self):
        self.parser = ArticleParser()
        self.categories = {'정치': 100, '경제': 101, '사회': 102, '생활문화': 103, 'IT과학': 105,
                           'politics': 100, 'economy': 101, 'society': 102, 'living_culture': 103, 'IT_science': 105}
        self.selected_categories = []
        self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}
Example #19
    def crawling(self, category_name):
        # Multi Process PID
        print(category_name + " PID: " + str(os.getpid()))

        writer = Writer(category_name=category_name, date=self.date)
        wcsv = writer.get_writer_csv()
        wcsv.writerow(["date", "time", "category", "headline", "content"])
        

        # article list URL format
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.categories.get(category_name)) + "&date="

        # collect articles from start_year/start_month through end_year/end_month
        day_urls = self.make_news_page_url(url, self.date['year'], self.date['month'], self.date['day'])
        print(category_name + " Urls are generated")
        print("The crawler starts")


        for URL in day_urls:
            news_date = self.get_date_from_URL(URL)
            request = self.get_url_data(URL)
            document = BeautifulSoup(request.content, 'html.parser')

            # html - newsflash_body - type06_headline, type06
            # collect the articles on each page
            post_temp = document.select('.newsflash_body .type06_headline li dl')
            post_temp.extend(document.select('.newsflash_body .type06 li dl'))

            # store the url of each article on the page
            post = []
            for line in post_temp:
                post.append(line.a.get('href'))  # put the URL of every article on this page into the post list
            del post_temp

            for content_url in post:  # each article URL
                # crawl delay
                sleep(0.01)

                # fetch the article HTML
                request_content = self.get_url_data(content_url)
                try:
                    document_content = BeautifulSoup(request_content.content, 'html.parser')
                except:
                    continue


                try:
                    # get the article headline
                    tag_headline = document_content.find_all('h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                    text_headline = ''  # initialize the headline
                    text_headline = text_headline + ArticleParser.clear_headline(str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # skip the article if the headline is blank
                        continue

                    # get the article body
                    tag_content = document_content.find_all('div', {'id': 'articleBodyContents'})
                    text_sentence = ''  # initialize the body text
                    text_sentence = text_sentence + ArticleParser.clear_content(str(tag_content[0].find_all(text=True)))
                    if not text_sentence or len(text_sentence) < 500:  # skip the article if blank or shorter than 500 characters
                        continue

                    # get the article time
                    tag_time = document_content.find('span', {'class':'t11'}).text.split(" ")[1:]
                    news_time = " ".join(tag_time)
                    if not news_time:
                        continue

                    # write the row to CSV
                    wcsv = writer.get_writer_csv()
                    wcsv.writerow([news_date, news_time, category_name, text_headline, text_sentence])
                    
                    del text_sentence, text_headline, news_time
                    del tag_time
                    del tag_content, tag_headline
                    del request_content, document_content

                    print("Done")



                except Exception as e:  # UnicodeEncodeError ..
                    # wcsv.writerow([ex, content_url])
                    del request_content, document_content
                    print(f"ERROR : {e}")
Example #20
class ArticleCrawler(object):
    def __init__(self):
        self.parser = ArticleParser()
        self.category = {
            '정치': 100,
            '경제': 101,
            '사회': 102,
            '생활문화': 103,
            '세계': 104,
            'IT과학': 105
        }
        self.selected_category = []
        self.date = {'start_year': 0, 'end_year': 0, 'end_month': 0}

    def set_category(self, *args):
        for key in args:
            if self.category.get(key) is None:
                raise InvalidCategory(key)
            else:
                self.selected_category = args

    def set_date_range(self, start_year, end_year, end_month):
        args = [start_year, end_year, end_month]
        if start_year > end_year:
            raise InvalidYear(start_year, end_year)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)
        for key, date in zip(self.date, args):
            self.date[key] = date
        print(self.date)

    def make_news_page_url(self, category_url, start_year, last_year,
                           start_month, last_month):
        maked_url = []
        final_startmonth = start_month
        final_lastmonth = last_month
        for year in range(start_year, last_year + 1):
            if year != last_year:
                start_month = 1
                last_month = 12
            else:
                start_month = final_startmonth
                last_month = final_lastmonth
            for month in range(start_month, last_month + 1):
                for month_day in range(1,
                                       calendar.monthrange(year, month)[1] +
                                       1):
                    url = category_url
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)
                    url = url + str(year) + str(month) + str(month_day)
                    final_url = url  # keep the url with date info but no page parameter

                    # totalpage is found by exploiting Naver's page structure: request page=1000
                    # since page=1000 does not exist, Naver redirects to the last page (page=totalpage)
                    totalpage = self.parser.find_news_totalpage(final_url +
                                                                "&page=1000")
                    for page in range(1, totalpage + 1):
                        url = final_url  # reset url to the date-only base
                        url = url + "&page=" + str(page)
                        maked_url.append(url)
        return maked_url

    def crawling(self, category_name):
        # MultiThread PID
        print(category_name + " PID: " + str(os.getpid()))

        # if this doesn't work, I'm going to cry
        file_name = 'Article_' + str(self.category[category_name])

        conn = pymongo.MongoClient(
            'mongodb://%s:%s@%s:%s/' %
            (MONGODB_USERID, MONGODB_PASSWORD, MONGODB_HOST, MONGODB_PORT))
        print(conn)
        db = conn.get_database(MONGODB_DATABASE)
        collection = db[file_name]
        # article list URL format
        url = "http://news.naver.com/main/list.nhn?mode=LSD&mid=sec&sid1=" + str(
            self.category.get(category_name)) + "&date="
        # collect articles from January of start_year through end_month of end_year
        final_urlday = self.make_news_page_url(url, self.date['start_year'],
                                               self.date['end_year'], 1,
                                               self.date['end_month'])
        print(category_name + " Urls are generated")
        print(final_urlday)
        print(len(final_urlday))
        print("크롤링 시작")
        for URL in final_urlday:

            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            request = requests.get(URL)
            document = BeautifulSoup(request.content, 'html.parser')
            tag_document = document.find_all('dt', {'class': 'photo'})

            post = []
            row = 0
            for tag in tag_document:
                post.append(
                    tag.a.get('href'))  # put the URL of every article on this page into the post list

            for content_url in post:  # each article URL
                # crawl delay
                sleep(0.01)
                # fetch the article HTML
                request_content = requests.get(content_url)
                document_content = BeautifulSoup(request_content.content,
                                                 'html.parser')

                try:

                    # get the article headline
                    tag_headline = document_content.find_all(
                        'h3', {'id': 'articleTitle'}, {'class': 'tts_head'})
                    text_headline = ''  # initialize the headline
                    text_headline = text_headline + self.parser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # skip the article if the headline is blank
                        continue

                    # get the article body
                    tag_content = document_content.find_all(
                        'div', {'id': 'articleBodyContents'})
                    text_sentence = ''  # initialize the body text
                    text_sentence = text_sentence + self.parser.clear_content(
                        str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # skip the article if the body is blank
                        continue

                    # get the publisher (press company)
                    tag_company = document_content.find_all(
                        'meta', {'property': 'me2:category1'})
                    text_company = ''  # initialize the publisher
                    text_company = text_company + str(
                        tag_company[0].get('content'))
                    if not text_company:  # skip the article if the publisher is blank
                        continue

                    # get the article image
                    tag_image = document_content.find_all(
                        'span', {'class': 'end_photo_org'})
                    image_url = ''  # initialize the image URL
                    image_url = image_url + str(
                        tag_image[0].find('img')['src'])
                    if not image_url:  # skip the article if there is no image
                        continue
                    image_path = "images/" + file_name + "_" + str(
                        row) + "_" + str(news_date) + '.png'
                    urllib.request.urlretrieve(image_url, image_path)
                    row = row + 1
                    collection.insert_one({
                        "data": {
                            "headline": text_headline,
                            "content": text_sentence,
                            "company": text_company,
                            "image": image_path
                        }
                    })
                except Exception as ex:
                    pass

    def start(self):
        # start multi-process crawling
        for category_name in self.selected_category:
            proc = Process(target=self.crawling, args=(category_name, ))
            proc.start()
Example #21
class ArticleCrawler(object):
    def __init__(self):
        self.parser = ArticleParser()
        self.categories = {
            '야구': "kbaseball",
            '해외야구': "wbaseball",
            '축구': "kfootball",
            '해외축구': "wfootball",
            '농구': "basketball",
            '배구': "volleyball",
            '골프': "golf",
            '일반': "general",
            'e스포츠': "esports"
        }

        self.selected_categories = []
        self.date = {
            'start_year': 0,
            'start_month': 0,
            'start_day': 0,
            'end_year': 0,
            'end_month': 0,
            'end_day': 0
        }
        self.user_operating_system = str(platform.system())

    def set_category(self, *args):
        for key in args:
            if self.categories.get(key) is None:
                raise InvalidCategory(key)
        self.selected_categories = args

    def set_date_range(self, start_year, start_month, start_day, end_year,
                       end_month, end_day):
        args = [
            start_year, start_month, start_day, end_year, end_month, end_day
        ]

        if start_year > end_year:
            raise InvalidYear(start_year, end_year)

        if start_month < 1 or start_month > 12:
            raise InvalidMonth(start_month)
        if end_month < 1 or end_month > 12:
            raise InvalidMonth(end_month)

        if start_day < 1 or start_day > calendar.monthrange(
                start_year, start_month)[1]:
            raise InvalidDay(start_day)
        if end_day < 1 or end_day > calendar.monthrange(
                start_year, start_month)[1]:
            raise InvalidDay(end_day)

        for key, date in zip(self.date, args):
            self.date[key] = date
        print(self.date)

    def make_news_page_url(self, category_url, start_year, end_year,
                           start_month, end_month, start_day, end_day):
        total_url_list = []
        for year in range(start_year, end_year + 1):
            if start_year == end_year:
                year_start_month = start_month
                year_end_month = end_month
            else:
                if year == start_year:
                    year_start_month = start_month
                    year_end_month = 12
                elif year == end_year:
                    year_start_month = 1
                    year_end_month = end_month
                else:
                    year_start_month = 1
                    year_end_month = 12

            for month in range(year_start_month, year_end_month + 1):
                if year_start_month == year_end_month:
                    start_day_tmp = start_day
                    end_day_tmp = end_day
                else:
                    if month == year_start_month:
                        start_day_tmp = start_day
                        end_day_tmp = calendar.monthrange(year, month)[1]
                    elif month == year_end_month:
                        start_day_tmp = 1
                        end_day_tmp = end_day
                    else:
                        start_day_tmp = 1
                        end_day_tmp = calendar.monthrange(year, month)[1]

                for month_day in range(start_day_tmp, end_day_tmp + 1):
                    if len(str(month)) == 1:
                        month = "0" + str(month)
                    if len(str(month_day)) == 1:
                        month_day = "0" + str(month_day)

                    # url with date info but no page parameter
                    url = category_url + str(year) + str(month) + str(
                        month_day)

                    # totalpage is found by exploiting Naver's page structure: request page=10000
                    # since page=10000 does not exist, Naver redirects to the last page of the article list (page=totalpage)
                    totalpage = 0
                    totalpage = self.parser.find_news_totalpage(url +
                                                                "&page=10000")

                    for page in range(1, totalpage + 1):
                        if totalpage:
                            total_url_list.append(url + "&page=" + str(page))

        # pad month and day to two digits for printing
        print_start_month = self.appendI2S(start_month)
        print_end_month = self.appendI2S(end_month)

        print_start_day = self.appendI2S(start_day)
        print_end_day = self.appendI2S(end_day)

        print('Crawling date range: ' + str(start_year) +
              str(print_start_month) + str(print_start_day) + '~' +
              str(end_year) + str(print_end_month) + str(print_end_day))
        return total_url_list

    def crawling(self, category_name):
        # MultiThread PID
        print(category_name + " PID: " + str(os.getpid()))

        # pad the month/day digits used in the csv file name
        save_start_month = self.appendI2S(self.date['start_month'])
        save_end_month = self.appendI2S(self.date['end_month'])
        save_start_day = self.appendI2S(self.date['start_day'])
        save_end_day = self.appendI2S(self.date['end_day'])

        # CSV file to store articles for each category
        # Windows use euc-kr
        file = open(dataset_location + 'Article_' + category_name + '_' +
                    str(self.date['start_year']) + save_start_month +
                    save_start_day + '_' + str(self.date['end_year']) +
                    save_end_month + save_end_day + '.csv',
                    'w',
                    encoding='euc-kr',
                    newline='')

        wcsv = csv.writer(file)
        del save_start_month, save_end_month

        # 기사 리스트 URL 형식
        url = "https://sports.news.naver.com/" + str(
            self.categories.get(
                category_name)) + "/news/index.nhn?isphoto=N&date="
        print(url)
        # collect articles from start_year/start_month through end_year/end_month
        #url_list = self.make_news_page_url(url, self.date['start_year'], self.date['end_year'], self.date['start_month'], self.date['end_month'])
        url_list = self.make_news_page_url(url, self.date['start_year'],
                                           self.date['end_year'],
                                           self.date['start_month'],
                                           self.date['end_month'],
                                           self.date['start_day'],
                                           self.date['end_day'])

        print(category_name + " Urls are generated")
        print("The crawler starts")
        print("=========================================")
        article_count = 0

        for URL in url_list:
            regex = re.compile(r"date=(\d+)")
            news_date = regex.findall(URL)[0]

            driver = webdriver.Chrome('chromedriver.exe_location')

            driver.get(URL)

            html = driver.page_source  # load the page html
            bs_obj = BeautifulSoup(html, 'html.parser')

            # use the html classes to collect the articles on each page
            article_url_list = bs_obj.select('.news_list .text')

            # store the url of each article on the page
            post_url_list = []
            for line in article_url_list:
                post_url_list.append("https://sports.news.naver.com" +
                                     line.a.get('href')
                                     )  # put the URL of every article on this page into the list
            del article_url_list

            for content_url in post_url_list:  # each article URL
                # crawl delay
                sleep(0.01)
                # fetch the article HTML
                request_content = requests.get(content_url)
                document_content = BeautifulSoup(request_content.content,
                                                 'html.parser')

                try:

                    # get the article headline
                    tag_headline = document_content.find_all(
                        'h4', {'class': 'title'})
                    text_headline = ''  # initialize the headline
                    text_headline = text_headline + self.parser.clear_headline(
                        str(tag_headline[0].find_all(text=True)))
                    if not text_headline:  # skip the article if the headline is blank
                        continue

                    # get the article publish time (오전/오후 hh:mm)
                    tag_time = document_content.find('div', {
                        'class': 'info'
                    }).find('span')

                    regex = re.compile(r"오[전후]\s\d\d:\d\d")
                    match = regex.findall(str(tag_time))[0]

                    text_time = ''  # initialize the time string
                    text_time = text_time + match

                    if not text_time:  # stop if the time is blank
                        exit()

                    # get the article body
                    tag_content = document_content.find_all(
                        'div', {'id': 'newsEndContents'})
                    text_sentence = ''  # initialize the body text
                    text_sentence = text_sentence + self.parser.clear_content(
                        str(tag_content[0].find_all(text=True)))
                    if not text_sentence:  # skip the article if the body is blank
                        continue

                    # write the row to CSV
                    wcsv.writerow([
                        news_date, text_time, text_headline, text_sentence,
                        content_url
                    ])
                    article_count = article_count + 1

                    del text_sentence, text_headline, text_time
                    del tag_content, tag_headline
                    del request_content, document_content

                except Exception as ex:  # UnicodeEncodeError ..
                    # wcsv.writerow([ex, content_url])
                    del request_content, document_content
                    pass

        print("The crawler finished!!")
        print("Number of crawling articles : " + str(article_count))
        file.close()

    def appendI2S(self, input_int):
        if len(str(input_int)) == 1:
            out_string = "0" + str(input_int)
        else:
            out_string = str(input_int)
        return out_string

    def start(self):
        # start multi-process crawling
        for category_name in self.selected_categories:
            proc = Process(target=self.crawling, args=(category_name, ))
            proc.start()
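A usage sketch for this sports crawler, following the set_category / set_date_range / start API shown above (the category keys and dates are placeholders; a valid chromedriver path and dataset_location are assumed to be configured elsewhere):

if __name__ == '__main__':
    crawler = ArticleCrawler()
    crawler.set_category('야구', '해외축구')          # keys of self.categories
    crawler.set_date_range(2020, 1, 1, 2020, 1, 31)   # start y/m/d, end y/m/d
    crawler.start()                                   # spawns one Process per category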