Example #1
    def crawl(self, min_crawl_page, crawler, start_url,
              context) -> Stream[dict]:
        new_articles = seq([])

        # Walk the portal's result pages; the paging parameter advances in
        # steps of 10 (1, 11, 21, ...).
        for i in range(self.max_crawl_page):
            start_index = i * 10 + 1
            url = start_url + '&start={}'.format(start_index)
            articles = crawler.crawl_url(url)

            # Enrich each article with the channel context; cache() keeps the
            # lazy stream from being re-evaluated by the size() calls below.
            articles = articles.map(
                lambda a: self.post_crawl(a, context)).cache()

            if articles.size() == 0:
                continue

            # Keep only the articles that are not in the database yet.
            curr_page_new_articles = self.data_base.filter_non_exist(articles)
            log.info(
                f'# new articles in page: {curr_page_new_articles.size()}')

            new_articles += curr_page_new_articles

            # Stop early once the minimum number of pages has been visited
            # and a whole page produced nothing new.
            if i >= min_crawl_page - 1 and curr_page_new_articles.size() == 0:
                break

        return new_articles
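
The loop keeps paging as long as fresh articles keep appearing, but always visits at least min_crawl_page pages before it is allowed to stop. A minimal standalone sketch of that early-stop pattern, with a hypothetical fetch_page stub in place of crawler.crawl_url and plain lists instead of the project's Stream type:

def crawl_pages(fetch_page, seen_urls, min_pages, max_pages):
    """Collect unseen articles page by page, stopping early once a whole page
    past the minimum yields nothing new (hypothetical helper)."""
    new_articles = []
    for i in range(max_pages):
        page = fetch_page(i * 10 + 1)          # the portal pages in steps of 10
        fresh = [a for a in page if a['url'] not in seen_urls]
        new_articles.extend(fresh)
        seen_urls.update(a['url'] for a in fresh)
        if i >= min_pages - 1 and not fresh:   # minimum reached, page was stale
            break
    return new_articles


# Toy run: two pages of fake articles, one of them already known.
pages = {1: [{'url': 'u1'}, {'url': 'u2'}], 11: [{'url': 'u2'}]}
print(crawl_pages(lambda start: pages.get(start, []), {'u2'},
                  min_pages=1, max_pages=5))   # -> [{'url': 'u1'}]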
Example #2
  def crawl_url(self, url) -> Stream[dict]:
    # Throttle requests: wait the configured gap plus a little random jitter
    # so the portal is not hit at a fixed interval.
    time.sleep(self.sleep_time + random.random())
    log.info('start to crawl from url: {}'.format(url))

    page_html = requests.get(url).text
    try:
      root_node = BeautifulSoup(page_html, 'html.parser')
      return self.parse_soup(root_node).cache()
    except Exception:
      # Log the offending page for later inspection; note that this branch
      # has no return statement, so the method yields None on failure.
      self.handle_parse_error(page_html, url)
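
Because the except branch returns nothing, crawl_url yields None whenever parse_soup raises, so a caller that immediately chains .map() onto the result will fail. A hedged sketch of a defensive wrapper (not part of the original code, and assuming seq comes from PyFunctional, which the seq([]) / .cache() / .size() calls suggest):

from functional import seq

def safe_crawl_url(crawler, url):
    """Hypothetical wrapper: fall back to an empty stream when crawl_url
    returns None because parsing failed."""
    articles = crawler.crawl_url(url)
    return articles if articles is not None else seq([])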
Example #3
    def test_append(self):
        append_test_sheet_name = 'append_test'
        create_sheet_res = self.spreadsheet_api.create_sheet(
            append_test_sheet_name)
        sheet_id = pydash.get(create_sheet_res,
                              'replies.0.addSheet.properties.sheetId')
        log.info(f'append test sheet id: {sheet_id}')

        try:
            # Two consecutive appends anchored at the same cell should land on
            # consecutive rows: B2 first, then B3.
            self.spreadsheet_api.append(f'{append_test_sheet_name}!B2',
                                        [['1', '2', '3']])
            self.spreadsheet_api.append(f'{append_test_sheet_name}!B2',
                                        [['4', '5', '6']])

            r1 = self.spreadsheet_api.get(f'{append_test_sheet_name}!B2:D2')
            r2 = self.spreadsheet_api.get(f'{append_test_sheet_name}!B3:D3')
            self.assertListEqual(r1, [['1', '2', '3']])
            self.assertListEqual(r2, [['4', '5', '6']])
        finally:
            # Always drop the scratch sheet, even if an assertion fails.
            self.spreadsheet_api.delete_sheet(sheet_id)
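
The deep path handed to pydash.get mirrors the shape of a Google Sheets batchUpdate response, where each reply carries the properties of the sheet it created. A tiny standalone illustration of that traversal (response values made up for the example):

import pydash

create_sheet_res = {
    'replies': [{'addSheet': {'properties': {'sheetId': 123456,
                                             'title': 'append_test'}}}]
}
print(pydash.get(create_sheet_res,
                 'replies.0.addSheet.properties.sheetId'))       # 123456
print(pydash.get(create_sheet_res,
                 'replies.0.addSheet.properties.index', -1))     # -1 (default)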
Example #4
    def __init__(self,
                 portal,
                 channel,
                 crawling_context_sheet_class=CrawlingContextSheet,
                 crawling_data_sheet_class=CrawlingDataSheet):
        # Crawling contexts (start URL, crawl page count, ...) are read from a
        # spreadsheet and keyed by channel.
        context_dict = crawling_context_sheet_class().get()
        self.max_crawl_page = config.MAX_CRAWL_PAGE
        channel_key = get_channel_key(portal, channel)

        self.crawling_data_sheet_class = crawling_data_sheet_class
        self.data_base = database_factory.get(channel_key)

        log.info(
            f'Processor channel key: {channel_key}, max crawl page: {self.max_crawl_page}'
        )

        # Resolve the crawler and the contexts registered for this channel.
        self.crawler = crawler_dict.get(channel_key)
        self.contexts = context_dict.get(channel_key)

        self.check_init()
Example #5
    def process_context(self, context: dict):
        context_start_time = time.time()

        channel_key = get_channel_key(context['portal'], context['channel'])
        log.info('start to crawl channel [{}].'.format(context))

        crawler = crawler_dict[channel_key]
        data_sheet = self.crawling_data_sheet_class(context)
        start_url = context['start_url']
        new_articles = self.crawl(int(context['crawl_page']), crawler,
                                  start_url, context)
        # Drop duplicate URLs and keep the articles in posting order.
        new_articles = new_articles.distinct_by(lambda a: a['url']).sorted(
            key=lambda d: d['posted_at'])

        log.info(f'# total new articles: {new_articles.size()}')

        if new_articles.size() > 0:
            # Persist the new articles, mirror them to the data sheet, and
            # send a Telegram notification.
            new_articles = new_articles.map(self.data_base.insert).cache()
            data_sheet.append(new_articles)

            self.send_telegram_msg(channel_key, context, new_articles)

        log.info(
            f'finished crawling channel [{channel_key}]. # new articles: {new_articles.size()}, crawling time: {time.time() - context_start_time}'
        )
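
The dedupe-and-sort step leans on distinct_by and sorted from the stream library (presumably PyFunctional, judging by the seq / cache / size API); in isolation, with toy records, it behaves like this:

from functional import seq

articles = seq([
    {'url': 'a', 'posted_at': '2023-01-02'},
    {'url': 'a', 'posted_at': '2023-01-02'},   # duplicate URL is dropped
    {'url': 'b', 'posted_at': '2023-01-01'},
])
result = (articles.distinct_by(lambda a: a['url'])
                  .sorted(key=lambda d: d['posted_at']))
print(result.to_list())
# [{'url': 'b', 'posted_at': '2023-01-01'}, {'url': 'a', 'posted_at': '2023-01-02'}]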
Example #6
import argparse
from processor.crawl_processor import CrawlProcessor
from definitions import log
import config

parser = argparse.ArgumentParser()
parser.add_argument('--portal', help='portal name (in Korean), e.g. 네이버')
parser.add_argument('--channel', help='channel name (in Korean), e.g. 뉴스')
parser.add_argument('--max_page', type=int, required=False,
                    help='maximum number of pages to crawl',
                    default=config.MAX_CRAWL_PAGE)
parser.add_argument('--sleep', type=int, required=False,
                    help='sleep time between page crawls (seconds)',
                    default=config.PAGE_CRAWL_TIME_GAP)
args = parser.parse_args()

if __name__ == '__main__':
  log.info(f'args: {vars(args)}')

  portal = args.portal
  channel = args.channel

  # Override the configured defaults before CrawlProcessor reads them.
  config.MAX_CRAWL_PAGE = args.max_page
  config.PAGE_CRAWL_TIME_GAP = args.sleep

  crawl_processor = CrawlProcessor(portal, channel)
  crawl_processor.start()
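
Assuming this entry module is saved as main.py (the file name is not shown here), a typical invocation would be python main.py --portal 네이버 --channel 뉴스, with --max_page and --sleep falling back to the config defaults when omitted. The same parsing can be exercised in-process with an explicit argument list:

args = parser.parse_args(['--portal', '네이버', '--channel', '뉴스', '--max_page', '3'])
print(vars(args))  # {'portal': '네이버', 'channel': '뉴스', 'max_page': 3, 'sleep': <config default>}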