def crawl(self, min_crawl_page, crawler, start_url, context) -> Stream[dict]:
    new_articles = seq([])
    for i in range(0, self.max_crawl_page):
        # Result pages are 10 items wide; page i starts at item i * 10 + 1.
        start_index = i * 10 + 1
        url = start_url + '&start={}'.format(start_index)
        articles = crawler.crawl_url(url)
        articles = articles.map(lambda a: self.post_crawl(a, context)).cache()
        if articles.size() == 0:
            continue
        # Keep only articles that are not already stored in the database.
        curr_page_new_articles = self.data_base.filter_non_exist(articles)
        log.info(f'# new articles in page: {curr_page_new_articles.size()}')
        new_articles += curr_page_new_articles
        # Once the minimum number of pages has been crawled, stop at the first
        # page that yields no new articles.
        if i >= (min_crawl_page - 1) and curr_page_new_articles.size() == 0:
            break
    return new_articles
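# The early-stop condition in crawl() relies on data_base.filter_non_exist()
# returning only articles whose URLs are not yet stored. A minimal sketch of
# that contract, assuming a hypothetical in-memory backend keyed by 'url'
# (the real database_factory backend is defined elsewhere):
class InMemoryArticleDatabase:
    """Hypothetical stand-in illustrating the filter_non_exist/insert contract."""

    def __init__(self):
        self._seen_urls = set()

    def filter_non_exist(self, articles):
        # Keep only articles whose 'url' has not been inserted yet.
        return articles.filter(lambda a: a['url'] not in self._seen_urls).cache()

    def insert(self, article):
        self._seen_urls.add(article['url'])
        return article


# Usage sketch:
#   from functional import seq
#   db = InMemoryArticleDatabase()
#   db.insert({'url': 'https://example.com/a'})
#   db.filter_non_exist(seq([{'url': 'https://example.com/a'},
#                            {'url': 'https://example.com/b'}])).size()  # -> 1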
def crawl_url(self, url) -> Stream[dict]:
    # Throttle requests so consecutive page fetches are spaced out.
    time.sleep(self.sleep_time + random.random())
    log.info('start to crawl from url: {}'.format(url))
    page_html = requests.get(url).text
    try:
        root_node = BeautifulSoup(page_html, 'html.parser')
        return self.parse_soup(root_node).cache()
    except Exception:
        self.handle_parse_error(page_html, url)
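# crawl_url() delegates extraction to parse_soup(), which each concrete crawler
# implements for its channel's markup. A minimal sketch, assuming a hypothetical
# list page where every article sits in an <li class="item"> with an <a> link
# and a <span class="date"> (the real selectors depend on the portal/channel):
from bs4 import BeautifulSoup
from functional import seq


class ExampleListPageCrawler:
    """Hypothetical crawler showing only the parse_soup() contract."""

    def parse_soup(self, root_node: BeautifulSoup):
        items = root_node.select('li.item')
        return seq(items).map(lambda item: {
            'title': item.select_one('a').get_text(strip=True),
            'url': item.select_one('a')['href'],
            'posted_at': item.select_one('span.date').get_text(strip=True),
        })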
def test_append(self):
    append_test_sheet_name = 'append_test'
    create_sheet_res = self.spreadsheet_api.create_sheet(append_test_sheet_name)
    sheet_id = pydash.get(create_sheet_res,
                          'replies.0.addSheet.properties.sheetId')
    log.info(f'append test sheet id: {sheet_id}')
    try:
        self.spreadsheet_api.append(f'{append_test_sheet_name}!B2',
                                    [['1', '2', '3']])
        self.spreadsheet_api.append(f'{append_test_sheet_name}!B2',
                                    [['4', '5', '6']])
        r1 = self.spreadsheet_api.get(f'{append_test_sheet_name}!B2:D2')
        r2 = self.spreadsheet_api.get(f'{append_test_sheet_name}!B3:D3')
        self.assertListEqual(r1, [['1', '2', '3']])
        self.assertListEqual(r2, [['4', '5', '6']])
    finally:
        self.spreadsheet_api.delete_sheet(sheet_id)
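# Both appends in the test target 'B2', yet the second row lands on row 3. That
# matches the behavior of the Google Sheets v4 values.append endpoint, which
# writes below the last row of the table detected inside the given range. A
# sketch of how spreadsheet_api.append might wrap that call; the wrapper shape
# is an assumption, not the project's actual implementation:
def append_values(service, spreadsheet_id, range_name, values):
    # service is a googleapiclient resource, e.g.
    #   service = googleapiclient.discovery.build('sheets', 'v4', credentials=creds)
    return service.spreadsheets().values().append(
        spreadsheetId=spreadsheet_id,
        range=range_name,
        valueInputOption='RAW',
        insertDataOption='INSERT_ROWS',
        body={'values': values},
    ).execute()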
def __init__(self,
             portal,
             channel,
             crawling_context_sheet_class=CrawlingContextSheet,
             crawling_data_sheet_class=CrawlingDataSheet):
    context_dict = crawling_context_sheet_class().get()
    self.max_crawl_page = config.MAX_CRAWL_PAGE
    channel_key = get_channel_key(portal, channel)
    self.crawling_data_sheet_class = crawling_data_sheet_class
    self.data_base = database_factory.get(channel_key)
    log.info(
        f'Processor channel key: {channel_key}, max crawl page: {self.max_crawl_page}'
    )
    self.crawler = crawler_dict.get(channel_key)
    self.contexts = context_dict.get(channel_key)
    self.check_init()
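# check_init() is not shown in this section; presumably it validates that the
# (portal, channel) pair resolved to a registered crawler and at least one
# crawling context. A hedged sketch of that assumption only:
def check_init(self):
    # Hypothetical validation: fail fast when the channel key did not resolve.
    if self.crawler is None:
        raise ValueError('no crawler registered for this portal/channel')
    if not self.contexts:
        raise ValueError('no crawling contexts found for this portal/channel')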
def process_context(self, context: dict):
    context_start_time = time.time()
    channel_key = get_channel_key(context['portal'], context['channel'])
    log.info('start to crawl channel [{}].'.format(context))
    crawler = crawler_dict[channel_key]
    data_sheet = self.crawling_data_sheet_class(context)
    start_url = context['start_url']
    new_articles = self.crawl(int(context['crawl_page']), crawler, start_url,
                              context)
    # Deduplicate by URL and keep the sheet ordered by posting time.
    new_articles = new_articles.distinct_by(lambda a: a['url']).sorted(
        key=lambda d: d['posted_at'])
    log.info(f'# total new articles: {new_articles.size()}')
    if new_articles.size() > 0:
        new_articles = new_articles.map(self.data_base.insert).cache()
        data_sheet.append(new_articles)
        self.send_telegram_msg(channel_key, context, new_articles)
    log.info(
        f'finished crawling channel key [{channel_key}]. '
        f'# new articles: {new_articles.size()}, '
        f'crawling time: {time.time() - context_start_time:.1f}s'
    )
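# start(), called from the entry point below, is not shown in this section.
# Presumably it iterates self.contexts and runs process_context() for each,
# isolating failures per context. A hedged sketch of that assumption:
def start(self):
    # Hypothetical driver loop: keep going if one context fails.
    for context in self.contexts:
        try:
            self.process_context(context)
        except Exception:
            log.exception(f'failed to process context: {context}')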
import argparse

from processor.crawl_processor import CrawlProcessor
from definitions import log
import config

parser = argparse.ArgumentParser()
parser.add_argument('--portal', help='Portal name (in Korean), e.g. 네이버')
parser.add_argument('--channel', help='Channel name (in Korean), e.g. 뉴스')
parser.add_argument('--max_page',
                    type=int,
                    required=False,
                    help='Maximum number of pages to crawl',
                    default=config.MAX_CRAWL_PAGE)
parser.add_argument('--sleep',
                    type=int,
                    required=False,
                    help='Seconds to sleep between page crawls',
                    default=config.PAGE_CRAWL_TIME_GAP)

if __name__ == '__main__':
    # Parse arguments only when run as a script, not on import.
    args = parser.parse_args()
    log.info(f'args: {vars(args)}')
    portal = args.portal
    channel = args.channel
    config.MAX_CRAWL_PAGE = args.max_page
    config.PAGE_CRAWL_TIME_GAP = args.sleep
    crawl_processor = CrawlProcessor(portal, channel)
    crawl_processor.start()
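# Example invocation (the script filename is an assumption for illustration):
#   python main.py --portal 네이버 --channel 뉴스 --max_page 3 --sleep 2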