from bs4 import BeautifulSoup
from util import logger
import asyncio
import aiohttp
import re

log = logger.get(__name__)


@asyncio.coroutine
def grab_url(url, max_retry=5):
    text = None
    retry = False
    try:
        # todo:
        #   TimeoutError: [Errno 60] Operation timed out
        #   Fatal read error on socket transport
        response = yield from aiohttp.request('GET', url)
        text = yield from response.read()
        assert response.status == 200
    except (AssertionError, aiohttp.ClientOSError,
            aiohttp.ClientResponseError):
        # Back off a little longer on each successive failure before retrying.
        yield from asyncio.sleep(6 - max_retry)
        retry = True
    if retry:
        if max_retry == 0:
            raise RuntimeError('Too many attempts to download %s' % url)
        # Retry by re-entering the coroutine with one fewer attempt left.
        return (yield from grab_url(url, max_retry - 1))
    log.debug('Retrieved %s', url)
    return text
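
# Illustrative sketch, not part of the original module: grab_url() can be
# scheduled for several pages at once with asyncio.gather(), so the downloads
# overlap instead of running one after another. The helper name grab_many and
# its argument are hypothetical.
@asyncio.coroutine
def grab_many(urls):
    # Each grab_url() call yields to the event loop while waiting on the
    # network; gather() collects the results in the order of `urls`.
    pages = yield from asyncio.gather(*[grab_url(u) for u in urls])
    return pages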
from util.env import log_dir
from crawler import crawler
from daemon import Daemon
from util import logger
import asyncio
import sys

log = logger.get(__name__)


class CrawlerDaemon(Daemon):

    def run(self):
        loop = asyncio.get_event_loop()
        try:
            while True:
                log.info("crawler.main")
                crawler.main(loop)
        finally:
            loop.close()


if __name__ == '__main__':
    daemon = CrawlerDaemon(log_dir() + '/newsdiff_crawler.pid')
    if len(sys.argv) == 2:
        if 'start' == sys.argv[1]:
            daemon.start()
        elif 'stop' == sys.argv[1]:
            daemon.stop()
        elif 'restart' == sys.argv[1]:
            daemon.restart()
        else:
            # Assumed completion: the original snippet is truncated at this
            # branch; reject unrecognised commands with a non-zero exit code.
            print('Unknown command: %s' % sys.argv[1])
            sys.exit(2)
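
# Example invocation (sketch; assumes this file is saved as crawler_daemon.py):
#   python crawler_daemon.py start     # daemonize and start crawling
#   python crawler_daemon.py stop      # stop the process recorded in the pid file
#   python crawler_daemon.py restart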