Example #1
from bs4 import BeautifulSoup
from util import logger
import asyncio
import aiohttp
import re


log = logger.get(__name__)


async def grab_url(url, max_retry=5):
    text = None
    retry = False
    try:
        # TODO: also handle
        #   TimeoutError: [Errno 60] Operation timed out
        #   Fatal read error on socket transport
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as response:
                text = await response.read()
                assert response.status == 200
    except (AssertionError, aiohttp.ClientOSError, aiohttp.ClientResponseError):
        # Back off a little longer on each retry: 1s, 2s, ... as max_retry
        # counts down from 5.
        await asyncio.sleep(6 - max_retry)
        retry = True
    if retry:
        if max_retry == 0:
            raise RuntimeError('Too many attempts to download %s' % url)
        return await grab_url(url, max_retry - 1)
    log.debug('Retrieved %s', url)
    return text
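
A quick way to exercise grab_url on its own is a small driver like the one below (a hypothetical sketch, not part of the original example; the URL is a placeholder). It feeds the downloaded bytes to BeautifulSoup, which the module already imports:

import asyncio

from bs4 import BeautifulSoup


async def demo():
    # Placeholder URL; grab_url retries up to five times before raising.
    html = await grab_url('https://example.com/')
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.title.string if soup.title else 'no <title> found')


if __name__ == '__main__':
    asyncio.run(demo())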
Example #2
from util.env import log_dir
from crawler import crawler
from daemon import Daemon
from util import logger
import asyncio
import sys

log = logger.get(__name__)


class CrawlerDaemon(Daemon):
    def run(self):
        # new_event_loop() avoids the implicit get_event_loop(), which is
        # deprecated when no loop is running yet.
        loop = asyncio.new_event_loop()
        try:
            while True:
                # One crawl pass per iteration; crawler.main presumably drives
                # the loop to completion before returning, so this repeats
                # forever until the daemon is stopped.
                log.info("crawler.main")
                crawler.main(loop)
        finally:
            loop.close()
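
The Daemon base class is not shown on this page; judging by the pidfile argument and the start/stop/restart calls, it follows the classic Unix double-fork daemon pattern. A compressed sketch of the assumed interface (an assumption, not the project's actual implementation; error handling and stdio redirection omitted):

import atexit
import os
import signal
import sys
import time


class Daemon:
    """Assumed base class: subclass and override run()."""

    def __init__(self, pidfile):
        self.pidfile = pidfile

    def start(self):
        # Double-fork so the process detaches from the controlling terminal,
        # record the pid, then enter the subclass's run() loop.
        if os.fork() > 0:
            sys.exit(0)
        os.setsid()
        if os.fork() > 0:
            sys.exit(0)
        with open(self.pidfile, 'w') as f:
            f.write(str(os.getpid()))
        atexit.register(lambda: os.remove(self.pidfile))
        self.run()

    def stop(self):
        # Look up the running daemon via the pidfile and signal it.
        with open(self.pidfile) as f:
            pid = int(f.read().strip())
        os.kill(pid, signal.SIGTERM)

    def restart(self):
        self.stop()
        time.sleep(1)
        self.start()

    def run(self):
        raise NotImplementedError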


if __name__ == '__main__':
    daemon = CrawlerDaemon(log_dir() + '/newsdiff_crawler.pid')
    if len(sys.argv) == 2:
        if 'start' == sys.argv[1]:
            daemon.start()
        elif 'stop' == sys.argv[1]:
            daemon.stop()
        elif 'restart' == sys.argv[1]:
            daemon.restart()
        else:
            # Unknown command: exit with the conventional usage status.
            print('Unknown command: %s' % sys.argv[1])
            sys.exit(2)
        sys.exit(0)
    else:
        print('usage: %s start|stop|restart' % sys.argv[0])
        sys.exit(2)
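
Together the two branches give the daemon the conventional command-line interface: the pidfile written under log_dir() is how stop and restart find the running process, so invoking the script with start (e.g. python crawler_daemon.py start, assuming that file name) forks the crawl loop into the background, and stop terminates it via the recorded pid.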