Example #1
@asyncio.coroutine
def start(self):
    # Seed the queue with the starting URLs, run a fixed pool of worker
    # tasks, and cancel them once the queue has been fully drained.
    self._seen = set()
    self._queue = Queue(loop=self._loop)
    for url in self.config['urls']:
        self._queue.put_nowait(Request(url))
    workers = [asyncio.Task(self._work()) for _ in range(self.config['concurrency'])]
    yield from self._queue.join()
    for worker in workers:
        worker.cancel()
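The fragment above assumes a _work coroutine that drains the queue; the full listings below define it, along these lines:

@asyncio.coroutine
def _work(self):
    # Each worker pulls requests off the queue forever; queue.join() in
    # start() returns once every item has been matched by task_done().
    while True:
        request = yield from self._queue.get()
        yield from self._fetch(request)
        self._queue.task_done()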
Example #2
import abc
import asyncio
import contextlib
from asyncio import Queue

import aiohttp

# Request, Response and CrawlerMiddlewareManager are project-local helpers
# not shown in this listing.


class Spider(metaclass=abc.ABCMeta):

    def __init__(self, middlewares=None, loop=None, **config):
        self.config = config
        self._context = {}
        self._loop = loop or asyncio.get_event_loop()
        self._connector = aiohttp.TCPConnector(loop=self._loop)
        self._middlewares = CrawlerMiddlewareManager(self, middlewares)

    def enqueue_request(self, **kwargs):
        context = self._context[self._task]
        max_depth = self.config.get('max_depth')
        if max_depth and context['request'].depth > max_depth:
            return
        request = Request(referer=context['response'], **kwargs)
        if request.url in self._seen:
            return
        if not self._url_allowed(request):
            return
        request.depth = context['response'].request.depth + 1
        self._seen.add(request.url)  # mark as seen so a URL is queued only once
        self._queue.put_nowait(request)

    def _url_allowed(self, request):
        # Only follow URLs whose host falls under one of the allowed domains.
        return any(
            request.furl.host.endswith(domain)
            for domain in self.config['domains']
        )

    @asyncio.coroutine
    def start(self):
        self._seen = set()
        self._queue = Queue(loop=self._loop)
        for url in self.config['urls']:
            self._queue.put_nowait(Request(url))
        workers = [asyncio.Task(self._work()) for _ in range(self.config['concurrency'])]
        yield from self._queue.join()
        for worker in workers:
            worker.cancel()

    @asyncio.coroutine
    def _work(self):
        while True:
            request = yield from self._queue.get()
            yield from self._fetch(request)
            self._queue.task_done()

    @asyncio.coroutine
    def _fetch(self, request):
        for callback in self._middlewares['before_request']:
            request = callback(request)
        resp = yield from aiohttp.request('GET', request.url, loop=self._loop)
        body = yield from resp.read_and_close()
        response = Response(request, resp, body)
        for callback in self._middlewares['after_response']:
            response = callback(response)
        with self._request_context(request, response):
            self.parse(response)

    @property
    def _task(self):
        return asyncio.Task.current_task(loop=self._loop)

    @contextlib.contextmanager
    def _request_context(self, request, response):
        self._context[self._task] = {'request': request, 'response': response}
        try:
            yield
        finally:
            del self._context[self._task]

    @abc.abstractmethod
    def parse(self, response):
        pass
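A minimal usage sketch for the class above (PrintSpider, the seed URL and the config values are illustrative, not part of the listing):

import asyncio

class PrintSpider(Spider):
    def parse(self, response):
        # A real spider would inspect the body here and call
        # self.enqueue_request(url=...) to follow links.
        print(response.request.url)

loop = asyncio.get_event_loop()
spider = PrintSpider(loop=loop,
                     urls=['http://example.com'],
                     domains=['example.com'],
                     concurrency=3,
                     max_depth=2)
loop.run_until_complete(spider.start())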
Example #3
import abc
import asyncio
import contextlib
from asyncio import Queue

import aiohttp

# Request, Response, SpiderMiddlewareManager, Config, default_config and
# get_root_path are project-local helpers not shown in this listing.


class Spider(metaclass=abc.ABCMeta):

    def __init__(self, import_name, middlewares=None, loop=None, session=None):
        self.import_name = import_name
        self.root_path = get_root_path(import_name)
        self.config = Config(self.root_path, default_config)

        self._context = {}
        self._loop = loop or asyncio.get_event_loop()
        self._middlewares = SpiderMiddlewareManager(self, middlewares)
        self._session = session or aiohttp.ClientSession(loop=self._loop)

    def enqueue_request(self, **kwargs):
        context = self._context[self._task]
        max_depth = self.config.get('MAX_DEPTH')
        if max_depth and context['request'].depth > max_depth:
            return
        request = Request(referer=context['response'], **kwargs)
        if request.url in self._seen:
            return
        if not self._url_allowed(request):
            return
        request.depth = context['response'].request.depth + 1
        self._seen.add(request.url)  # mark as seen so a URL is queued only once
        self._queue.put_nowait(request)

    def run(self):
        self._loop.run_until_complete(self.start())

    @asyncio.coroutine
    def start(self):
        self._seen = set()
        self._queue = Queue(loop=self._loop)
        for url in self.config['URLS']:
            self._queue.put_nowait(Request(url))
        workers = [asyncio.Task(self._work()) for _ in range(self.config['CONCURRENCY'])]
        yield from self._queue.join()
        for worker in workers:
            worker.cancel()

    @asyncio.coroutine
    def _work(self):
        while True:
            request = yield from self._queue.get()
            yield from self._fetch(request)
            self._queue.task_done()

    @asyncio.coroutine
    def _fetch(self, request):
        for callback in self._middlewares['before_request']:
            request = callback(self, request)
        resp = yield from self._session.request('get', **request.params)
        body = yield from resp.read_and_close()
        response = Response(request, resp, body)
        for callback in self._middlewares['after_response']:
            response = callback(self, response)
        with self._request_context(request, response):
            self.parse(response)

    @property
    def _task(self):
        return asyncio.Task.current_task(loop=self._loop)

    @contextlib.contextmanager
    def _request_context(self, request, response):
        self._context[self._task] = {'request': request, 'response': response}
        try:
            yield
        finally:
            del self._context[self._task]

    def _url_allowed(self, request):
        # Only follow URLs whose host falls under one of the allowed domains.
        return any(
            request.furl.host.endswith(domain)
            for domain in self.config['DOMAINS']
        )

    @abc.abstractmethod
    def parse(self, response):
        pass
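A usage sketch for this variant, which adds run() and a Flask-style Config; DocsSpider and the values shown are illustrative, and it assumes Config behaves like a dict:

class DocsSpider(Spider):
    def parse(self, response):
        print(response.request.url)

spider = DocsSpider(__name__)
spider.config.update(
    URLS=['http://example.com'],
    DOMAINS=['example.com'],
    CONCURRENCY=3,
    MAX_DEPTH=2,
)
spider.run()  # drives the event loop via run_until_complete(self.start())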