def test_asyncio_engine(self):
    """Pump a DummyCrawler subclass through AioPomp over the aiohttp
    downloader and verify every page of the test sitemap was requested.
    """
    prefixer = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collector = CollectRequestResponseMiddleware()

    downloader = AiohttpDownloader(
        middlewares=[AiohttpAdapterMiddleware(), collector]
    )
    # URL prefixing has to happen before every other middleware runs.
    downloader.middlewares.insert(0, prefixer)

    pomp = AioPomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    loop = asyncio.get_event_loop()
    loop.run_until_complete(ensure_future(pomp.pump(Crawler())))
    loop.close()

    # Strip the server location so request URLs compare against sitemap keys.
    visited = set(
        r.url.replace(self.httpd.location, '')
        for r in collector.requests
    )
    assert_set_equal(visited, set(self.httpd.sitemap.keys()))
def test_asyncio_engine_with_aio_crawler(self):
    """A native-coroutine crawler (AioCrawler) must visit the whole sitemap."""
    pomp = AioPomp(
        downloader=self.downloader,
        middlewares=self.middlewares,
        pipelines=[],
    )

    loop = asyncio.get_event_loop()
    loop.run_until_complete(pomp.pump(AioCrawler()))

    # Compare the collected request URLs (server prefix stripped) with the
    # sitemap the test HTTP server serves.
    visited = {
        r.url.replace(self.httpd.location, '')
        for r in self.collect_middleware.requests
    }
    assert visited == set(self.httpd.sitemap.keys())
def test_asyncio_pipelines(self):
    """Sync and async pipelines must both receive items, in equal numbers."""
    sync_pipe = SyncPipeline()
    async_pipe = AsyncPipeline()

    pomp = AioPomp(
        downloader=self.downloader,
        middlewares=self.middlewares,
        pipelines=[sync_pipe, async_pipe],
    )

    loop = asyncio.get_event_loop()
    loop.run_until_complete(pomp.pump(Crawler()))

    # Both pipelines saw at least one item, and the same number of them.
    assert sync_pipe.items_count
    assert async_pipe.items_count
    assert sync_pipe.items_count == async_pipe.items_count
def start_crawler(loop):
    """Assemble the crawl engine (Redis queue, metrics, middlewares,
    pipelines) and pump the CraigsList crawler until it is exhausted.

    :param loop: asyncio event loop the crawler and its helper tasks run on.
    """
    redis = yield from get_redis(loop)
    queue = RedisQueue(redis)

    # TODO only one instance of the crawler must gather queue size

    # start gather queue size metrics
    statsd = get_statsd_client()

    @asyncio.coroutine
    def _publish_queue_size_metric():
        # One-shot sample: wait 5s, then report the current queue depth.
        yield from asyncio.sleep(5.0)
        statsd.gauge(
            METRIC_QUEUE_SIZE_KEY,
            (yield from queue.qsize())
        )

    # Schedule the metric publisher exactly once. (Bug fix: the original
    # code scheduled this task twice, reporting the same gauge twice.)
    asyncio.Task(_publish_queue_size_metric(), loop=loop)

    # configure engine
    pomp = AioPomp(
        downloader=AiohttpDownloader(
            max_concurent_request_count=3,
        ),
        queue=queue,
        middlewares=(
            LogExceptionMiddleware(),
            MetricsMiddleware(),
        ),
        pipelines=(
            ItemLogPipeline(),
            KafkaPipeline(),
            MetricsPipeline(),
        ),
    )

    # start
    yield from pomp.pump(AioConcurrentCrawler(
        worker_class=CraigsListCrawler,
        pool_size=2,
    ))

    # Release the Redis connection once pumping has finished.
    redis.close()
# NOTE(review): the first statements here are the tail of an enclosing
# generator function whose ``def`` line is outside this chunk —
# ``future`` and ``Planned`` come from that enclosing scope.
planned = Planned()

def build_response(f):
    # Propagate the completed future's result into the Planned placeholder.
    planned.set_result(f.result())

future.add_done_callback(build_response)
yield planned


if __name__ == '__main__':
    from pomp.contrib.asynciotools import AioPomp
    from e02_dmoz import (
        PrintPipeline,
        DmozSpider,
        LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    loop = asyncio.get_event_loop()

    # Kept in a local so the gathered statistics can be printed after the
    # crawl completes.
    statistics = StatisticMiddleware()

    pomp = AioPomp(
        downloader=AiohttpDownloader(
            # Downloader-level middlewares: stats collection plus LXML parsing.
            middlewares=(
                statistics,
                LXMLDownloaderMiddleware(),
            ),
        ),
        pipelines=[PrintPipeline()],
    )

    # Run the DMOZ spider to completion, then shut the loop down.
    loop.run_until_complete(pomp.pump(DmozSpider()))
    loop.close()

    print("Statistics:\n %s" % statistics)
# NOTE(review): this ``except`` clause is the tail of a ``try`` block
# inside an enclosing function (likely the downloader's fetch method);
# the ``def`` and ``try`` lines are outside this chunk.
except Exception as e:
    log.exception("[AiohttpDownloader] exception on %s", request)
    # Wrap the failure so the engine routes it through its crawl-exception
    # handling instead of letting it propagate and kill the worker.
    return BaseCrawlException(
        request=request,
        response=None,
        exception=e,
        exc_info=sys.exc_info(),
    )


if __name__ == '__main__':
    from pomp.contrib.asynciotools import AioPomp
    from e02_quotes import (
        PrintPipeline,
        QuotesSpider,
        LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    loop = asyncio.get_event_loop()

    # Kept in a local so the gathered statistics can be printed after the
    # crawl completes.
    statistics = StatisticMiddleware()

    pomp = AioPomp(
        downloader=AiohttpDownloader(),
        # Engine-level middlewares: stats collection plus LXML parsing.
        middlewares=(
            statistics,
            LXMLDownloaderMiddleware(),
        ),
        pipelines=[PrintPipeline()],
    )

    # Run the quotes spider to completion, then shut the loop down.
    loop.run_until_complete(pomp.pump(QuotesSpider()))
    loop.close()

    print("Statistics:\n %s" % statistics)
# NOTE(review): this ``return`` is the tail of an enclosing function
# (exception-wrapping path of a downloader); its ``def`` line and the
# binding of ``e``/``request`` are outside this chunk.
return BaseCrawlException(
    request=request,
    response=None,
    exception=e,
    exc_info=sys.exc_info(),
)


if __name__ == '__main__':
    from pomp.contrib.asynciotools import AioPomp
    from e02_quotes import (
        PrintPipeline,
        QuotesSpider,
        LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    loop = asyncio.get_event_loop()

    # Kept in a local so the gathered statistics can be printed after the
    # crawl completes.
    statistics = StatisticMiddleware()

    pomp = AioPomp(
        downloader=AiohttpDownloader(),
        # Engine-level middlewares: stats collection plus LXML parsing.
        middlewares=(
            statistics,
            LXMLDownloaderMiddleware(),
        ),
        pipelines=[PrintPipeline()],
    )

    # Run the quotes spider to completion, then shut the loop down.
    loop.run_until_complete(pomp.pump(QuotesSpider()))
    loop.close()

    print("Statistics:\n %s" % statistics)