Example #1
    def test_asyncio_engine(self):
        req_resp_middleware = RequestResponseMiddleware(
            prefix_url=self.httpd.location,
            request_factory=lambda x: x,
        )

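        # collect every request/response seen by the downloader so the test can inspect them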
        collect_middleware = CollectRequestResponseMiddleware()

        downloader = AiohttpDownloader(
            middlewares=[AiohttpAdapterMiddleware(), collect_middleware]
        )

        downloader.middlewares.insert(0, req_resp_middleware)

        pomp = AioPomp(
            downloader=downloader,
            pipelines=[],
        )

        class Crawler(DummyCrawler):
            ENTRY_REQUESTS = '/root'

        loop = asyncio.get_event_loop()
        loop.run_until_complete(ensure_future(pomp.pump(Crawler())))
        loop.close()

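        # every path in the test server's sitemap must have been requested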
        assert_set_equal(
            set([r.url.replace(self.httpd.location, '')
                for r in collect_middleware.requests]),
            set(self.httpd.sitemap.keys())
        )
Example #2
    def test_asyncio_engine_with_aio_crawler(self):
        pomp = AioPomp(
            downloader=self.downloader,
            middlewares=self.middlewares,
            pipelines=[],
        )

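        # the crawl itself is driven by an asyncio-aware crawler (AioCrawler); the engine setup stays the same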
        loop = asyncio.get_event_loop()
        loop.run_until_complete(pomp.pump(AioCrawler()))

        assert \
            set([r.url.replace(self.httpd.location, '')
                for r in self.collect_middleware.requests]) == \
            set(self.httpd.sitemap.keys())
Example #3
    def test_asyncio_pipelines(self):
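        # a blocking pipeline and a coroutine-based pipeline run side by side in one engine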
        sync_pipeline = SyncPipeline()
        async_pipeline = AsyncPipeline()

        pomp = AioPomp(
            downloader=self.downloader,
            middlewares=self.middlewares,
            pipelines=[
                sync_pipeline,
                async_pipeline,
            ],
        )

        loop = asyncio.get_event_loop()
        loop.run_until_complete(pomp.pump(Crawler()))

        assert sync_pipeline.items_count
        assert async_pipeline.items_count
        assert sync_pipeline.items_count == async_pipeline.items_count
Example #4
@asyncio.coroutine
def start_crawler(loop):
    redis = yield from get_redis(loop)
    queue = RedisQueue(redis)

    # TODO: only one crawler instance should gather the queue size
    # start publishing queue-size metrics
    statsd = get_statsd_client()

    @asyncio.coroutine
    def _publish_queue_size_metric():
        yield from asyncio.sleep(5.0)
        statsd.gauge(
            METRIC_QUEUE_SIZE_KEY,
            (yield from queue.qsize())
        )
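        # re-schedule itself so the queue size is reported roughly every 5 seconds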
        asyncio.Task(_publish_queue_size_metric(), loop=loop)
    asyncio.Task(_publish_queue_size_metric(), loop=loop)

    # configure engine
    pomp = AioPomp(
        downloader=AiohttpDownloader(
            max_concurent_request_count=3,
        ),
        queue=queue,
        middlewares=(
            LogExceptionMiddleware(),
            MetricsMiddleware(),
        ),
        pipelines=(
            ItemLogPipeline(),
            KafkaPipeline(),
            MetricsPipeline(),
        ),
    )

    # start
    yield from pomp.pump(AioConcurrentCrawler(
        worker_class=CraigsListCrawler,
        pool_size=2,
    ))
    redis.close()
Example #5
            planned = Planned()

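            # hand the result of the asyncio future over to pomp through the Planned object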
            def build_response(f):
                planned.set_result(f.result())
            future.add_done_callback(build_response)

            yield planned


if __name__ == '__main__':
    from pomp.contrib.asynciotools import AioPomp
    from e02_dmoz import (
        PrintPipeline, DmozSpider, LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    loop = asyncio.get_event_loop()
    statistics = StatisticMiddleware()
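    # the statistics and LXML parsing middlewares are attached to the downloader here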
    pomp = AioPomp(
        downloader=AiohttpDownloader(
            middlewares=(
                statistics,
                LXMLDownloaderMiddleware(),
            ),
        ),
        pipelines=[PrintPipeline()],
    )
    loop.run_until_complete(pomp.pump(DmozSpider()))
    loop.close()
    print("Statistics:\n %s" % statistics)
Example #6
        except Exception as e:
            log.exception("[AiohttpDownloader] exception on %s", request)
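            # wrap the failure so the engine can process it as a crawl exception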
            return BaseCrawlException(
                request=request,
                response=None,
                exception=e,
                exc_info=sys.exc_info(),
            )


if __name__ == '__main__':
    from pomp.contrib.asynciotools import AioPomp
    from e02_quotes import (
        PrintPipeline, QuotesSpider, LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    loop = asyncio.get_event_loop()
    statistics = StatisticMiddleware()
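    # unlike the previous example, the middlewares are attached to the engine, not the downloader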
    pomp = AioPomp(
        downloader=AiohttpDownloader(),
        middlewares=(
            statistics,
            LXMLDownloaderMiddleware(),
        ),
        pipelines=[PrintPipeline()],
    )
    loop.run_until_complete(pomp.pump(QuotesSpider()))
    loop.close()
    print("Statistics:\n %s" % statistics)