Example #1: depth-first vs. breadth-first crawling (older API: middlewares on the downloader, CRAWL_METHOD on the crawler)
    def test_crawler_dive_methods(self):
        road = RoadPipeline()

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                road,
            ],
        )

        # Depth first method
        pomp.pump(Crawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]))

        # Width first method
        road.reset()

        class DummyWidthCrawler(Crawler):
            CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

        pomp.pump(DummyWidthCrawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/2',
            'http://python.org/1/trash',
        ]))
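
These tests lean on a few helpers that this page never defines. `RoadPipeline` can be reconstructed from how the assertions use it (`road.collection`, `road.reset()`): a pipeline that simply records every item it sees. A minimal sketch, assuming the `BasePipeline.process(crawler, item)` interface used elsewhere on this page:

# Sketch of the RoadPipeline fixture, inferred from its usage in these
# tests; the real helper lives in pomp's test suite.
class RoadPipeline(BasePipeline):

    def __init__(self):
        self.collection = []

    def process(self, crawler, item):
        # remember every item that reaches this pipeline
        self.collection.append(item)
        return item

    def reset(self):
        self.collection = []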
Example #2: the same depth/breadth test against the newer API (middlewares and breadth_first passed to Pomp)
    def test_dive_methods(self, crawler_class=None):
        crawler_class = crawler_class or Crawler
        road = RoadPipeline()

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
            breadth_first=False,
        )

        # Depth first method
        pomp.pump(crawler_class())

        log.debug("in road %s", [item.url for item in road.collection])
        assert [item.url for item in road.collection] == [
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]

        # Width first method
        road.reset()

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
            breadth_first=True,
        )
        pomp.pump(crawler_class())

        log.debug("in road %s", [item.url for item in road.collection])
        assert [item.url for item in road.collection] == [
            'http://python.org/1',
            'http://python.org/2',
            'http://python.org/1/trash',
        ]
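
`url_to_request_middl` appears in most examples but is never defined here. The name and usage suggest a middleware that wraps plain URL strings emitted by the crawler into request objects before download. A hedged sketch: the `process_request` hook name is an assumption, and `DummyRequest` (seen in Example #8) stands in as the request factory:

# Hypothetical reconstruction of the url_to_request_middl fixture: turn a
# bare URL string into a request object, pass anything else through as-is.
class UrlToRequestMiddleware(object):

    def __init__(self, request_factory):
        self.request_factory = request_factory

    def process_request(self, request, *args, **kwargs):
        if isinstance(request, str):
            return self.request_factory(request)
        return request

url_to_request_middl = UrlToRequestMiddleware(DummyRequest)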
Example #3: exception raised while a middleware processes a response
def test_exception_on_processing_response():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[
            RaiseOnResponseMiddleware(),
            collect_middleware,
        ]),
    )

    pomp.pump(Crawler())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
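
`CollectRequestResponseMiddleware` is another helper that only appears by name. Its shape follows from the assertions: it counts every request, response, and exception that flows past it. A sketch under that assumption (the hook names mirror the middleware interface these tests imply):

# Assumed shape of the CollectRequestResponseMiddleware fixture: record
# everything, change nothing.
class CollectRequestResponseMiddleware(object):

    def __init__(self):
        self.requests = []
        self.responses = []
        self.exceptions = []

    def process_request(self, request, *args, **kwargs):
        self.requests.append(request)
        return request

    def process_response(self, response, *args, **kwargs):
        self.responses.append(response)
        return response

    def process_exception(self, exception, *args, **kwargs):
        self.exceptions.append(exception)
        return exception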
Example #4: exception raised in the crawler's extract_items callback
def test_exception_on_processing_response_callback():
    class CrawlerWithException(Crawler):
        def extract_items(self, *args, **kwargs):
            raise Exception("some exception on extract items")

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(downloader=DummyDownloader(),
                middlewares=(collect_middleware, ))

    pomp.pump(CrawlerWithException())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
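
Note the counts: the collecting middleware sees the response before the crawler's extract_items callback runs, so even though the callback raises, one request and one response were already recorded alongside the single exception.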
Example #5: pipeline that raises while processing an item
    def test_pipeline_exception(self):
        class PipelineWithException(BasePipeline):
            def process(self, crawler, item):
                raise RuntimeError("some exception")

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                PipelineWithException(),
            ],
        )

        pomp.pump(Crawler())
Example #6: crawler whose callbacks return nothing
    def test_crawler_return_none(self):
        class CrawlerWithoutItems(BaseCrawler):
            ENTRY_REQUESTS = 'http://localhost/'

            def extract_items(self, *args, **kwargs):
                pass

            def next_requests(self, *args, **kwargs):
                pass

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
        )
        pomp.pump(CrawlerWithoutItems())
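
`DummyDownloader` never touches the network. Judging from how it is constructed in both API styles on this page, it looks something like the sketch below; the `get` hook and the `DummyResponse` wrapper are assumptions about pomp's downloader interface:

# Hypothetical DummyDownloader: answer every request with a canned response
# instead of fetching anything. DummyResponse is assumed to pair a request
# with a static body.
class DummyDownloader(object):

    def __init__(self, middlewares=None):
        self.middlewares = middlewares or []

    def get(self, requests):
        for request in requests:
            yield DummyResponse(request, 'canned body')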
Example #7: exception raised while a middleware processes a request
def test_exception_on_processing_request():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
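
The RaiseOn*Middleware helpers are the mirror image of the collector: each one blows up in a single hook, which is what the zero counts above demonstrate. A sketch of the request variant (hook signature assumed); RaiseOnResponseMiddleware and RaiseOnExceptionMiddleware would raise from process_response and process_exception the same way:

# Assumed fixture: fail every request before it reaches the downloader, so
# middlewares later in the chain never see a request or a response.
class RaiseOnRequestMiddleware(object):

    def process_request(self, request, *args, **kwargs):
        raise Exception('some exception on request')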
Example #8: exception raised when response processing finishes
def test_exception_on_processing_done():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RequestResponseMiddleware(request_factory=DummyRequest,
                                      bodyjson=False),
            collect_middleware,
        ),
    )

    pomp.pump(RaiseOnProcessingDoneCralwer())

    # one exception, raised by the crawler once processing of the response is done
    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
Example #9: exception raised inside the exception-handling middleware itself
def test_exception_on_processing_exception():

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            RaiseOnExceptionMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    # one exception on request middleware plus one on exception processing
    assert len(collect_middleware.exceptions) == 1 + 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
Example #10: crawler whose next_requests returns nothing
    def test_crawler_without_next_request_method_result(self):
        class CrawlerWithoutNextRequestMethod(Crawler):
            def next_requests(self, *args, **kwargs):
                pass

        road = RoadPipeline()
        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
        )
        pomp.pump(CrawlerWithoutNextRequestMethod())
        assert set([item.url for item in road.collection]) == set([
            'http://python.org/1',
            'http://python.org/1/trash',
        ])
Example #11: chaining transform, filter, and save pipelines
    def test_pipeline(self):

        class IncPipeline(BasePipeline):

            def process(self, crawler, item):
                item.value += 1
                return item

        class FilterPipeline(BasePipeline):

            def process(self, crawler, item):
                if 'trash' in item.url:
                    return None
                return item

        class SavePipeline(BasePipeline):

            def __init__(self, collection):
                self.collection = collection

            def process(self, crawler, item):
                self.collection.append(item)
                return item

        result = []

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                IncPipeline(),
                FilterPipeline(),
                SavePipeline(result),
            ],
        )

        pomp.pump(Crawler())

        assert_equal([(item.url, item.value) for item in result], [
            ('http://python.org/1', 2),
            ('http://python.org/2', 2),
        ])
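
Two pipeline rules fall out of this test: pipelines run in the order they are listed, and returning None from process() drops the item, which is why the 'trash' URL never reaches SavePipeline while the surviving items arrive with value already incremented by IncPipeline.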
Example #12: custom request queue (older API: queue= constructor argument)
    def test_queue_crawler(self):
        road = RoadPipeline()

        class SimpleQueue(BaseQueue):

            def __init__(self):
                self.requests = []

            def get_requests(self):
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        queue = SimpleQueue()

        pomp = Pomp(
            downloader=DummyDownloader(middlewares=[url_to_request_middl]),
            pipelines=[
                road,
            ],
            queue=queue,
        )

        class DummyWidthCrawler(Crawler):
            CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

        pomp.pump(DummyWidthCrawler())

        assert_equal(set([item.url for item in road.collection]), set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ]))
Example #13: custom request queue (newer API: overriding pomp.queue)
    def test_queue_crawler(self):
        road = RoadPipeline()

        class SimpleQueue(BaseQueue):
            def __init__(self):
                self.requests = []

            def get_requests(self, count=None):
                # the dummy downloader has no worker pool, so the
                # engine never asks for a batch of requests
                assert count is None
                try:
                    return self.requests.pop()
                except IndexError:
                    return  # empty queue

            def put_requests(self, request):
                self.requests.append(request)

        pomp = Pomp(
            downloader=DummyDownloader(),
            middlewares=[url_to_request_middl],
            pipelines=[
                road,
            ],
        )

        # override internal queue with own
        pomp.queue = SimpleQueue()

        pomp.pump(Crawler())

        assert set([item.url for item in road.collection]) == set([
            'http://python.org/1',
            'http://python.org/1/trash',
            'http://python.org/2',
        ])
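
Both queue tests implement the same BaseQueue contract: put_requests stores whatever the engine schedules, and get_requests hands back the next request, returning None once the queue is empty (presumably the signal that lets pump() finish). Note that pop() takes from the tail, so this SimpleQueue is LIFO; the assertions only compare sets, so ordering does not matter here. The two examples differ only in wiring: the older API accepts a queue= constructor argument, while the newer one replaces pomp.queue after construction.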