def test_crawler_dive_methods(self):
    road = RoadPipeline()

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            road,
        ],
    )

    # Depth first method
    pomp.pump(Crawler())

    # note: set comparison checks the collected URLs, not the visit order
    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]))

    # Width first method
    road.reset()

    class DummyWidthCrawler(Crawler):
        CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

    pomp.pump(DummyWidthCrawler())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/2',
        'http://python.org/1/trash',
    ]))
def test_dive_methods(self, crawler_class=None):
    crawler_class = crawler_class or Crawler
    road = RoadPipeline()

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
        breadth_first=False,
    )

    # Depth first method
    pomp.pump(crawler_class())

    log.debug("in road %s", [item.url for item in road.collection])
    assert [item.url for item in road.collection] == [
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]

    # Breadth first method
    road.reset()

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
        breadth_first=True,
    )
    pomp.pump(crawler_class())

    log.debug("in road %s", [item.url for item in road.collection])
    assert [item.url for item in road.collection] == [
        'http://python.org/1',
        'http://python.org/2',
        'http://python.org/1/trash',
    ]
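# The Crawler, DummyDownloader, and RoadPipeline fixtures used above are
# defined elsewhere in this suite.  To make the two expected orderings easy
# to verify, here is a self-contained sketch (no pomp imports; the link
# graph and helper below are hypothetical, inferred from the assertions):
# the entry page links to /1 and /2, and /1 links on to /1/trash.

SKETCH_LINKS = {
    'http://python.org/': ['http://python.org/1', 'http://python.org/2'],
    'http://python.org/1': ['http://python.org/1/trash'],
}


def _sketch_walk(entry, breadth_first):
    # depth-first pops from the end of the frontier (LIFO),
    # breadth-first from the front (FIFO)
    frontier, seen = [entry], []
    while frontier:
        url = frontier.pop(0) if breadth_first else frontier.pop()
        seen.append(url)
        children = SKETCH_LINKS.get(url, [])
        # reversed() keeps left-to-right child order under LIFO pops
        frontier.extend(children if breadth_first else reversed(children))
    return seen


# the entry page itself yields no item in the tests, so compare from index 1
assert _sketch_walk('http://python.org/', breadth_first=False)[1:] == [
    'http://python.org/1', 'http://python.org/1/trash', 'http://python.org/2',
]
assert _sketch_walk('http://python.org/', breadth_first=True)[1:] == [
    'http://python.org/1', 'http://python.org/2', 'http://python.org/1/trash',
]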
def test_exception_on_processing_response():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[
            RaiseOnResponseMiddleware(),
            collect_middleware,
        ]),
    )
    pomp.pump(Crawler())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
def test_exception_on_processing_response_callback():
    class CrawlerWithException(Crawler):
        def extract_items(self, *args, **kwargs):
            raise Exception("some exception on extract items")

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(collect_middleware, ),
    )
    pomp.pump(CrawlerWithException())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
def test_pipeline_exception(self):
    class PipelineWithException(BasePipeline):
        def process(self, crawler, item):
            raise RuntimeError("some exception")

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            PipelineWithException(),
        ],
    )
    pomp.pump(Crawler())
def test_crawler_return_none(self):
    class CrawlerWithoutItems(BaseCrawler):
        ENTRY_REQUESTS = 'http://localhost/'

        def extract_items(self, *args, **kwargs):
            pass

        def next_requests(self, *args, **kwargs):
            pass

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
    )
    pomp.pump(CrawlerWithoutItems())
def test_exception_on_processing_request():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            collect_middleware,
        ),
    )
    pomp.pump(Crawler())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
def test_exception_on_processing_done():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RequestResponseMiddleware(
                request_factory=DummyRequest,
                bodyjson=False,
            ),
            collect_middleware,
        ),
    )
    pomp.pump(RaiseOnProcessingDoneCralwer())

    # the request/response pair completes normally; the single exception
    # comes from the crawler's processing-done hook
    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
def test_exception_on_processing_exception():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            RaiseOnExceptionMiddleware(),
            collect_middleware,
        ),
    )
    pomp.pump(Crawler())

    # one exception on request middleware plus one on exception processing
    assert len(collect_middleware.exceptions) == 1 + 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
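# The RaiseOn*Middleware and CollectRequestResponseMiddleware mocks used by
# the exception tests above are defined elsewhere in this suite.  A minimal
# sketch of the collecting mock, assuming the process_request /
# process_response / process_exception hook names that pomp middlewares use
# (the class name and permissive signatures here are hypothetical):


class SketchCollectMiddleware(object):
    """Records everything that flows through each middleware hook."""

    def __init__(self):
        self.requests = []
        self.responses = []
        self.exceptions = []

    def process_request(self, request, *args, **kwargs):
        self.requests.append(request)
        return request

    def process_response(self, response, *args, **kwargs):
        self.responses.append(response)
        return response

    def process_exception(self, exception, *args, **kwargs):
        self.exceptions.append(exception)
        return exception


# In the tests above the raising middleware sits before the collector, so an
# exception on process_request short-circuits the chain and the collector
# never sees the request -- hence the zero request/response counts.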
def test_crawler_without_next_request_method_result(self):
    class CrawlerWithoutNextRequestMethod(Crawler):
        def next_requests(self, *args, **kwargs):
            pass

    road = RoadPipeline()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
    )
    pomp.pump(CrawlerWithoutNextRequestMethod())

    assert set([item.url for item in road.collection]) == set([
        'http://python.org/1',
        'http://python.org/1/trash',
    ])
def test_pipeline(self):
    class IncPipeline(BasePipeline):
        def process(self, crawler, item):
            item.value += 1
            return item

    class FilterPipeline(BasePipeline):
        def process(self, crawler, item):
            # returning None drops the item from the rest of the chain
            if 'trash' in item.url:
                return None
            return item

    class SavePipeline(BasePipeline):
        def __init__(self, collection):
            self.collection = collection

        def process(self, crawler, item):
            self.collection.append(item)
            return item

    result = []
    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            IncPipeline(),
            FilterPipeline(),
            SavePipeline(result),
        ],
    )
    pomp.pump(Crawler())

    assert_equal([(item.url, item.value) for item in result], [
        ('http://python.org/1', 2),
        ('http://python.org/2', 2),
    ])
def test_queue_crawler(self):
    road = RoadPipeline()

    class SimpleQueue(BaseQueue):
        def __init__(self):
            self.requests = []

        def get_requests(self):
            try:
                return self.requests.pop()
            except IndexError:
                return  # empty queue

        def put_requests(self, request):
            self.requests.append(request)

    queue = SimpleQueue()
    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            road,
        ],
        queue=queue,
    )

    class DummyWidthCrawler(Crawler):
        CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

    pomp.pump(DummyWidthCrawler())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]))
def test_queue_crawler_override(self):
    road = RoadPipeline()

    class SimpleQueue(BaseQueue):
        def __init__(self):
            self.requests = []

        def get_requests(self, count=None):
            # because the downloader runs without workers
            assert count is None
            try:
                return self.requests.pop()
            except IndexError:
                return  # empty queue

        def put_requests(self, request):
            self.requests.append(request)

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
    )

    # override the internal queue with our own
    pomp.queue = SimpleQueue()

    pomp.pump(Crawler())

    assert set([item.url for item in road.collection]) == set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ])