def test_queue_get_requests_with_count(self):

    class DummyDownloaderWithWorkers(DummyDownloader):
        def get_workers_count(self):
            return 5

    class SimpleQueue(BaseQueue):

        def __init__(self):
            self.requests = []

        def get_requests(self, count=None):
            # the downloader exposes five workers, so the engine
            # must request batches of five
            assert count == 5
            try:
                return self.requests.pop()
            except IndexError:
                return  # empty queue

        def put_requests(self, request):
            self.requests.append(request)

    pomp = Pomp(
        downloader=DummyDownloaderWithWorkers(),
        middlewares=(url_to_request_middl, ),
    )

    # override the internal queue with our own
    pomp.queue = SimpleQueue()

    pomp.pump(Crawler())
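# Note: `url_to_request_middl` is a helper defined elsewhere in the suite.
# A minimal sketch of what it plausibly looks like, assuming the middleware
# hook signature seen in test_exception_handling below; the class name and
# the DummyRequest wrapper are assumptions, not the suite's actual code:
class UrlToRequestMiddleware(BaseMiddleware):
    def process_request(self, request, crawler, downloader):
        # wrap bare URL strings in a request object; pass through otherwise
        if isinstance(request, str):
            return DummyRequest(request)
        return request

url_to_request_middl = UrlToRequestMiddleware()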
def test_urllib_downloader(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = UrllibDownloader()

    pomp = Pomp(
        downloader=downloader,
        middlewares=(
            req_resp_midlleware,
            UrllibAdapterMiddleware(),
            collect_middleware,
        ),
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    assert \
        set([r.url.replace(self.httpd.location, '')
             for r in collect_middleware.requests]) == \
        set(self.httpd.sitemap.keys())
def test_exception_on_crawler_worker(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentUrllibDownloader(
        pool_size=2,
    )

    pomp = Pomp(
        downloader=downloader,
        middlewares=(
            req_resp_midlleware,
            UrllibAdapterMiddleware(),
            collect_middleware,
        ),
        pipelines=[],
    )

    pomp.pump(ConcurrentCrawler(
        pool_size=2,
        worker_class=MockedCrawlerWorkerWithException,
    ))

    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.exceptions) == 1
def do_simple_test(self, queue=None):
    req_resp_middleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=TwistedHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = TwistedDownloader(
        reactor, middlewares=[collect_middleware])
    downloader.middlewares.insert(0, req_resp_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[PrintPipeline()],
        queue=queue,
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    done_defer = defer.Deferred()
    d = pomp.pump(Crawler())
    d.add_callback(done_defer.callback)

    def check(x):
        assert_set_equal(
            set([r.url.replace(self.httpd.location, '')
                 for r in collect_middleware.requests]),
            set(self.httpd.sitemap.keys())
        )

    done_defer.addCallback(check)
    return done_defer
def test_thread_pooled_downloader(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ThreadedDownloader(
        middlewares=[UrllibAdapterMiddleware(), collect_middleware])
    downloader.middlewares.insert(0, req_resp_midlleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    assert_set_equal(
        set([
            r.url.replace(self.httpd.location, '')
            for r in collect_middleware.requests
        ]),
        set(self.httpd.sitemap.keys()))
def test_concurrent_urllib_downloader(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentUrllibDownloader(
        middlewares=[collect_middleware],
    )
    downloader.middlewares.insert(0, req_resp_midlleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    assert_set_equal(
        set([r.url.replace(self.httpd.location, '')
             for r in collect_middleware.requests]),
        set(self.httpd.sitemap.keys())
    )
def test_exception_on_downloader_worker(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url='http://localhost',
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentDownloader(
        pool_size=5,
        worker_class=MockedDownloadWorkerWithException,
        worker_kwargs=None,
    )

    pomp = Pomp(
        downloader=downloader,
        middlewares=(req_resp_midlleware, collect_middleware, ),
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.exceptions) == 1
def test_concurrent_crawler(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentUrllibDownloader(
        pool_size=2,
    )

    pomp = Pomp(
        downloader=downloader,
        middlewares=(
            req_resp_midlleware,
            UrllibAdapterMiddleware(),
            collect_middleware,
        ),
        pipelines=[],
    )

    pomp.pump(ConcurrentCrawler(
        pool_size=2,
        worker_class=MockedCrawlerWorker,
    ))

    assert \
        set([r.url.replace(self.httpd.location, '')
             for r in collect_middleware.requests]) == \
        set(self.httpd.sitemap.keys())
def test_concurrent_downloader(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url='http://localhost',
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentDownloader(
        pool_size=5,
        worker_class=MockedDownloadWorker,
        worker_kwargs=None,
    )

    pomp = Pomp(
        downloader=downloader,
        middlewares=(req_resp_midlleware, collect_middleware, ),
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    pomp.pump(Crawler())

    assert \
        set([r.url.replace('http://localhost', '')
             for r in collect_middleware.requests]) == \
        set(MockedDownloadWorker.sitemap.keys())
def test_exception_handling(self):

    class CatchException(BaseMiddleware):

        def __init__(self):
            self.exceptions = []

        def process_exception(self, exception, crawler, downloader):
            self.exceptions.append(exception)
            return exception

    class MockCrawler(BaseCrawler):
        def next_requests(self, response):
            return

        def extract_items(self, response):
            return

    catch_exception_middleware = CatchException()
    pomp = Pomp(
        downloader=UrllibDownloader(),
        middlewares=(
            UrllibAdapterMiddleware(),
            catch_exception_middleware,
        ),
        pipelines=[],
    )

    MockCrawler.ENTRY_REQUESTS = [
        'https://123.456.789.01:8081/fake_url',
        '%s/root' % self.httpd.location,
    ]

    pomp.pump(MockCrawler())

    assert len(catch_exception_middleware.exceptions) == 1
def test_exceptions(self):
    req_resp_middleware = RequestResponseMiddleware(
        prefix_url='invalid url',
        request_factory=TwistedHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = TwistedDownloader(
        reactor, middlewares=[collect_middleware])
    downloader.middlewares.insert(0, req_resp_middleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[PrintPipeline()],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/root'

    done_defer = defer.Deferred()
    d = pomp.pump(Crawler())
    d.add_callback(done_defer.callback)

    def check(x):
        assert len(collect_middleware.exceptions) == 1
        assert isinstance(
            collect_middleware.exceptions[0], BaseDownloadException)

    done_defer.addCallback(check)
    return done_defer
def test_queue_crawler(self):
    road = RoadPipeline()

    class SimpleQueue(BaseQueue):

        def __init__(self):
            self.requests = []

        def get_requests(self):
            try:
                return self.requests.pop()
            except IndexError:
                return  # empty queue

        def put_requests(self, request):
            self.requests.append(request)

    queue = SimpleQueue()

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            road,
        ],
        queue=queue,
    )

    pomp.pump(Crawler())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]))
def test_exception_on_processing_response():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[
            RaiseOnResponseMiddleware(),
            collect_middleware,
        ]),
    )

    pomp.pump(Crawler())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
def crawler_worker(crawler_class, source_queue, stop_event):
    pid = os.getpid()
    log.debug('Start crawler worker: %s', pid)
    pomp = Pomp(
        downloader=UrllibDownloader(timeout=3),
        pipelines=[],
        queue=WrappedQueue(source_queue, stop_event),
    )
    pomp.pump(crawler_class())
    log.debug('Stop crawler worker: %s', pid)
    return True
def crawler_worker(crawler_class, source_queue, stop_event):
    pid = os.getpid()
    log.debug('Start crawler worker: %s', pid)
    pomp = Pomp(
        downloader=dnl(timeout=3),
        pipelines=[],
        queue=WrappedQueue(source_queue, stop_event),
    )
    pomp.pump(crawler_class())
    log.debug('Stop crawler worker: %s', pid)
    return True
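# `WrappedQueue` is defined elsewhere in this example. A minimal sketch of
# what it plausibly does, assuming the BaseQueue interface exercised by the
# queue tests above (get_requests/put_requests); treat this as an
# illustration, not the example's actual implementation:
class WrappedQueue(BaseQueue):

    def __init__(self, source_queue, stop_event):
        self.source_queue = source_queue
        self.stop_event = stop_event

    def get_requests(self, count=None):
        # report an empty queue once workers are told to stop, which
        # lets `pomp.pump` finish and the worker process exit
        if self.stop_event.is_set():
            return
        return self.source_queue.get()

    def put_requests(self, request):
        self.source_queue.put(request)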
def test_exception_on_processing_response_callback():

    class CrawlerWithException(Crawler):
        def extract_items(self, *args, **kwargs):
            raise Exception("some exception on extract items")

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[collect_middleware]))

    pomp.pump(CrawlerWithException())

    assert_equal(len(collect_middleware.exceptions), 1)
    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.responses), 1)
def test_pipeline_exception(self):

    class PipelineWithException(BasePipeline):
        def process(self, crawler, item):
            raise RuntimeError("some exception")

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            PipelineWithException(),
        ],
    )

    pomp.pump(Crawler())
def test_exception_on_processing_response_callback():

    class CrawlerWithException(Crawler):
        def extract_items(self, *args, **kwargs):
            raise Exception("some exception on extract items")

    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(collect_middleware, ),
    )

    pomp.pump(CrawlerWithException())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
def test_crawler_return_none(self):

    class CrawlerWithoutItems(BaseCrawler):
        ENTRY_REQUESTS = 'http://localhost/'

        def extract_items(self, *args, **kwargs):
            pass

        def next_requests(self, *args, **kwargs):
            pass

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
    )
    pomp.pump(CrawlerWithoutItems())
def test_crawler_return_none(self):

    class CrawlerWithoutItems(BaseCrawler):
        ENTRY_REQUESTS = 'http://localhost/'

        def extract_items(self, *args, **kwargs):
            pass

        def next_requests(self, *args, **kwargs):
            pass

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
    )
    pomp.pump(CrawlerWithoutItems())
def test_exception_on_processing_request():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
def test_crawler_without_next_request_method_result(self):

    class CrawlerWithoutNextRequestMethod(Crawler):
        def next_requests(self, *args, **kwargs):
            pass

    road = RoadPipeline()

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            road,
        ],
    )

    pomp.pump(CrawlerWithoutNextRequestMethod())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/1/trash',
    ]))
def test_exception_on_processing_exception():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RaiseOnRequestMiddleware(),
            RaiseOnExceptionMiddleware(),
            collect_middleware,
        ),
    )

    pomp.pump(Crawler())

    # one exception on request middleware plus one on exception processing
    assert len(collect_middleware.exceptions) == 1 + 1
    assert len(collect_middleware.requests) == 0
    assert len(collect_middleware.responses) == 0
def test_exception_on_processing_done():
    collect_middleware = CollectRequestResponseMiddleware()
    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=(
            RequestResponseMiddleware(
                request_factory=DummyRequest,
                bodyjson=False,
            ),
            collect_middleware,
        ),
    )

    pomp.pump(RaiseOnProcessingDoneCralwer())

    # a single exception raised by the crawler's processing-done hook
    assert len(collect_middleware.exceptions) == 1
    assert len(collect_middleware.requests) == 1
    assert len(collect_middleware.responses) == 1
def test_crawler_without_next_request_method_result(self):

    class CrawlerWithoutNextRequestMethod(Crawler):
        def next_requests(self, *args, **kwargs):
            pass

    road = RoadPipeline()

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
    )

    pomp.pump(CrawlerWithoutNextRequestMethod())

    assert set([item.url for item in road.collection]) == set([
        'http://python.org/1',
        'http://python.org/1/trash',
    ])
def test_concurrent_crawler(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=lambda x: x,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentUrllibDownloader(
        pool_size=2,
        middlewares=[UrllibAdapterMiddleware(), collect_middleware],
    )
    downloader.middlewares.insert(0, req_resp_midlleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    pomp.pump(ConcurrentCrawler(
        pool_size=2,
        worker_class=MockedCrawlerWorker,
    ))

    assert_set_equal(
        set([r.url.replace(self.httpd.location, "")
             for r in collect_middleware.requests]),
        set(self.httpd.sitemap.keys()),
    )
def test_pipeline(self):

    class IncPipeline(BasePipeline):
        def process(self, crawler, item):
            item.value += 1
            return item

    class FilterPipeline(BasePipeline):
        def process(self, crawler, item):
            if 'trash' in item.url:
                return None
            return item

    class SavePipeline(BasePipeline):

        def __init__(self, collection):
            self.collection = collection

        def process(self, crawler, item):
            self.collection.append(item)
            return item

    result = []

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            IncPipeline(),
            FilterPipeline(),
            SavePipeline(result),
        ],
    )

    pomp.pump(Crawler())

    assert [(item.url, item.value) for item in result] == [
        ('http://python.org/1', 2),
        ('http://python.org/2', 2),
    ]
def test_pipeline(self):

    class IncPipeline(BasePipeline):
        def process(self, crawler, item):
            item.value += 1
            return item

    class FilterPipeline(BasePipeline):
        def process(self, crawler, item):
            if 'trash' in item.url:
                return None
            return item

    class SavePipeline(BasePipeline):

        def __init__(self, collection):
            self.collection = collection

        def process(self, crawler, item):
            self.collection.append(item)
            return item

    result = []

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            IncPipeline(),
            FilterPipeline(),
            SavePipeline(result),
        ],
    )

    pomp.pump(Crawler())

    assert_equal([(item.url, item.value) for item in result], [
        ('http://python.org/1', 2),
        ('http://python.org/2', 2),
    ])
def test_crawler_dive_methods(self):
    road = RoadPipeline()

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            road,
        ],
    )

    # Depth first method
    pomp.pump(Crawler())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]))

    # Width first method
    road.reset()

    class DummyWidthCrawler(Crawler):
        CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

    pomp.pump(DummyWidthCrawler())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/2',
        'http://python.org/1/trash',
    ]))
def test_timeout(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url=self.httpd.location,
        request_factory=TwistedHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = TwistedDownloader(
        reactor, timeout=0.5, middlewares=[collect_middleware])
    downloader.middlewares.insert(0, req_resp_midlleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[PrintPipeline()],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = '/sleep'

    done_defer = defer.Deferred()
    d = pomp.pump(Crawler())
    d.add_callback(done_defer.callback)

    def check(x):
        assert len(collect_middleware.exceptions) == 1
        e = collect_middleware.exceptions[0]
        assert isinstance(e, BaseDownloadException)
        # twisted's _newclient can raise ResponseNeverReceived,
        # so the next assert holds only for the old client
        # assert isinstance(e.exception, defer.CancelledError)

    done_defer.addCallback(check)
    return done_defer
def test_concurrent_downloader(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url="http://localhost",
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentDownloader(
        pool_size=5,
        worker_class=MockedDownloadWorker,
        worker_kwargs=None,
        middlewares=(collect_middleware,),
    )
    downloader.middlewares.insert(0, req_resp_midlleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = "/root"

    pomp.pump(Crawler())

    assert_set_equal(
        set([r.url.replace("http://localhost", "")
             for r in collect_middleware.requests]),
        set(MockedDownloadWorker.sitemap.keys()),
    )
def test_queue_crawler(self):
    road = RoadPipeline()

    class SimpleQueue(BaseQueue):

        def __init__(self):
            self.requests = []

        def get_requests(self):
            try:
                return self.requests.pop()
            except IndexError:
                return  # empty queue

        def put_requests(self, request):
            self.requests.append(request)

    queue = SimpleQueue()

    pomp = Pomp(
        downloader=DummyDownloader(middlewares=[url_to_request_middl]),
        pipelines=[
            road,
        ],
        queue=queue,
    )

    class DummyWidthCrawler(Crawler):
        CRAWL_METHOD = CRAWL_WIDTH_FIRST_METHOD

    pomp.pump(DummyWidthCrawler())

    assert_equal(set([item.url for item in road.collection]), set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]))
def test_queue_crawler(self):
    road = RoadPipeline()

    class SimpleQueue(BaseQueue):

        def __init__(self):
            self.requests = []

        def get_requests(self, count=None):
            # the downloader has no worker pool, so the engine
            # passes no batch size
            assert count is None
            try:
                return self.requests.pop()
            except IndexError:
                return  # empty queue

        def put_requests(self, request):
            self.requests.append(request)

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
    )

    # override the internal queue with our own
    pomp.queue = SimpleQueue()

    pomp.pump(Crawler())

    assert set([item.url for item in road.collection]) == set([
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ])
def test_exception_on_downloader_worker(self):
    req_resp_midlleware = RequestResponseMiddleware(
        prefix_url="http://localhost",
        request_factory=UrllibHttpRequest,
    )
    collect_middleware = CollectRequestResponseMiddleware()
    downloader = ConcurrentDownloader(
        pool_size=5,
        worker_class=MockedDownloadWorkerWithException,
        worker_kwargs=None,
        middlewares=(collect_middleware,),
    )
    downloader.middlewares.insert(0, req_resp_midlleware)

    pomp = Pomp(
        downloader=downloader,
        pipelines=[],
    )

    class Crawler(DummyCrawler):
        ENTRY_REQUESTS = "/root"

    pomp.pump(Crawler())

    assert_equal(len(collect_middleware.requests), 1)
    assert_equal(len(collect_middleware.exceptions), 1)
def test_dive_methods(self, crawler_class=None):
    crawler_class = crawler_class or Crawler
    road = RoadPipeline()

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
        breadth_first=False,
    )

    # Depth first method
    pomp.pump(crawler_class())

    log.debug("in road %s", [item.url for item in road.collection])
    assert [item.url for item in road.collection] == [
        'http://python.org/1',
        'http://python.org/1/trash',
        'http://python.org/2',
    ]

    # Breadth first method
    road.reset()

    pomp = Pomp(
        downloader=DummyDownloader(),
        middlewares=[url_to_request_middl],
        pipelines=[
            road,
        ],
        breadth_first=True,
    )

    pomp.pump(crawler_class())

    log.debug("in road %s", [item.url for item in road.collection])
    assert [item.url for item in road.collection] == [
        'http://python.org/1',
        'http://python.org/2',
        'http://python.org/1/trash',
    ]
# so we need to protect the main code with 'if __name__ == '__main__':'
# to avoid creating subprocesses recursively:
if __name__ == '__main__':
    pool_size = 2

    start_time = datetime.now()

    # start phantomjs nodes
    ph_drivers = deque([
        webdriver.PhantomJS() for _ in range(pool_size)
    ])

    # Grab URLs of all cities
    city_pomp = Pomp(
        downloader=PhantomDownloader(
            pool_size=2,
            worker_class=PhantomWorker,
            phantom_drivers=ph_drivers,
        ),
        pipelines=[PhobiaCityPipeline()],
    )
    city_pomp.pump(PhobiaCityCrawler())

    engine = db_connect()
    Session = sessionmaker(bind=engine)
    se = Session()
    all_cities = [
        (city.id, city.url) for city in se.query(PhobiaCity).all()
    ]
    se.close()

    statistics = StatisticMiddleware()
    for city_id, city_url in all_cities:
        quest_pomp = Pomp(
        # extract next urls
        for link in response.tree.xpath(self.NEXT_URLS_XPATH):
            yield UrllibHttpRequest(
                urljoin(self.BASE_URL, link.get('href'))
            )

if __name__ == '__main__':
    from pomp.core.engine import Pomp
    from pomp.contrib.concurrenttools import ConcurrentUrllibDownloader

    statistics = StatisticMiddleware()
    middlewares = (
        statistics,
        LXMLDownloaderMiddleware(encoding='utf-8'),
    )

    filepath = os.path.join(tempfile.gettempdir(), 'quotes.csv')

    pomp = Pomp(
        downloader=ConcurrentUrllibDownloader(
            pool_size=3,
        ),
        middlewares=middlewares,
        pipelines=(
            PrintPipeline(),
            CsvPipeline(filepath, delimiter=';', quotechar='"'),
        ),
    )

    pomp.pump(QuotesSpider())
    print("Statistics:\n %s" % statistics)
        if self._next_requests:
            yield self._next_requests.pop()

if __name__ == '__main__':
    from pomp.core.engine import Pomp
    try:
        from pomp.contrib.concurrenttools import \
            ConcurrentUrllibDownloader as dnl
    except ImportError:
        from pomp.contrib.urllibtools import ThreadedDownloader as dnl

    statistics = StatisticMiddleware()
    middlewares = (
        statistics,
        LXMLDownloaderMiddleware(encoding='utf-8'),
    )

    filepath = os.path.join(tempfile.gettempdir(), 'dmoz.csv')

    pomp = Pomp(
        downloader=dnl(middlewares=middlewares, timeout=10),
        pipelines=[
            PrintPipeline(),
            CsvPipeline(filepath, delimiter=';', quotechar='"'),
        ],
    )

    pomp.pump(DmozSpider())
    print("Statistics:\n", statistics)
        for request in iterator(requests):
            response = self._fetch(request)
            responses.append(response)
        return responses

    def _fetch(self, request):
        try:
            res = requestslib.get(request.url)
            return ReqResponse(request, res)
        except Exception as e:
            print('Exception on %s: %s' % (request, e))
            return BaseDownloadException(request, exception=e)

if __name__ == '__main__':
    from pomp.core.base import BaseCrawler
    from pomp.core.engine import Pomp

    class Crawler(BaseCrawler):
        ENTRY_REQUESTS = ReqRequest('http://python.org/news/')

        def extract_items(self, response):
            print(response.body)

        def next_requests(self, response):
            return None  # one page crawler

    pomp = Pomp(downloader=RequestsDownloader())
    pomp.pump(Crawler())
        # follow to the first two persons from `also likes`
        for href in response.tree.xpath(self.ALSO_LIKES_LINKS_XPATH)[:2]:
            # do not repeat requests
            url = urljoin(self.BASE_URL, href)
            if url not in self._parsed_also_likes:
                yield PhantomRequest(url, level=level + 1)

if __name__ == '__main__':
    from pomp.core.engine import Pomp
    from e02_quotes import (
        PrintPipeline,
        LXMLDownloaderMiddleware,
        StatisticMiddleware,
    )

    statistics = StatisticMiddleware()
    pomp = Pomp(
        downloader=PhantomDownloader(
            pool_size=2,
            worker_class=PhantomDownloadWorker,
        ),
        middlewares=(
            statistics,
            LXMLDownloaderMiddleware(),
        ),
        pipelines=[PrintPipeline()],
    )

    pomp.pump(TwitterSpider())
    print("Statistics:\n %s" % statistics)
import re

from pomp.core.base import BaseCrawler
from pomp.contrib.urllibtools import UrllibHttpRequest

python_sentence_re = re.compile(r'[\w\s]{0,}python[\s\w]{0,}', re.I | re.M)

class MyCrawler(BaseCrawler):
    """Extract all sentences containing the word `python`"""
    ENTRY_REQUESTS = UrllibHttpRequest('http://python.org/news')  # entry point

    def extract_items(self, response):
        for i in python_sentence_re.findall(response.body.decode('utf-8')):
            sentence = i.strip()
            print("Sentence: {}".format(sentence))
            yield sentence

if __name__ == '__main__':
    from pomp.core.engine import Pomp
    from pomp.contrib.urllibtools import UrllibDownloader

    pomp = Pomp(
        downloader=UrllibDownloader(),
    )

    pomp.pump(MyCrawler())