def test_that_stopping_worker_via_mgmt_works(self):
    """A ZmqWorker must echo a DataMessage and ack a mgmt QUIT.

    Sends one DataMessage through the worker and registers callbacks that
    verify (a) the echoed data message equals the one sent and (b) the
    management channel delivers a ``ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK``.
    """
    worker = ZmqWorker(self._worker_sockets['worker_pull'],
                       self._worker_sockets['worker_pub'],
                       self._mgmt,
                       self.echo_processing,
                       StreamHandler(sys.stdout),
                       logging.DEBUG,
                       self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost")
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_correct_data_answer(msg2):
        self.assertEqual(msg, DataMessage(msg2))

    self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer)

    def assert_correct_mgmt_answer(msg3):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg3.data)

    # FIX: the mgmt socket previously registered assert_correct_data_answer,
    # so the QUIT ack was never actually checked. Register the mgmt callback.
    self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())
    self._io_loop.start()
def test_that_async_worker_works(self):
    """The AsyncZmqWorker must echo a DataMessage back unchanged."""
    worker = AsyncZmqWorker(self._worker_sockets["worker_pull"],
                            self._worker_sockets["worker_pub"],
                            self._mgmt,
                            self.echo_processing,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    expected = DataMessage()
    expected.identity = "me"
    expected.curi = CrawlUri(url="http://localhost")

    def check_data(raw):
        # The echoed multipart message must deserialize to the sent one.
        self.assertEqual(expected, DataMessage(raw))

    self._worker_sockets["master_sub"].on_recv(check_data)

    def check_mgmt(ack):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, ack.data)

    self._mgmt_sockets["master_sub"].on_recv(check_mgmt)

    self._worker_sockets["master_push"].send_multipart(expected.serialize())
    self._io_loop.start()
    worker._in_stream.flush()
def test_fetching_works(self):
    """Fetch /robots.txt through an AsyncZmqWorker and compare the body.

    The fetched ``content_body`` must match the on-disk fixture file
    ``static/robots.txt``; afterwards a mgmt QUIT stops the worker.
    """
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
    )
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        msg = DataMessage(raw_msg)
        # FIX: use a context manager so the fixture file handle is closed
        # (the original `open(...).read()` leaked the file object).
        with open(os.path.join(os.path.dirname(__file__),
                               "static/robots.txt")) as robots_file:
            robots = robots_file.read()
        self.assertEqual(robots, msg.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop)
    self._io_loop.start()
def _send_next_uri(self):
    """
    See if there are more uris to process and send them to the workers if
    there are any.

    At this point there is a very small heuristic in order to maximize the
    throughput: try to keep the `self._out_stream._send_queue` full.
    """
    if not self._running:
        self._logger.error("Master is not running, not sending more uris")
        return

    num_workers = len(self._available_workers)
    # FIX: dropped the redundant `self._running` re-check; the guard above
    # already returned when the master is not running.
    if num_workers > 0:
        # Keep roughly 4 queued messages per available worker.
        # NOTE(review): this peeks at the private `_send_queue` of the zmq
        # stream — confirm no public queue-depth API exists.
        while self._out_stream._send_queue.qsize() < num_workers * 4:
            try:
                next_curi = self._frontier.get_next()
            except Empty:
                # well, frontier has nothing to process right now
                self._logger.debug("zmqmaster::Nothing to crawl right now")
                break
            self._logger.info("zmqmaster::Begin crawling next URL (%s)" %
                              next_curi.url)
            msg = DataMessage(identity=self._identity, curi=next_curi)
            self._out_stream.send_multipart(msg.serialize())
def test_that_stopping_worker_via_mgmt_works(self):
    """A ZmqWorker must echo a DataMessage and ack a mgmt QUIT.

    Verifies both the echoed data message and the management-channel
    ``ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK``.
    """
    worker = ZmqWorker(self._worker_sockets['worker_pull'],
                       self._worker_sockets['worker_pub'],
                       self._mgmt,
                       self.echo_processing,
                       StreamHandler(sys.stdout),
                       logging.DEBUG,
                       self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost")
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_correct_data_answer(msg2):
        self.assertEqual(msg, DataMessage(msg2))

    self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer)

    def assert_correct_mgmt_answer(msg3):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg3.data)

    # FIX: the mgmt socket previously registered the *data* callback, so the
    # QUIT ack was never checked. Register the mgmt callback instead.
    self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())
    self._io_loop.start()
def test_fetching_works(self):
    """Fetch /robots.txt through an AsyncZmqWorker and compare the body."""
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
    )
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        msg = DataMessage(raw_msg)
        # FIX: close the fixture file via a context manager instead of the
        # leaking `open(...).read()` one-liner.
        with open(os.path.join(os.path.dirname(__file__),
                               "static/robots.txt")) as robots_file:
            robots = robots_file.read()
        self.assertEqual(robots, msg.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)
    self._io_loop.start()
def test_that_construction_works(self):
    """Keyword construction of DataMessage sets exactly the given field."""
    with_identity = DataMessage(identity="me")
    self.assertEqual("me", with_identity.identity)
    self.assertEqual(None, with_identity.curi)

    with_curi = DataMessage(curi="bla")
    self.assertEqual("bla", with_curi.curi)
    self.assertEqual(None, with_curi.identity)
def test_that_data_messages_work(self):
    """Round-trip a CrawlUri through DataMessage serialization."""
    sender_id = "me myself and i"
    uri = CrawlUri(url="http://localhost")
    wire_form = serialize_crawl_uri(uri)

    message = DataMessage([sender_id, wire_form])
    self.assertEqual(sender_id, message.identity)
    self.assertEqual(uri, message.curi)
    self.assertEqual([sender_id, wire_form], message.serialize())
    # serialize -> parse must reproduce an equal message
    self.assertEqual(message, DataMessage(message.serialize()))
def test_that_data_messages_work(self):
    """DataMessage must survive a full serialize/deserialize round trip."""
    who = "me myself and i"
    crawl_uri = CrawlUri(url="http://localhost")
    packed = serialize_crawl_uri(crawl_uri)

    parsed = DataMessage([who, packed])
    self.assertEqual(who, parsed.identity)
    self.assertEqual(crawl_uri, parsed.curi)
    self.assertEqual([who, packed], parsed.serialize())
    self.assertEqual(parsed, DataMessage(parsed.serialize()))
def assert_expected_result_and_stop(raw_msg):
    """Check the extraction-finished flag, then ask the worker to quit."""
    answer = DataMessage(raw_msg)
    self.assertEqual(
        CURI_OPTIONAL_TRUE,
        answer.curi.optional_vars[CURI_EXTRACTION_FINISHED])
    death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                        data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
    self._mgmt_sockets['master_pub'].send_multipart(death.serialize())
def _receive_processed_uri(self, raw_msg):
    """
    Receive and reschedule an URI that has been processed. Additionally add
    all extracted URLs to the frontier.

    Dispatches on the HTTP status class of the crawled URI; any exception
    raised by the frontier is logged and recorded as an uncaught-exception
    server error before scheduling the next URI.
    """
    msg = DataMessage(raw_msg)
    self._logger.info("zmqmaster::Crawling URL (%s) finished" %
                      msg.curi.url)
    try:
        if 200 <= msg.curi.status_code < 300:
            # we have some kind of success code! yay
            self._frontier.process_successful_crawl(msg.curi)
        elif 300 <= msg.curi.status_code < 400:
            # Some kind of redirect code. This will only happen if the
            # number of redirects exceeds settings.MAX_REDIRECTS
            self._frontier.process_redirect(msg.curi)
        elif 400 <= msg.curi.status_code < 500:
            # some kind of error where the resource could not be found.
            self._frontier.process_not_found(msg.curi)
        elif 500 <= msg.curi.status_code < 600:
            # some kind of server error
            self._frontier.process_server_error(msg.curi)
        # NOTE(review): status codes outside 200-599 fall through silently —
        # confirm that is intended.
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        self._logger.critical("zmqmaster::Uncaught exception in the sink")
        self._logger.critical("zmqmaster::%s" % (traceback.format_exc(),))
        msg.curi.status_code = CURI_EUNCAUGHT_EXCEPTION
        self._frontier.process_server_error(msg.curi)

    self._send_next_uri()
def assert_expected_result_and_stop(raw_msg):
    """Expect a 304 Not Modified with an empty body, then stop the worker."""
    reply = DataMessage(raw_msg)
    self.assertEqual(304, reply.curi.status_code)
    self.assertEqual("", reply.curi.content_body)
    death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                        data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
    self._mgmt_sockets['master_pub'].send_multipart(death.serialize())
def assert_expected_result_and_stop(raw_msg):
    """Compare the fetched body with the robots.txt fixture, then stop."""
    msg = DataMessage(raw_msg)
    # FIX: close the fixture file via a context manager instead of the
    # leaking `open(...).read()` one-liner.
    with open(os.path.join(os.path.dirname(__file__),
                           "static/robots.txt")) as robots_file:
        robots = robots_file.read()
    self.assertEqual(robots, msg.curi.content_body)
    death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                        data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
    self._mgmt_sockets['master_pub'].send_multipart(death.serialize())
def test_that_creating_extractor_works(self):
    """An extractor built by workerprocess must mark extraction finished."""
    self._settings.SPYDER_EXTRACTOR_PIPELINE = [
        'spyder.processor.limiter.DefaultLimiter',
    ]
    extractor = workerprocess.create_worker_extractor(
        self._settings, self._mgmt, self._ctx,
        StreamHandler(sys.stdout), self._io_loop)
    extractor.start()

    to_process = DataMessage()
    to_process.identity = "me"
    to_process.curi = CrawlUri(
        url="http://localhost:80/robots.txt",
        effective_url="http://127.0.0.1:%s/robots.txt",
        optional_vars=dict(),
    )

    def check_extracted_and_stop(raw):
        processed = DataMessage(raw)
        self.assertEqual(
            CURI_OPTIONAL_TRUE,
            processed.curi.optional_vars[CURI_EXTRACTION_FINISHED])
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_extracted_and_stop)

    def check_quit_ack(raw):
        # NOTE(review): this compares the raw multipart message with the
        # ack constant — confirm the wire format makes them comparable.
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, raw)

    self._mgmt_sockets['master_sub'].on_recv(check_quit_ack)

    self._worker_sockets['master_push'].send_multipart(
        to_process.serialize())
    self._io_loop.start()

    # tear down the extractor's streams and sockets
    extractor._out_stream.close()
    extractor._outsocket.close()
    extractor._in_stream.close()
    extractor._insocket.close()
def _receive(self, msg):
    """
    We have a message!

    `msg` is a serialized version of a `DataMessage`. Runs the configured
    processing callable on the contained CrawlUri and always sends the
    (possibly unmodified) message back to the master.
    """
    message = DataMessage(msg)

    try:
        # this is the real work we want to do
        curi = self._processing(message.curi)
        message.curi = curi
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception. Failures are only
        # logged so the message still flows back to the master.
        self._logger.critical(
            "worker::Uncaught exception executing the worker for URL %s!" %
            (message.curi.url,))
        self._logger.critical("worker::%s" % (traceback.format_exc(),))

    # finished, now send the result back to the master
    self._out_stream.send_multipart(message.serialize())
def test_fetching_last_modified_works(self):
    """A matching Last-Modified header must yield a 304 with empty body."""
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    # use the fixture file's own mtime as the conditional-request header
    robots_mtime = os.stat(os.path.join(self._path, "robots.txt")).st_mtime
    mtime = serialize_date_time(datetime.fromtimestamp(robots_mtime))

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": mtime})

    def check_not_modified_and_stop(raw):
        reply = DataMessage(raw)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_not_modified_and_stop)

    self._worker_sockets['master_push'].send_multipart(request.serialize())
    self._io_loop.start()
def test_that_creating_extractor_works(self):
    """The worker extractor pipeline must flag extraction as finished."""
    self._settings.SPYDER_EXTRACTOR_PIPELINE = [
        'spyder.processor.limiter.DefaultLimiter',
    ]
    extractor = workerprocess.create_worker_extractor(
        self._settings, self._mgmt, self._ctx,
        StreamHandler(sys.stdout), self._io_loop)
    extractor.start()

    outgoing = DataMessage()
    outgoing.identity = "me"
    outgoing.curi = CrawlUri(
        url="http://localhost:80/robots.txt",
        effective_url="http://127.0.0.1:%s/robots.txt",
        optional_vars=dict(),
    )

    def verify_extraction_and_stop(raw):
        result = DataMessage(raw)
        self.assertEqual(
            CURI_OPTIONAL_TRUE,
            result.curi.optional_vars[CURI_EXTRACTION_FINISHED])
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(verify_extraction_and_stop)

    def verify_quit_ack(raw):
        # NOTE(review): compares the raw multipart message against the ack
        # constant — confirm the wire format makes them comparable.
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, raw)

    self._mgmt_sockets['master_sub'].on_recv(verify_quit_ack)

    self._worker_sockets['master_push'].send_multipart(outgoing.serialize())
    self._io_loop.start()

    # clean up the extractor's zmq streams and sockets
    extractor._out_stream.close()
    extractor._outsocket.close()
    extractor._in_stream.close()
    extractor._insocket.close()
def test_fetching_last_modified_works(self):
    """Conditional GET with a current Last-Modified must return 304."""
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    # derive the header value from the fixture file's modification time
    stat_result = os.stat(os.path.join(self._path, "robots.txt"))
    last_modified = serialize_date_time(
        datetime.fromtimestamp(stat_result.st_mtime))

    outgoing = DataMessage()
    outgoing.identity = "me"
    outgoing.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": last_modified})

    def expect_304_and_stop(raw):
        reply = DataMessage(raw)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(expect_304_and_stop)

    self._worker_sockets['master_push'].send_multipart(outgoing.serialize())
    self._io_loop.start()
def test_fetching_etag_works(self):
    """A matching Etag header must produce a 304 with an empty body."""
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={
            "Etag": "\"3926227169c58185234888b60000c6eb1169577d\""})

    def check_not_modified_and_stop(raw):
        reply = DataMessage(raw)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_not_modified_and_stop)

    self._worker_sockets['master_push'].send_multipart(request.serialize())
    self._io_loop.start()
def test_fetching_etag_works(self):
    """Conditional GET with a matching Etag must return 304, empty body."""
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    outgoing = DataMessage()
    outgoing.identity = "me"
    outgoing.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={
            "Etag": "\"3926227169c58185234888b60000c6eb1169577d\""})

    def expect_304_and_stop(raw):
        reply = DataMessage(raw)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(expect_304_and_stop)

    self._worker_sockets['master_push'].send_multipart(outgoing.serialize())
    self._io_loop.start()
def _receive(self, msg):
    """
    We have a message!

    Instead of the synchronous version we do not handle serializing and
    sending the result to the `self._outsocket`. This has to be handled
    by the `self._processing` method.
    """
    message = DataMessage(msg)

    try:
        self._processing(message, self._out_stream)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception. Only log the failure.
        self._logger.critical("Uncaught exception executing the worker!")
        self._logger.critical(traceback.format_exc())
def assert_correct_data_answer(msg2):
    # The echoed multipart message must deserialize to an equal DataMessage.
    received = DataMessage(msg2)
    self.assertEqual(msg, received)
def assert_correct_data(msg2):
    # Deserialized echo must equal the message that was sent.
    self.assertEqual(msg, DataMessage(msg2))