def test_that_stopping_worker_via_mgmt_works(self):
    """A ZmqWorker must echo a data message and ACK a mgmt QUIT.

    One DataMessage is pushed to the worker; the echo processor must
    return it unchanged, and the management channel must deliver the
    QUIT acknowledgement.
    """
    worker = ZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt,
        self.echo_processing, StreamHandler(sys.stdout), logging.DEBUG,
        self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost")
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_correct_data_answer(msg2):
        # the echo processor must hand the message back unchanged
        self.assertEqual(msg, DataMessage(msg2))

    self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer)

    def assert_correct_mgmt_answer(msg3):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg3.data)

    # BUG FIX: this previously registered assert_correct_data_answer,
    # leaving assert_correct_mgmt_answer dead code and the QUIT ACK
    # completely unchecked.
    self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    self._io_loop.start()
def test_fetching_works(self):
    """A FetchProcessor must download robots.txt from the test server.

    The fetched body is compared against the on-disk fixture; when it
    matches, a mgmt QUIT is broadcast so the io_loop can wind down.
    """
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)

    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt, fetcher,
        StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port)
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        answer = DataMessage(raw_msg)
        # BUG FIX: the fixture file was opened without ever being
        # closed; the context manager releases the handle deterministically.
        with open(os.path.join(os.path.dirname(__file__),
                "static/robots.txt")) as robots_file:
            robots = robots_file.read()
        self.assertEqual(robots, answer.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)

    self._io_loop.start()
def test_fetching_works(self):
    """A FetchProcessor must download robots.txt from the test server.

    The fetched body is compared against the on-disk fixture; when it
    matches, a mgmt QUIT is broadcast so the io_loop can wind down.
    """
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)

    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt, fetcher,
        StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port)
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        answer = DataMessage(raw_msg)
        # BUG FIX: the fixture file was opened without ever being
        # closed; the context manager releases the handle deterministically.
        with open(os.path.join(os.path.dirname(__file__),
                "static/robots.txt")) as robots_file:
            robots = robots_file.read()
        self.assertEqual(robots, answer.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)

    self._io_loop.start()
def test_that_async_worker_works(self):
    """An AsyncZmqWorker must echo a data message and ACK a mgmt QUIT."""
    worker = AsyncZmqWorker(self._worker_sockets["worker_pull"],
        self._worker_sockets["worker_pub"], self._mgmt,
        self.echo_processing, StreamHandler(sys.stdout), logging.DEBUG,
        self._io_loop)
    worker.start()

    data_msg = DataMessage()
    data_msg.identity = "me"
    data_msg.curi = CrawlUri(url="http://localhost")

    def check_echoed_data(raw):
        # the echo processor must hand the message back unchanged
        self.assertEqual(data_msg, DataMessage(raw))

    self._worker_sockets["master_sub"].on_recv(check_echoed_data)

    def check_quit_ack(mgmt_msg):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, mgmt_msg.data)

    self._mgmt_sockets["master_sub"].on_recv(check_quit_ack)

    self._worker_sockets["master_push"].send_multipart(data_msg.serialize())

    self._io_loop.start()
    worker._in_stream.flush()
def test_that_stopping_worker_via_mgmt_works(self):
    """A ZmqWorker must echo a data message and ACK a mgmt QUIT.

    One DataMessage is pushed to the worker; the echo processor must
    return it unchanged, and the management channel must deliver the
    QUIT acknowledgement.
    """
    worker = ZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt,
        self.echo_processing, StreamHandler(sys.stdout), logging.DEBUG,
        self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost")
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_correct_data_answer(msg2):
        # the echo processor must hand the message back unchanged
        self.assertEqual(msg, DataMessage(msg2))

    self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer)

    def assert_correct_mgmt_answer(msg3):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg3.data)

    # BUG FIX: this previously registered assert_correct_data_answer,
    # leaving assert_correct_mgmt_answer dead code and the QUIT ACK
    # completely unchecked.
    self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    self._io_loop.start()
def test_that_creating_extractor_works(self):
    """workerprocess.create_worker_extractor must yield a working worker.

    The extractor pipeline (a single DefaultLimiter) must mark the CrawlUri
    as extraction-finished; afterwards a mgmt QUIT is broadcast and the
    extractor's streams and sockets are torn down.
    """
    self._settings.SPYDER_EXTRACTOR_PIPELINE = [
        'spyder.processor.limiter.DefaultLimiter',
    ]
    extractor = workerprocess.create_worker_extractor(self._settings,
        self._mgmt, self._ctx, StreamHandler(sys.stdout), self._io_loop)
    extractor.start()

    curi = CrawlUri(url="http://localhost:80/robots.txt",
        effective_url="http://127.0.0.1:%s/robots.txt",
        optional_vars=dict())
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_expected_result_and_stop(raw_msg):
        msg2 = DataMessage(raw_msg)
        self.assertEqual(CURI_OPTIONAL_TRUE,
            msg2.curi.optional_vars[CURI_EXTRACTION_FINISHED])
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)

    def assert_correct_mgmt_message(raw_msg):
        # BUG FIX: raw_msg is the raw multipart frame list delivered by
        # on_recv; it was compared to the ACK constant directly, which can
        # never match. Parse it and compare the payload, as the sibling
        # tests in this file do.
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK,
            MgmtMessage(raw_msg).data)

    self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_message)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    self._io_loop.start()

    # explicit teardown of the extractor's zmq plumbing
    extractor._out_stream.close()
    extractor._outsocket.close()
    extractor._in_stream.close()
    extractor._insocket.close()
def test_fetching_last_modified_works(self):
    """A request carrying the fixture's Last-Modified must yield a 304."""
    fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt, fetcher,
        StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    # send the fixture's own mtime so the server answers Not Modified
    robots_stat = os.stat(os.path.join(self._path, "robots.txt"))
    mtime = serialize_date_time(datetime.fromtimestamp(robots_stat.st_mtime))

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": mtime})

    def check_not_modified_and_stop(raw):
        answer = DataMessage(raw)
        self.assertEqual(304, answer.curi.status_code)
        self.assertEqual("", answer.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_not_modified_and_stop)
    self._worker_sockets['master_push'].send_multipart(request.serialize())

    self._io_loop.start()
def test_that_creating_extractor_works(self):
    """workerprocess.create_worker_extractor must yield a working worker.

    The extractor pipeline (a single DefaultLimiter) must mark the CrawlUri
    as extraction-finished; afterwards a mgmt QUIT is broadcast and the
    extractor's streams and sockets are torn down.
    """
    self._settings.SPYDER_EXTRACTOR_PIPELINE = [
        'spyder.processor.limiter.DefaultLimiter',
    ]
    extractor = workerprocess.create_worker_extractor(self._settings,
        self._mgmt, self._ctx, StreamHandler(sys.stdout), self._io_loop)
    extractor.start()

    curi = CrawlUri(url="http://localhost:80/robots.txt",
        effective_url="http://127.0.0.1:%s/robots.txt",
        optional_vars=dict())
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_expected_result_and_stop(raw_msg):
        msg2 = DataMessage(raw_msg)
        self.assertEqual(CURI_OPTIONAL_TRUE,
            msg2.curi.optional_vars[CURI_EXTRACTION_FINISHED])
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)

    def assert_correct_mgmt_message(raw_msg):
        # BUG FIX: raw_msg is the raw multipart frame list delivered by
        # on_recv; it was compared to the ACK constant directly, which can
        # never match. Parse it and compare the payload, as the sibling
        # tests in this file do.
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK,
            MgmtMessage(raw_msg).data)

    self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_message)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    self._io_loop.start()

    # explicit teardown of the extractor's zmq plumbing
    extractor._out_stream.close()
    extractor._outsocket.close()
    extractor._in_stream.close()
    extractor._insocket.close()
def test_fetching_last_modified_works(self):
    """A request carrying the fixture's Last-Modified must yield a 304."""
    fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt, fetcher,
        StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    # send the fixture's own mtime so the server answers Not Modified
    robots_stat = os.stat(os.path.join(self._path, "robots.txt"))
    mtime = serialize_date_time(datetime.fromtimestamp(robots_stat.st_mtime))

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": mtime})

    def check_not_modified_and_stop(raw):
        answer = DataMessage(raw)
        self.assertEqual(304, answer.curi.status_code)
        self.assertEqual("", answer.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_not_modified_and_stop)
    self._worker_sockets['master_push'].send_multipart(request.serialize())

    self._io_loop.start()
def test_fetching_etag_works(self):
    """A request carrying the fixture's Etag must yield a 304."""
    fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt, fetcher,
        StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""})

    def check_not_modified_and_stop(raw):
        answer = DataMessage(raw)
        self.assertEqual(304, answer.curi.status_code)
        self.assertEqual("", answer.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_not_modified_and_stop)
    self._worker_sockets['master_push'].send_multipart(request.serialize())

    self._io_loop.start()
def test_fetching_etag_works(self):
    """A request carrying the fixture's Etag must yield a 304."""
    fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'], self._mgmt, fetcher,
        StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""})

    def check_not_modified_and_stop(raw):
        answer = DataMessage(raw)
        self.assertEqual(304, answer.curi.status_code)
        self.assertEqual("", answer.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(check_not_modified_and_stop)
    self._worker_sockets['master_push'].send_multipart(request.serialize())

    self._io_loop.start()