def test_fetching_works(self):
    """End-to-end check: an AsyncZmqWorker backed by a FetchProcessor
    downloads /robots.txt from the local test server and publishes the
    fetched body back on the worker PUB socket.

    The assertion callback also sends the management QUIT message so the
    io_loop shuts the worker down after the check runs.
    """
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'], self._mgmt, fetcher,
            StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    worker.start()

    curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
    )
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi
    # Queued now; delivery happens once the io_loop starts below.
    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        msg = DataMessage(raw_msg)
        # FIX: close the fixture file deterministically instead of
        # leaking the handle from a bare open(...).read().
        with open(os.path.join(os.path.dirname(__file__),
                "static/robots.txt")) as robots_file:
            robots = robots_file.read()
        self.assertEqual(robots, msg.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)
    self._io_loop.start()
def test_that_async_worker_works(self):
    """Round-trip a DataMessage through an echo AsyncZmqWorker.

    The echoed message must compare equal to the one sent, and the
    management channel must acknowledge the worker quit.

    NOTE(review): a method with this exact name is defined twice in the
    class; the later definition shadows this one at class-creation time.
    """
    echo_worker = AsyncZmqWorker(
        self._worker_sockets["worker_pull"],
        self._worker_sockets["worker_pub"],
        self._mgmt,
        self.echo_processing,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop,
    )
    echo_worker.start()

    outgoing = DataMessage()
    outgoing.identity = "me"
    outgoing.curi = CrawlUri(url="http://localhost")

    def check_echoed_payload(raw):
        self.assertEqual(outgoing, DataMessage(raw))

    def check_quit_ack(mgmt_msg):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, mgmt_msg.data)

    self._worker_sockets["master_sub"].on_recv(check_echoed_payload)
    self._mgmt_sockets["master_sub"].on_recv(check_quit_ack)

    self._worker_sockets["master_push"].send_multipart(outgoing.serialize())
    self._io_loop.start()
    # Drain anything still buffered on the worker's inbound stream.
    echo_worker._in_stream.flush()
def test_that_async_worker_works(self):
    """Verify the echo worker: what goes in on the PUSH socket must come
    back unchanged on the SUB socket, and quitting must be ACKed."""
    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'], self._mgmt,
            self.echo_processing, StreamHandler(sys.stdout),
            logging.DEBUG, self._io_loop)
    worker.start()

    sent = DataMessage()
    sent.identity = "me"
    sent.curi = CrawlUri(url="http://localhost")

    def on_data(frames):
        received = DataMessage(frames)
        self.assertEqual(sent, received)

    def on_mgmt(reply):
        self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, reply.data)

    # Register both callbacks before the loop starts processing events.
    self._worker_sockets['master_sub'].on_recv(on_data)
    self._mgmt_sockets['master_sub'].on_recv(on_mgmt)

    self._worker_sockets['master_push'].send_multipart(sent.serialize())
    self._io_loop.start()
    worker._in_stream.flush()
def test_fetching_works(self):
    """End-to-end check: a fetch worker downloads /robots.txt from the
    local test server and the published body matches the on-disk fixture.

    The assertion callback emits the management QUIT message so the
    io_loop stops after the comparison.
    """
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(
        self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'],
        self._mgmt,
        fetcher,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop)
    worker.start()

    curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
    )
    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi
    self._worker_sockets['master_push'].send_multipart(msg.serialize())

    def assert_expected_result_and_stop(raw_msg):
        msg = DataMessage(raw_msg)
        # FIX: read the fixture via a context manager so the file
        # handle is closed instead of leaked.
        with open(os.path.join(os.path.dirname(__file__),
                "static/robots.txt")) as fixture:
            robots = fixture.read()
        self.assertEqual(robots, msg.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop)
    self._io_loop.start()
def test_fetching_last_modified_works(self):
    """A conditional GET with If-Modified-Since set to the fixture's
    mtime must yield a 304 with an empty body."""
    settings = Settings()
    processor = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(
        self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'],
        self._mgmt,
        processor,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop)
    worker.start()

    # Use the fixture's own modification time so the server reports
    # the resource as unchanged.
    robots_path = os.path.join(self._path, "robots.txt")
    mtime = serialize_date_time(
        datetime.fromtimestamp(os.stat(robots_path).st_mtime))

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": mtime})

    def assert_expected_result_and_stop(raw_msg):
        reply = DataMessage(raw_msg)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop)
    self._worker_sockets['master_push'].send_multipart(request.serialize())
    self._io_loop.start()
def test_fetching_last_modified_works(self):
    """Conditional fetch: sending the fixture's Last-Modified timestamp
    must produce a 304 Not Modified with no body."""
    settings = Settings()
    fetch_proc = FetchProcessor(settings, io_loop=self._io_loop)
    fetch_worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'], self._mgmt, fetch_proc,
            StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    fetch_worker.start()

    stamp = os.stat(os.path.join(self._path, "robots.txt")).st_mtime
    last_modified = serialize_date_time(datetime.fromtimestamp(stamp))

    outgoing = DataMessage()
    outgoing.identity = "me"
    outgoing.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Last-Modified": last_modified})

    def check_not_modified_and_quit(frames):
        answer = DataMessage(frames)
        self.assertEqual(304, answer.curi.status_code)
        self.assertEqual("", answer.curi.content_body)
        # Tear the worker down so io_loop.start() returns.
        quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(
            quit_msg.serialize())

    self._worker_sockets['master_sub'].on_recv(
        check_not_modified_and_quit)
    self._worker_sockets['master_push'].send_multipart(
        outgoing.serialize())
    self._io_loop.start()
def test_fetching_etag_works(self):
    """A conditional GET carrying the fixture's known ETag must come
    back as 304 with an empty body."""
    settings = Settings()
    processor = FetchProcessor(settings, io_loop=self._io_loop)
    worker = AsyncZmqWorker(
        self._worker_sockets['worker_pull'],
        self._worker_sockets['worker_pub'],
        self._mgmt,
        processor,
        StreamHandler(sys.stdout),
        logging.DEBUG,
        self._io_loop)
    worker.start()

    request = DataMessage()
    request.identity = "me"
    request.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""})

    def assert_expected_result_and_stop(raw_msg):
        reply = DataMessage(raw_msg)
        self.assertEqual(304, reply.curi.status_code)
        self.assertEqual("", reply.curi.content_body)
        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop)
    self._worker_sockets['master_push'].send_multipart(request.serialize())
    self._io_loop.start()
def test_fetching_etag_works(self):
    """ETag-based conditional fetch: the server must answer 304 Not
    Modified and the published CrawlUri must have no body."""
    settings = Settings()
    fetch_proc = FetchProcessor(settings, io_loop=self._io_loop)
    fetch_worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'], self._mgmt, fetch_proc,
            StreamHandler(sys.stdout), logging.DEBUG, self._io_loop)
    fetch_worker.start()

    known_etag = "\"3926227169c58185234888b60000c6eb1169577d\""
    outgoing = DataMessage()
    outgoing.identity = "me"
    outgoing.curi = CrawlUri(
        url="http://localhost:%s/robots.txt" % self.port,
        effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        req_header={"Etag": known_etag})

    def check_not_modified_and_quit(frames):
        answer = DataMessage(frames)
        self.assertEqual(304, answer.curi.status_code)
        self.assertEqual("", answer.curi.content_body)
        # Shut the worker down so io_loop.start() returns.
        quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(
            quit_msg.serialize())

    self._worker_sockets['master_sub'].on_recv(
        check_not_modified_and_quit)
    self._worker_sockets['master_push'].send_multipart(
        outgoing.serialize())
    self._io_loop.start()
def create_worker_fetcher(settings, mgmt, zmq_context, log_handler, io_loop):
    """
    Create and return a new `Worker Fetcher`.

    Wires a PULL socket (connected to the fetcher pull endpoint) and a
    PUSH socket (HWM-capped, bound to the fetcher push endpoint) into an
    AsyncZmqWorker driven by a FetchProcessor.
    """
    incoming = zmq_context.socket(zmq.PULL)
    incoming.connect(settings.ZEROMQ_WORKER_PROC_FETCHER_PULL)

    outgoing = zmq_context.socket(zmq.PUSH)
    # Cap the outbound queue so a slow consumer applies back-pressure.
    outgoing.setsockopt(zmq.HWM,
        settings.ZEROMQ_WORKER_PROC_FETCHER_PUSH_HWM)
    outgoing.bind(settings.ZEROMQ_WORKER_PROC_FETCHER_PUSH)

    return AsyncZmqWorker(incoming, outgoing, mgmt,
        FetchProcessor(settings, io_loop), log_handler,
        settings.LOG_LEVEL_WORKER, io_loop)