def test_build_request(self):
    """Check the three input cases of ``Request.of`` exercised here.

    A URL string yields a request exposing that URL, an existing request is
    returned as the same object, and an ``int`` is rejected with ``TypeError``.
    """
    from_url = Request.of("http://test")
    self.assertEqual(from_url.url, "http://test")

    # Passing a request back in must hand out the very same object.
    passthrough = Request.of(from_url)
    self.assertIs(passthrough, from_url)

    with self.assertRaises(TypeError):
        Request.of(1)
def test_recover(self):
    """Stash a spider after one crawl and recover it into a fresh instance.

    The final assertion expects the recovered spider's ``crawled_filter`` to
    contain ``http://localhost:5000/a.html`` (presumably a page reached while
    crawling ``/test_extract`` -- confirm against the test server).
    """
    original = RecoverMySpider()
    widened_filter = original.filter + CustomFilter(lambda x: True)
    original.filter = widened_filter
    original.set_session(env.session)

    start_request = Request.of("http://localhost:5000/test_extract")
    # list() drains whatever run_and_get_result yields before stashing.
    list(run_and_get_result(original.crawl(start_request)))

    original.stash(EXE_PATH)
    del original

    recovered_spider = RecoverMySpider()
    recovered_spider.recover(EXE_PATH)

    expected_request = Request.of("http://localhost:5000/a.html")
    was_crawled = recovered_spider.crawled_filter.contains(expected_request)
    self.assertTrue(was_crawled)
def test_recover_queue(self):
    """Stash a queue of three requests and verify all are recovered in order.

    The recovered URLs are collected and compared against the full expected
    list, so the test fails both when the order is wrong and when a recovery
    silently drops items (a per-item check inside the drain loop alone would
    still pass if only a prefix came back).
    """
    expected_urls = ["1", "2", "3"]

    queue = RecoverableRequestQueue()
    for url in expected_urls:
        queue.put(Request.of(url))
    queue.stash(EXE_PATH)
    del queue

    recovered_queue = RecoverableRequestQueue()
    recovered_queue.recover(EXE_PATH)
    self.assertFalse(recovered_queue.empty())

    recovered_urls = []
    while not recovered_queue.empty():
        recovered_urls.append(recovered_queue.get().url)

    # Pins both the order and the number of recovered requests.
    self.assertEqual(recovered_urls, expected_urls)
def test_auto_save(self):
    """Check when the action registered on the task gets invoked.

    The registered action clears ``test_data``. The test expects the dict to
    be untouched after five runs and empty after ten (presumably tied to
    ``auto_save_frequency = 10`` -- confirm in CountDownRecoverableTask).
    """
    target_url = "http://localhost:5000/test_extract"
    test_data = {"key": 1}

    spider = MySpider()
    spider.set_session(mock_env.env.session)
    spider.start_targets = [target_url]
    spider.auto_save_frequency = 10

    task = CountDownRecoverableTask(spider)
    task.add_actions(lambda: test_data.clear())

    def run_five_rounds():
        # Each round runs the task once, then queues the target again.
        for _ in range(5):
            mock_env.env.loop.run_until_complete(task.run())
            task._request_queue.put(Request.of(target_url))

    run_five_rounds()
    self.assertEqual(test_data, {"key": 1})

    run_five_rounds()
    self.assertEqual(test_data, {})
def test_stash(self):
    """Stash a task to ``.task_stash`` and recover it into a fresh task.

    After recovery the test expects the recovered task's spider to report
    the start target as present in its ``crawled_filter``.
    """
    stash_dir = ".task_stash"

    spider = MySpider()
    spider.start_targets = ["http://localhost:5000/test_extract"]
    task = RecoverableTask(spider)

    # exist_ok=True replaces the racy ``exists(...) or makedirs(...)``
    # expression; this relies on ``makedirs`` being ``os.makedirs``.
    makedirs(stash_dir, exist_ok=True)
    task.stash(stash_dir)
    del task

    recovered_task = RecoverableTask(MySpider())
    self.assertTrue(recovered_task.can_recover(stash_dir))
    recovered_task.recover(stash_dir)

    start_request = Request.of("http://localhost:5000/test_extract")
    self.assertTrue(
        recovered_task.spider.crawled_filter.contains(start_request))
def test_crawled_spider(self):
    """Check ``crawled_filter`` on a fresh ``MySpider``.

    Assigning the spider's ``filter`` to ``crawled_filter`` is expected to
    raise ``TypeError``, and the crawled filter is expected not to accept the
    ``/test_extract`` request.
    """
    spider = MySpider()

    with self.assertRaises(TypeError):
        spider.crawled_filter = spider.filter

    request = Request.of("http://localhost:5000/test_extract")
    accepted = spider.crawled_filter.accept(request)
    self.assertFalse(accepted)
def test_reg_filter(self):
    """Check that a ``URLRegFilter`` built from ``http://`` accepts an http URL."""
    url_filter = URLRegFilter(r"http://")
    request = Request.of("http://www.baidu.com")
    self.assertTrue(url_filter.accept(request))
def assert_false(self, r, url):
    """Assert that ``r.accept`` is falsy for the request built from ``url``."""
    request = Request.of(url)
    accepted = r.accept(request)
    self.assertFalse(accepted)
def assert_true(self, r, url):
    """Assert that ``r.accept`` is truthy for the request built from ``url``."""
    request = Request.of(url)
    accepted = r.accept(request)
    self.assertTrue(accepted)
def from_url_or_request(url: str):
    """Return the result of ``Request.of(url)``.

    NOTE(review): the function name suggests that an existing request may
    also be passed, while the annotation only names ``str`` -- confirm the
    intended contract against ``Request.of``.
    """
    request = Request.of(url)
    return request