def test_status_codes_gt_200(self, objects, declare_queue, sync, cdobjects):
    """Process non-200 crawl responses and check ``sync`` is called for each.

    A ``CrawlDocument`` for the crawled URL is saved and returned by the
    ``cdobjects`` lookup. The same crawl response is then handed to
    ``self.crawl_manager.process_task`` twice: first with status code 500,
    then with status code 302. The test passes when ``sync`` has been
    called exactly twice.

    NOTE(review): ``objects``, ``declare_queue``, ``sync`` and ``cdobjects``
    are presumably mocks injected by patch decorators that are not visible
    in this chunk -- confirm against the decorators on this method.
    """
    # Host lookup resolves to the fixture host stored on the test case.
    objects.return_value.first.return_value = self.host
    declare_queue.side_effect = declare_queue_side_effect

    url = "http://example.com/home.html"
    crawl_response = {
        "url": url,
        "body": "<html></html>",
        "actions": ["index"],
        "status_code": 500,
        "headers": {
            "content-type": "text/html"
        },
        "crawl_time": datetime.now(timezone.utc).isoformat(),
    }

    # Existing crawl document that the manager finds for this URL.
    cd = CrawlDocument()
    cd.host = "example.com"
    cd.url = url
    cd.url_hash = hash_url(url)
    cd.latest_request = {
        "url": url,
        "cookies": {},
        "method": "GET",
        "actions": ["follow", "index"],
    }
    cd.save()
    cdobjects.return_value.first.return_value = cd

    # Same payload, two different non-200 status codes, in this order.
    for status_code in (500, 302):
        crawl_response["status_code"] = status_code
        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

    self.assertEqual(sync.call_count, 2)
def test_crawl_document(self):
    """Check that a saved ``CrawlDocument`` has ``url_hash`` matching its URL.

    The document is built with only ``url`` and ``host``; after ``save()``
    its ``url_hash`` must equal ``hash_url`` applied to the document's URL.
    """
    document = CrawlDocument(
        url="http://example.com",
        host="example.com",
    )
    document.save()

    stored_hash = document.url_hash
    expected_hash = hash_url(document.url)
    self.assertEqual(stored_hash, expected_hash)