Пример #1
0
    def test_status_codes_gt_200(self, objects, declare_queue, sync,
                                 cdobjects):
        """Process a 500 and then a 302 crawl response for a known document.

        Feeds the crawl manager two serialized responses for the same URL,
        first with status code 500 and then with 302, and asserts that
        ``sync`` was called exactly twice by the end.

        NOTE(review): ``objects``, ``declare_queue``, ``sync`` and
        ``cdobjects`` are presumably mocks injected by patch decorators that
        are not visible in this chunk -- confirm against the decorators.
        """
        # Host lookup: ``objects(...).first()`` yields the fixture host.
        host_query = objects.return_value
        host_query.first.return_value = self.host

        declare_queue.side_effect = declare_queue_side_effect

        crawl_response = {
            "url": "http://example.com/home.html",
            "body": "<html></html>",
            "actions": ["index"],
            "status_code": 500,
            "headers": {
                "content-type": "text/html"
            },
            "crawl_time": datetime.now(timezone.utc).isoformat()
        }

        # Existing crawl document for the same URL as the response.
        cd = CrawlDocument()
        cd.host = "example.com"
        cd.url = crawl_response['url']
        cd.url_hash = hash_url(crawl_response['url'])
        cd.latest_request = {
            "url": crawl_response['url'],
            "cookies": {},
            "method": "GET",
            "actions": ["follow", "index"]
        }
        cd.save()

        # Document lookup: ``cdobjects(...).first()`` yields the saved doc.
        doc_query = cdobjects.return_value
        doc_query.first.return_value = cd

        # First pass: status code 500.
        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

        # Second pass: same payload, status code switched to 302.
        crawl_response['status_code'] = 302

        self.crawl_manager.process_task(
            ujson.dumps(crawl_response).encode("utf8")
        )

        self.assertEqual(sync.call_count, 2)
Пример #2
0
    def test_crawl_document(self):
        """After ``save()``, ``url_hash`` equals ``hash_url`` of the URL."""
        document = CrawlDocument(url="http://example.com", host="example.com")
        document.save()

        stored_hash = document.url_hash
        expected_hash = hash_url(document.url)
        self.assertEqual(stored_hash, expected_hash)