# Exemplo n.º 1
# 0
    def _test(self):
        """Run CrawlHandler._process over every case in the test set.

        For each test message: configures the crawl policy, invokes the
        handler, then verifies the resulting urlRepository document field
        by field and checks any captured outgoing message.
        """
        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.urlRepository.drop()
        # Stub downstream dispatch so the emitted message can be captured.
        ccrawler.handler.handler.HandlerRepository.process = CrawlHandlerTest.process

        handler = crawl_handler.CrawlHandler()
        for test_data in CrawlHandlerTest.test_set:
            url = test_data["message"]["url"]
            common_settings.general_crawl_policies["external_crawl_mode"] = test_data.get("external_crawl_mode", "continue")
            handler._process(test_data["message"])

            # Fields every freshly-discovered url record must carry.
            expected_url_info = test_data["expected_url_info"]
            expected_url_info["url"] = url
            expected_url_info["source"] = test_data["message"]["source"]
            expected_url_info["url_class"] = None
            expected_url_info["error_message"] = None
            expected_url_info["crawled_count"] = 0
            expected_url_info["last_crawled"] = None
            expected_url_info["crawl_status"] = "crawling"
            expected_url_info["page_last_modified"] = None
            expected_url_info["original_url"] = None
            # Default unless the test case pinned a specific count.
            expected_url_info.setdefault("discovered_count", 1)

            url_info = crawlerdb.db.urlRepository.find_one({"url" : url})
            self.assertIsNotNone(url_info)
            self.assertIsNotNone(url_info["last_discovered"])
            self.assertIsNotNone(url_info["created_time"])
            for field, value in expected_url_info.items():
                self.assertEqual(value, url_info[field], "%s,%s,%s" % (url, field, test_data))
            self.assertEqual(test_data["new_msg"], CrawlHandlerTest.mock_message_output is not None)
            if test_data["new_msg"]:
                # Consume the captured message so the next iteration starts clean.
                msg = CrawlHandlerTest.mock_message_output
                CrawlHandlerTest.mock_message_output = None
                self.assertEqual(msg["url"], url)
                self.assertEqual(msg["page_last_modified"], expected_url_info["page_last_modified"])
                self.assertEqual(msg["__priority"], expected_url_info["crawl_priority"])

        crawlerdb.db.urlRepository.drop()
    def test(self):
        """End-to-end check of CrawlerResponseHandler over the test set.

        Seeds the crawl-domain whitelist and initial crawl requests, then
        feeds each response message through the handler and verifies the
        stored url document, raw doc, redirect count, and any captured
        outgoing message.
        """
        common_settings.mqclient = CrawlerResponseHandlerTest.mqclient
        ccrawler.handler.handler.HandlerRepository.init({})
        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.urlRepository.drop()
        crawlerdb.db.rawDocs.drop()
        crawlerdb.db.urlRedirects.drop()
        crawlerdb.db.crawlDomainWhitelist.drop()
        for url in CrawlerResponseHandlerTest.whitelist:
            crawlerdb.add_crawl_domain_whitelist(url)

        # Seed url records through real crawl_request handling first.
        for message in CrawlerResponseHandlerTest.initial_set:
            ccrawler.handler.handler.HandlerRepository.process("crawl_request", message)

        # Stub decoding and downstream dispatch so output can be captured.
        decoder.decode = CrawlerResponseHandlerTest.decode
        ccrawler.handler.handler.HandlerRepository.process = CrawlerResponseHandlerTest.process

        handler = CrawlerResponseHandler()
        for i, test_data in enumerate(CrawlerResponseHandlerTest.test_set):
            common_settings.general_crawl_policies["external_crawl_mode"] = test_data.get("external_crawl_mode", "continue")
            common_settings.general_crawl_policies["url_match_target"] = "whitelist"
            url = test_data["message"]["url"]
            print("%d %s" % (i, url))
            # A redirect case may expect the record under a different url.
            if "url" in test_data["expected_url_info"]:
                url = test_data["expected_url_info"]["url"]

            handler._process(test_data["message"])

            expected_url_info = test_data["expected_url_info"]
            expected_url_info["url"] = url
            #expected_url_info["source"] = test_data["message"]["source"]
            expected_url_info["url_class"] = None
            expected_url_info["parent_url"] = None
            expected_url_info["root_url"] = test_data["message"]["original_url"]
            # Defaults for fields the test case did not pin down explicitly.
            expected_url_info.setdefault("original_url", None)
            expected_url_info.setdefault("crawl_priority", 3)
            expected_url_info.setdefault("crawl_depth", 3)
            expected_url_info.setdefault("discovered_count", 1)
            expected_url_info.setdefault("modified_count", 0)

            url_info = crawlerdb.db.urlRepository.find_one({"url" : url})
            self.assertIsNotNone(url_info)
            self.assertIsNotNone(url_info["last_discovered"])
            self.assertIsNotNone(url_info["created_time"])
            if "expected_raw_doc" not in test_data:
                self.assertIsNotNone(url_info["last_modified"])
                self.assertIsNotNone(url_info["first_modified"])

            for field, value in expected_url_info.items():
                self.assertEqual(value, url_info[field], "%d,%s,%s,%s" % (i, field, value, url_info[field]))

            raw_doc = crawlerdb.db.rawDocs.find_one({"url" : url})
            # Absent key defaults to {} (non-None), meaning "a raw doc must exist".
            test_data["expected_raw_doc"] = test_data.get("expected_raw_doc", {})
            if test_data["expected_raw_doc"] is None:
                self.assertIsNone(raw_doc)
            else:
                self.assertIsNotNone(raw_doc)
                self._test_raw_doc(test_data["expected_raw_doc"], test_data["message"], raw_doc)

            self.assertEqual(crawlerdb.db.urlRedirects.count(), test_data["url_redirects_count"],
                "%d,%d,%d" % (i, crawlerdb.db.urlRedirects.count(), test_data["url_redirects_count"]))

            self.assertEqual(test_data["new_msg"], CrawlerResponseHandlerTest.mock_message_output is not None)
            if test_data["new_msg"]:
                # Consume the captured message so the next iteration starts clean.
                msg = CrawlerResponseHandlerTest.mock_message_output
                CrawlerResponseHandlerTest.mock_message_output = None
                self.assertEqual(msg["url"], url)

            # Mock that the crawl of this url finished.
            crawlerdb.db.urlRepository.update({"url" : url}, {"$set" : {"crawl_status" : "alive"}})

        crawlerdb.db.urlRepository.drop()
        crawlerdb.db.rawDocs.drop()
        crawlerdb.db.urlRedirects.drop()