Example #1
    def test_get_crawl_priority_and_depth(self):
        test_whitelist = {
            "http://www.google.com" : {},
            "http://www.sina.com.cn" : {"crawl_priority" : 2},
            "http://www.sina.cn" : {"crawl_priority" : 2, "crawl_depth" : 3},
        }

        # Each tuple: (url, source, expected_priority, expected_depth).
        test_set = [
            ("http://www.google.jp", "offline", 1, 2), #not whitelist, domain, offline
            ("http://news.sina.com.cn", "online", 2, 0), #whitelist, subdomain, online
            ("http://news.sina.com", "online", 0, 0), #not whitelist, subdomain, online
            ("http://news.sina.cn/a/b", "offline", 2, 3), #whitelist, others, offline
        ]

        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.crawlDomainWhitelist.drop()
        for url, config in test_whitelist.items():
            crawlerdb.save_crawl_domain_info(url,
                crawl_priority=config.get("crawl_priority", -1),
                crawl_depth=config.get("crawl_depth", -1))

        for url, source, expected_priority, expected_depth in test_set:
            print url
            priority, depth = CrawlUrlHelper.get_crawl_priority_and_depth(url, source)
            self.assertEqual(priority, expected_priority)
            self.assertEqual(depth, expected_depth)

        crawlerdb.db.crawlDomainWhitelist.drop()
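The test above drives CrawlUrlHelper.get_crawl_priority_and_depth: whitelisted domains take their stored crawl_priority/crawl_depth, and everything else falls back to defaults that depend on the message source. The helper below is a minimal sketch of that resolution order, assuming a hypothetical whitelist_lookup callable and default values chosen only to mirror the expectations in the test data; it is not the project's actual implementation.

from urlparse import urlparse  # Python 2; use urllib.parse on Python 3

# Assumed source-dependent defaults, picked to match the test expectations above.
DEFAULTS = {
    "online": (0, 0),   # (crawl_priority, crawl_depth)
    "offline": (1, 2),
}

def get_crawl_priority_and_depth_sketch(url, source, whitelist_lookup):
    # whitelist_lookup is a hypothetical callable returning the stored whitelist
    # entry (a dict) for the URL's domain, or None when the domain is not listed.
    entry = whitelist_lookup(urlparse(url).netloc)
    default_priority, default_depth = DEFAULTS[source]
    if entry is None:
        return default_priority, default_depth
    return (entry.get("crawl_priority", default_priority),
            entry.get("crawl_depth", default_depth))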
Example #2
    def test_match_whitelist(self):
        test_whitelist = [
            "http://www.google.com",
            "http://www.sina.com.cn",
            "http://www.sina.cn",
        ]

        # Each tuple: (url, url_match_domain_type, expected valid_crawl_url result).
        test_set = [
            ("http://www.google.jp", "domain", False),
            ("http://www.google.jp", "full_domain", False),
            ("http://news.sina.com.cn", "full_domain", False),
            ("http://news.sina.com", "full_domain", False),
            ("http://news.sina.com.cn", "host", False),
            ("http://news.sina.cn", "domain", False),
            ("http://news.sina.com.cn/a/b", "host", False),
            ("http://3g.sina.cn/a/b", "domain", False),
        ]

        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.crawlDomainWhitelist.drop()
        for url in test_whitelist:
            crawlerdb.save_crawl_domain_info(url)

        for url, match_section, expected in test_set:
            common_settings.general_crawl_policies["url_match_domain_type"] = match_section
            common_settings.general_crawl_policies["url_match_target"] = "whitelist"
            self.assertEqual(CrawlUrlHelper.valid_crawl_url(url, None), expected)

        crawlerdb.db.crawlDomainWhitelist.drop()
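Example #2 toggles the url_match_domain_type and url_match_target policies before calling CrawlUrlHelper.valid_crawl_url, so the helper presumably extracts a comparison key from the URL according to the match type and checks it against the whitelist. The snippet below is only a rough sketch of that idea, assuming the whitelist is stored as hostnames; the extraction rules shown for "host", "full_domain" and "domain" are guesses based on the test data, not the real helper.

from urlparse import urlparse  # Python 2; use urllib.parse on Python 3

def match_section_sketch(url, match_type):
    # Illustrative only: choose the URL component compared against the whitelist.
    host = urlparse(url).netloc          # e.g. "news.sina.com.cn"
    labels = host.split(".")
    if match_type == "host":
        return host                      # exact hostname
    if match_type == "full_domain":
        return ".".join(labels[-3:])     # assumed: keep the last three labels
    return ".".join(labels[-2:])         # "domain": assumed registered domain

def valid_crawl_url_sketch(url, whitelist_hosts, match_type):
    # Whitelist-only check; the real helper also honours url_match_target.
    return match_section_sketch(url, match_type) in whitelist_hosts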
Example #3
    def _test(self):
        """Drive CrawlHandler._process over the class-level test_set and check
        the resulting urlRepository documents and any emitted message."""
        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.urlRepository.drop()
        ccrawler.handler.handler.HandlerRepository.process = CrawlHandlerTest.process

        handler = crawl_handler.CrawlHandler()
        for i in range(len(CrawlHandlerTest.test_set)):
            test_data = CrawlHandlerTest.test_set[i]
            url = test_data["message"]["url"]
            common_settings.general_crawl_policies["external_crawl_mode"] = test_data.get("external_crawl_mode", "continue")
            handler._process(test_data["message"])

            expected_url_info = test_data["expected_url_info"]
            expected_url_info["url"] = url
            expected_url_info["source"] = test_data["message"]["source"]
            expected_url_info["url_class"] = None
            expected_url_info["error_message"] = None
            expected_url_info["crawled_count"] = 0
            expected_url_info["last_crawled"] = None
            expected_url_info["crawl_status"] = "crawling"
            expected_url_info["page_last_modified"] = None
            expected_url_info["original_url"] = None
            if "discovered_count" not in expected_url_info:
                expected_url_info["discovered_count"] = 1

            url_info = crawlerdb.db.urlRepository.find_one({"url" : url})
            self.assertTrue(url_info is not None)
            self.assertTrue(url_info["last_discovered"] is not None)
            self.assertTrue(url_info["created_time"] is not None)
            for field, value in expected_url_info.items():
                self.assertTrue(value == url_info[field], "%s,%s,%s" % (url, field, test_data))
            self.assertTrue(test_data["new_msg"] == (CrawlHandlerTest.mock_message_output is not None))
            if test_data["new_msg"]:
                msg = CrawlHandlerTest.mock_message_output
                CrawlHandlerTest.mock_message_output = None
                self.assertTrue(msg["url"] == url)
                self.assertTrue(msg["page_last_modified"] == expected_url_info["page_last_modified"])
                self.assertTrue(msg["__priority"] == expected_url_info["crawl_priority"])

        crawlerdb.db.urlRepository.drop()

    def test(self):
        """Feed crawler responses through CrawlerResponseHandler and verify
        urlRepository, rawDocs, urlRedirects and any follow-up message."""
        common_settings.mqclient = CrawlerResponseHandlerTest.mqclient
        ccrawler.handler.handler.HandlerRepository.init({})
        crawlerdb.config("localhost", database="test_db")
        crawlerdb.db.urlRepository.drop()
        crawlerdb.db.rawDocs.drop()
        crawlerdb.db.urlRedirects.drop()
        crawlerdb.db.crawlDomainWhitelist.drop()
        for url in CrawlerResponseHandlerTest.whitelist:
            crawlerdb.add_crawl_domain_whitelist(url)

        for message in CrawlerResponseHandlerTest.initial_set:
            ccrawler.handler.handler.HandlerRepository.process("crawl_request", message)

        decoder.decode = CrawlerResponseHandlerTest.decode
        ccrawler.handler.handler.HandlerRepository.process = CrawlerResponseHandlerTest.process

        handler = CrawlerResponseHandler()
        for i in range(len(CrawlerResponseHandlerTest.test_set)):
            test_data = CrawlerResponseHandlerTest.test_set[i]
            common_settings.general_crawl_policies["external_crawl_mode"] = test_data.get("external_crawl_mode", "continue")
            common_settings.general_crawl_policies["url_match_target"] = "whitelist"
            url = test_data["message"]["url"]
            print i, url
            if "url" in test_data["expected_url_info"]:
                url = test_data["expected_url_info"]["url"]

            handler._process(test_data["message"])

            expected_url_info = test_data["expected_url_info"]
            expected_url_info["url"] = url
            #expected_url_info["source"] = test_data["message"]["source"]
            expected_url_info["url_class"] = None
            expected_url_info["parent_url"] = None
            expected_url_info["root_url"] = test_data["message"]["original_url"]
            if "original_url" not in expected_url_info:
                expected_url_info["original_url"] = None
            if "crawl_priority" not in expected_url_info:
                expected_url_info["crawl_priority"] = 3
            if "crawl_depth" not in expected_url_info:
                expected_url_info["crawl_depth"] = 3
            if "discovered_count" not in expected_url_info:
                expected_url_info["discovered_count"] = 1

            if "modified_count" not in expected_url_info:
                expected_url_info["modified_count"] = 0

            url_info = crawlerdb.db.urlRepository.find_one({"url" : url})
            self.assertTrue(url_info is not None)
            self.assertTrue(url_info["last_discovered"] is not None)
            self.assertTrue(url_info["created_time"] is not None)
            if "expected_raw_doc" not in test_data:
                self.assertTrue(url_info["last_modified"] is not None)
                self.assertTrue(url_info["first_modified"] is not None)

            for field, value in expected_url_info.items():
                self.assertTrue(value == url_info[field], "%d,%s,%s,%s" %(i, field, value, url_info[field]))

            raw_doc = crawlerdb.db.rawDocs.find_one({"url" : url})
            test_data["expected_raw_doc"] = test_data.get("expected_raw_doc", {})
            if test_data["expected_raw_doc"] is None:
                self.assertTrue(raw_doc is None)
            else:
                self.assertTrue(raw_doc is not None)
                self._test_raw_doc(test_data["expected_raw_doc"], test_data["message"], raw_doc)

            self.assertTrue(crawlerdb.db.urlRedirects.count() == test_data["url_redirects_count"],
                "%d,%d,%d" % (i, crawlerdb.db.urlRedirects.count(), test_data["url_redirects_count"]))

            self.assertTrue(test_data["new_msg"] == (CrawlerResponseHandlerTest.mock_message_output is not None))
            if test_data["new_msg"]:
                msg = CrawlerResponseHandlerTest.mock_message_output
                CrawlerResponseHandlerTest.mock_message_output = None
                self.assertTrue(msg["url"] == url)

            #mock process finished
            crawlerdb.db.urlRepository.update({"url" : url}, {"$set" : {"crawl_status" : "alive"}})

        crawlerdb.db.urlRepository.drop()
        crawlerdb.db.rawDocs.drop()
        crawlerdb.db.urlRedirects.drop()
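Both tests in Example #3 expect a MongoDB instance reachable on localhost (database test_db) and patch HandlerRepository.process with a class-level stub so that any message the handler would emit is captured in mock_message_output, asserted on, and reset to None. A stub along the following lines would support that pattern; the (message_type, message) signature is inferred from the calls above and may differ from the real method.

import unittest

class CrawlerResponseHandlerTest(unittest.TestCase):  # excerpt, illustrative only
    mock_message_output = None

    @staticmethod
    def process(message_type, message):
        # Capture the outgoing message instead of dispatching it, so the test can
        # inspect its fields and clear the slot again after the assertion.
        CrawlerResponseHandlerTest.mock_message_output = message

Keeping the stub a staticmethod lets the plain function be assigned directly onto HandlerRepository.process without Python 2 unbound-method binding issues.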