def test_get_crawl_priority_and_depth(self):
    test_whitelist = {
        "http://www.google.com": {},
        "http://www.sina.com.cn": {"crawl_priority": 2},
        "http://www.sina.cn": {"crawl_priority": 2, "crawl_depth": 3},
    }
    test_set = [
        ("http://www.google.jp", "offline", 1, 2),    # not whitelisted, domain, offline
        ("http://news.sina.com.cn", "online", 2, 0),  # whitelisted, subdomain, online
        ("http://news.sina.com", "online", 0, 0),     # not whitelisted, subdomain, online
        ("http://news.sina.cn/a/b", "offline", 2, 3), # whitelisted, other, offline
    ]
    crawlerdb.config("localhost", database="test_db")
    crawlerdb.db.crawlDomainWhitelist.drop()
    for url, config in test_whitelist.items():
        crawlerdb.save_crawl_domain_info(
            url,
            crawl_priority=config.get("crawl_priority", -1),
            crawl_depth=config.get("crawl_depth", -1))
    for url, source, expected_priority, expected_depth in test_set:
        print url
        priority, depth = CrawlUrlHelper.get_crawl_priority_and_depth(url, source)
        self.assertEqual(priority, expected_priority)
        self.assertEqual(depth, expected_depth)
    crawlerdb.db.crawlDomainWhitelist.drop()
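# A minimal sketch of the policy this test implies, assuming
# get_crawl_priority_and_depth starts from per-source defaults and lets a
# whitelist entry override whichever fields it configures (-1 meaning "not
# configured"). The default table and the naive subdomain match below are
# assumptions inferred from the expected values, not the real ccrawler logic.
def _sketch_priority_and_depth(whitelist, url, source):
    import urlparse
    defaults = {"online": (0, 0), "offline": (1, 2)}  # assumed per-source defaults
    priority, depth = defaults[source]
    host = urlparse.urlparse(url).netloc
    for entry_url, config in whitelist.items():
        entry_host = urlparse.urlparse(entry_url).netloc
        apex = entry_host.split(".", 1)[1]  # naive registered domain, e.g. sina.cn
        if host == entry_host or host.endswith("." + apex):
            if config.get("crawl_priority", -1) != -1:
                priority = config["crawl_priority"]
            if config.get("crawl_depth", -1) != -1:
                depth = config["crawl_depth"]
            break
    return priority, depth
# e.g. _sketch_priority_and_depth(test_whitelist, "http://news.sina.com.cn", "online") == (2, 0)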
def test_match_whitelist(self):
    test_whitelist = [
        "http://www.google.com",
        "http://www.sina.com.cn",
        "http://www.sina.cn",
    ]
    test_set = [
        ("http://www.google.jp", "domain", False),
        ("http://www.google.jp", "full_domain", False),
        ("http://news.sina.com.cn", "full_domain", False),
        ("http://news.sina.com", "full_domain", False),
        ("http://news.sina.com.cn", "host", False),
        ("http://news.sina.cn", "domain", False),
        ("http://news.sina.com.cn/a/b", "host", False),
        ("http://3g.sina.cn/a/b", "domain", False),
    ]
    crawlerdb.config("localhost", database="test_db")
    crawlerdb.db.crawlDomainWhitelist.drop()
    for url in test_whitelist:
        crawlerdb.save_crawl_domain_info(url)
    for url, match_section, expected in test_set:
        common_settings.general_crawl_policies["url_match_domain_type"] = match_section
        common_settings.general_crawl_policies["url_match_target"] = "whitelist"
        self.assertEqual(CrawlUrlHelper.valid_crawl_url(url, None), expected)
    crawlerdb.db.crawlDomainWhitelist.drop()
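# A rough illustration of the three url_match_domain_type granularities this
# test toggles, assuming "host" compares the full netloc, "full_domain" the
# hostname without any port, and "domain" a naively derived registered domain.
# This helper is an illustrative assumption, not CrawlUrlHelper's matcher, and
# the last-two-labels rule is deliberately naive (it would mishandle suffixes
# like .com.cn without a public-suffix list).
def _sketch_match_section(url, match_section):
    import urlparse
    netloc = urlparse.urlparse(url).netloc
    if match_section == "host":
        return netloc                    # e.g. news.sina.com.cn:8080
    hostname = netloc.split(":")[0]
    if match_section == "full_domain":
        return hostname                  # e.g. news.sina.com.cn
    return ".".join(hostname.split(".")[-2:])  # "domain", e.g. sina.cn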
def _test(self):
    crawlerdb.config("localhost", database="test_db")
    crawlerdb.db.urlRepository.drop()
    ccrawler.handler.handler.HandlerRepository.process = CrawlHandlerTest.process
    handler = crawl_handler.CrawlHandler()
    for i in range(len(CrawlHandlerTest.test_set)):
        test_data = CrawlHandlerTest.test_set[i]
        url = test_data["message"]["url"]
        common_settings.general_crawl_policies["external_crawl_mode"] = test_data.get("external_crawl_mode", "continue")
        handler._process(test_data["message"])
        # fields the handler is expected to set on a freshly discovered URL
        expected_url_info = test_data["expected_url_info"]
        expected_url_info["url"] = url
        expected_url_info["source"] = test_data["message"]["source"]
        expected_url_info["url_class"] = None
        expected_url_info["error_message"] = None
        expected_url_info["crawled_count"] = 0
        expected_url_info["last_crawled"] = None
        expected_url_info["crawl_status"] = "crawling"
        expected_url_info["page_last_modified"] = None
        expected_url_info["original_url"] = None
        if "discovered_count" not in expected_url_info:
            expected_url_info["discovered_count"] = 1
        url_info = crawlerdb.db.urlRepository.find_one({"url": url})
        self.assertTrue(url_info is not None)
        self.assertTrue(url_info["last_discovered"] is not None)
        self.assertTrue(url_info["created_time"] is not None)
        for field, value in expected_url_info.items():
            self.assertEqual(value, url_info[field], "%s,%s,%s" % (url, field, test_data))
        # a follow-up message is emitted iff the test case expects one
        self.assertEqual(test_data["new_msg"], CrawlHandlerTest.mock_message_output is not None)
        if test_data["new_msg"]:
            msg = CrawlHandlerTest.mock_message_output
            CrawlHandlerTest.mock_message_output = None
            self.assertEqual(msg["url"], url)
            self.assertEqual(msg["page_last_modified"], expected_url_info["page_last_modified"])
            self.assertEqual(msg["__priority"], expected_url_info["crawl_priority"])
    crawlerdb.db.urlRepository.drop()
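# For reference, the urlRepository document shape implied by the assertions
# above; the field names come from this test, the values are illustrative only.
_EXAMPLE_URL_INFO = {
    "url": "http://www.example.com/",  # hypothetical
    "source": "offline",
    "url_class": None,
    "error_message": None,
    "crawled_count": 0,
    "last_crawled": None,
    "crawl_status": "crawling",
    "page_last_modified": None,
    "original_url": None,
    "discovered_count": 1,
    "last_discovered": None,  # set by the handler; only asserted non-None above
    "created_time": None,     # set by the handler; only asserted non-None above
}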
def test(self):
    common_settings.mqclient = CrawlerResponseHandlerTest.mqclient
    ccrawler.handler.handler.HandlerRepository.init({})
    crawlerdb.config("localhost", database="test_db")
    crawlerdb.db.urlRepository.drop()
    crawlerdb.db.rawDocs.drop()
    crawlerdb.db.urlRedirects.drop()
    crawlerdb.db.crawlDomainWhitelist.drop()
    for url in CrawlerResponseHandlerTest.whitelist:
        crawlerdb.add_crawl_domain_whitelist(url)
    # seed the repository, then swap in the mocked decoder and dispatcher
    for message in CrawlerResponseHandlerTest.initial_set:
        ccrawler.handler.handler.HandlerRepository.process("crawl_request", message)
    decoder.decode = CrawlerResponseHandlerTest.decode
    ccrawler.handler.handler.HandlerRepository.process = CrawlerResponseHandlerTest.process
    handler = CrawlerResponseHandler()
    for i in range(len(CrawlerResponseHandlerTest.test_set)):
        test_data = CrawlerResponseHandlerTest.test_set[i]
        common_settings.general_crawl_policies["external_crawl_mode"] = test_data.get("external_crawl_mode", "continue")
        common_settings.general_crawl_policies["url_match_target"] = "whitelist"
        url = test_data["message"]["url"]
        print i, url
        # a redirected response is stored under its final URL
        if "url" in test_data["expected_url_info"]:
            url = test_data["expected_url_info"]["url"]
        handler._process(test_data["message"])
        expected_url_info = test_data["expected_url_info"]
        expected_url_info["url"] = url
        # expected_url_info["source"] = test_data["message"]["source"]
        expected_url_info["url_class"] = None
        expected_url_info["parent_url"] = None
        expected_url_info["root_url"] = test_data["message"]["original_url"]
        if "original_url" not in expected_url_info:
            expected_url_info["original_url"] = None
        if "crawl_priority" not in expected_url_info:
            expected_url_info["crawl_priority"] = 3
        if "crawl_depth" not in expected_url_info:
            expected_url_info["crawl_depth"] = 3
        if "discovered_count" not in expected_url_info:
            expected_url_info["discovered_count"] = 1
        if "modified_count" not in expected_url_info:
            expected_url_info["modified_count"] = 0
        url_info = crawlerdb.db.urlRepository.find_one({"url": url})
        self.assertTrue(url_info is not None)
        self.assertTrue(url_info["last_discovered"] is not None)
        self.assertTrue(url_info["created_time"] is not None)
        if "expected_raw_doc" not in test_data:
            self.assertTrue(url_info["last_modified"] is not None)
            self.assertTrue(url_info["first_modified"] is not None)
        for field, value in expected_url_info.items():
            self.assertEqual(value, url_info[field], "%d,%s,%s,%s" % (i, field, value, url_info[field]))
        raw_doc = crawlerdb.db.rawDocs.find_one({"url": url})
        test_data["expected_raw_doc"] = test_data.get("expected_raw_doc", {})
        if test_data["expected_raw_doc"] is None:
            self.assertTrue(raw_doc is None)
        else:
            self.assertTrue(raw_doc is not None)
            self._test_raw_doc(test_data["expected_raw_doc"], test_data["message"], raw_doc)
        self.assertEqual(crawlerdb.db.urlRedirects.count(), test_data["url_redirects_count"],
                         "%d,%d,%d" % (i, crawlerdb.db.urlRedirects.count(), test_data["url_redirects_count"]))
        self.assertEqual(test_data["new_msg"], CrawlerResponseHandlerTest.mock_message_output is not None)
        if test_data["new_msg"]:
            msg = CrawlerResponseHandlerTest.mock_message_output
            CrawlerResponseHandlerTest.mock_message_output = None
            self.assertEqual(msg["url"], url)
        # mock process finished so the next iteration starts from a stable state
        crawlerdb.db.urlRepository.update({"url": url}, {"$set": {"crawl_status": "alive"}})
    crawlerdb.db.urlRepository.drop()
    crawlerdb.db.rawDocs.drop()
    crawlerdb.db.urlRedirects.drop()
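# Both handler tests above rely on monkey-patched hooks: HandlerRepository.process
# is replaced by a classmethod that records any re-emitted message in
# mock_message_output instead of dispatching it. A minimal sketch of that
# capture pattern, with the structure assumed from the assertions above:
class _SketchMessageCapture(object):
    mock_message_output = None

    @classmethod
    def process(cls, message_type, message):
        # remember the last message a handler tried to emit; the test reads
        # and resets mock_message_output after each _process call
        cls.mock_message_output = message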