def test_crawl_limit(self):
    """Check how many URLs ``crawl()`` processes for various ``URL_LIMIT`` values.

    ``_process_next_url`` is replaced by a stub that only removes the head
    of ``process_q``, so its call count equals the number of URLs that
    ``crawl()`` attempted to process.
    """
    crawler = Crawler("http://a.com")
    crawler.SLEEP_TIME = 0  # presumably removes the delay between requests

    def consume_one():
        # Stand-in for real processing: drop the first queued entry.
        # Returns None, so the mock's return value is None as well.
        del crawler.process_q[0]

    crawler._process_next_url = mock.Mock(side_effect=consume_one)
    crawler.render_sitemap = mock.Mock()

    def crawl_and_count(queued, limit):
        # Run one crawl over ``queued`` placeholder entries with the given
        # limit and report how often the processing stub was invoked.
        crawler._process_next_url.call_count = 0
        crawler.process_q = ["test"] * queued
        crawler.URL_LIMIT = limit
        crawler.crawl()
        return crawler._process_next_url.call_count

    # Limit above the queue size: every queued entry is processed.
    self.assertEqual(crawl_and_count(5, 10), 5)
    # Limit below the queue size: processing stops at the limit.
    self.assertEqual(crawl_and_count(10, 5), 5)
    # Infinite limit: the whole queue is processed.
    self.assertEqual(crawl_and_count(10, float("inf")), 10)
def test__process_next_url_blacklist(self):
    """Check that ``_process_next_url`` dequeues a URL listed in ``bad_urls`` without processing it."""
    blacklisted_url = "http://a.com/a/b/c/"

    crawler = Crawler("http://a.com")
    # Every request "fails" (returns None); HTML processing is observed via a mock.
    crawler._make_request = mock.Mock(return_value=None)
    crawler._process_html = mock.Mock()

    crawler.bad_urls = {blacklisted_url: True}
    crawler.process_q.append(blacklisted_url)

    # First call: one entry leaves the queue and ``bad_urls`` grows to two.
    # NOTE(review): this implies the constructor pre-seeds ``process_q``
    # (presumably with the start URL) and that a None response gets the
    # URL recorded in ``bad_urls`` -- confirm against Crawler.
    crawler._process_next_url()
    self.assertEqual(len(crawler.process_q), 1)
    self.assertEqual(len(crawler.bad_urls), 2)

    # Second call: the queue empties while ``bad_urls`` stays unchanged.
    crawler._process_next_url()
    self.assertEqual(len(crawler.process_q), 0)
    self.assertEqual(len(crawler.bad_urls), 2)

    # HTML processing was never reached on either call.
    self.assertEqual(crawler._process_html.call_count, 0)