Пример #1
0
    def test_crawl_limit(self):
        """Check how often crawl() invokes _process_next_url.

        Three scenarios are run against the same crawler; in each one the
        expected number of calls equals the smaller of the queue length
        and URL_LIMIT.
        """
        crawler = Crawler("http://a.com")
        # Presumably the pause between requests; zeroed so the test does
        # not wait (confirm against Crawler.crawl).
        crawler.SLEEP_TIME = 0

        def consume_one():
            # Stand-in for real processing: just drop the head of the queue.
            crawler.process_q.pop(0)

        step = mock.Mock(side_effect=consume_one)
        crawler._process_next_url = step
        # Stubbed out so crawl() does not run the real sitemap rendering.
        crawler.render_sitemap = mock.Mock()

        def calls_made(queue_len, limit):
            # Start each scenario from a zeroed counter and a fresh queue.
            step.call_count = 0
            crawler.process_q = ["test"] * queue_len
            crawler.URL_LIMIT = limit
            crawler.crawl()
            return step.call_count

        # Queue shorter than the limit: every queued entry is processed.
        self.assertEqual(calls_made(5, 10), 5)
        # Limit shorter than the queue: processing stops at the limit.
        self.assertEqual(calls_made(10, 5), 5)
        # Unbounded limit: the whole queue is processed.
        self.assertEqual(calls_made(10, float("inf")), 10)
Пример #2
0
    def test__process_next_url_blacklist(self):
        """Check _process_next_url with a pre-blacklisted URL in the queue.

        The request helper is mocked to return None. Each call is expected
        to shrink the queue by one, bad_urls is expected to hold two
        entries after both calls, and _process_html must never be called.
        """
        blocked_url = "http://a.com/a/b/c/"

        crawler = Crawler("http://a.com")
        crawler.bad_urls = {blocked_url: True}
        # NOTE(review): the length checks below imply the queue already
        # holds one entry before this append (presumably the start URL
        # added by the constructor) - confirm against Crawler.__init__.
        crawler.process_q.append(blocked_url)

        html_handler = mock.Mock()
        crawler._process_html = html_handler
        crawler._make_request = mock.Mock(return_value=None)

        # After the first call one entry remains, after the second none;
        # bad_urls is expected to contain two entries both times.
        for remaining in (1, 0):
            crawler._process_next_url()
            self.assertEqual(len(crawler.process_q), remaining)
            self.assertEqual(len(crawler.bad_urls), 2)

        # No HTML processing should have happened for either URL.
        self.assertEqual(html_handler.call_count, 0)