def test__make_request(self):
    """Check which mocked responses ``_make_request`` accepts.

    ``Crawler.requests`` is patched, so ``requests.get`` returns a single
    mock response whose status code and headers are mutated between calls.
    The test expects ``None`` for a non-200 status and for a non-HTML
    content type, and the mocked ``text`` value for a 200 ``text/html``
    response.
    """
    c = Crawler("http://test.com/")
    with mock.patch("Crawler.requests") as mock_requests:
        mock_requests.get.return_value = mock_response = mock.Mock()
        # Sentinel body: any value distinguishable from None works here.
        mock_response.text = True

        # Make sure it ignores non-200 responses
        mock_response.status_code = 404
        self.assertIsNone(c._make_request(""))

        mock_response.status_code = 200

        # Make sure it ignores non-html responses
        mock_response.headers = {"content-type": "text/javascript"}
        self.assertIsNone(c._make_request(""))

        # Make sure a 200 html response hands back the mocked text value
        mock_response.headers = {"content-type": "text/html"}
        self.assertEqual(c._make_request(""), True)
def test__process_next_url_blacklist(self):
    """Drive ``_process_next_url`` twice with a pre-blacklisted URL queued.

    ``_make_request`` is stubbed to always return ``None`` and
    ``_process_html`` is replaced by a mock, so the test only observes how
    the queue and ``bad_urls`` sizes change and that no HTML is processed.
    """
    blacklisted_url = "http://a.com/a/b/c/"

    # Arrange: one URL that is both blacklisted and waiting in the queue.
    crawler = Crawler("http://a.com")
    crawler.bad_urls = {blacklisted_url: True}
    crawler.process_q.append(blacklisted_url)
    crawler._make_request = mock.Mock(return_value=None)
    crawler._process_html = mock.Mock()

    # First step: the queue still holds one entry and bad_urls has grown.
    # NOTE(review): presumably a related URL is enqueued and blacklisted
    # by the crawler here -- confirm against Crawler._process_next_url.
    crawler._process_next_url()
    queue_size_after_first = len(crawler.process_q)
    bad_count_after_first = len(crawler.bad_urls)
    self.assertEqual(queue_size_after_first, 1)
    self.assertEqual(bad_count_after_first, 2)

    # Second step: the queue drains and bad_urls stays the same size.
    crawler._process_next_url()
    queue_size_after_second = len(crawler.process_q)
    bad_count_after_second = len(crawler.bad_urls)
    self.assertEqual(queue_size_after_second, 0)
    self.assertEqual(bad_count_after_second, 2)

    # Nothing should ever have reached the HTML processing stage.
    html_calls = crawler._process_html.call_count
    self.assertEqual(html_calls, 0)