def test_parse(self):
    """Robots.callback should report the disallowed path found in robots.txt."""
    def fake_read(parser):
        # Parse the canned ROBOTS fixture instead of doing network I/O.
        parser.parse(ROBOTS.splitlines())

    with patch.object(Robots, 'add_result') as mock_add_result:
        with patch.object(DirhuntRobotFileParser, 'read',
                          side_effect=fake_read, autospec=True):
            Robots(lambda x: x, None).callback('domain.com')
    mock_add_result.assert_called_once_with('http://domain.com/secret/')
def test_https(self):
    """When the plain-HTTP fetch fails, the HTTPS robots.txt result is used."""
    def fake_read(parser):
        # Simulate an unreachable http:// endpoint so the https:// retry wins.
        if parser.url.startswith('http:'):
            raise IOError
        parser.parse(ROBOTS.splitlines())

    with patch.object(Robots, 'add_result') as mock_add_result:
        with patch.object(RobotFileParser, 'read',
                          side_effect=fake_read, autospec=True):
            Robots(lambda x: x).callback('domain.com')
    mock_add_result.assert_called_once_with(
        'https://domain.com/secret/')
def test_abuse(self, m):
    """An abuse page from VirusTotal must be surfaced via add_error."""
    domain = 'domain.com'
    report_url = VT_URL.format(domain=domain)
    m.get(report_url, text=ABUSE)
    with patch.object(VirusTotal, 'add_error') as mock_add_error:
        VirusTotal(lambda x: x, lambda x: x).callback(domain)
    mock_add_error.assert_called()
def test_use_one_line(self):
    """A wide terminal (300 cols) renders the URL info on a single line."""
    html = '<html><title>Foo</title></html>'
    url_info = self._get_url_info()
    self._test_get_data(html, url_info)
    with patch.object(UrlInfo, 'one_line') as mock_one_line:
        url_info.line(300, len(self.url), 0)
    mock_one_line.assert_called_once()
def test_str(self):
    """str(ProcessRedirect) should mention the redirect target URL."""
    with requests_mock.mock() as m:
        m.register_uri('GET', 'http://test.com', text=self.html,
                       headers={'Location': 'http://foo/'}, status_code=300)
        response = requests.get('http://test.com')
        with patch.object(Crawler, 'add_url'):
            processor = ProcessRedirect(response, self.get_crawler_url())
            processor.process('')
            self.assertIn('http://foo/', str(processor))
def test_wordpress(self):
    """A wp-content asset reference flags the crawled URL as wordpress."""
    html = """
    <script src="wp-content/myscripts.js"></script>
    """
    with patch.object(Crawler, 'add_url'):
        processor = ProcessHtmlRequest(None, self.get_crawler_url())
        document = BeautifulSoup(html, 'html.parser')
        processor.assets(document)
    self.assertIn('wordpress', processor.crawler_url.flags)
def test_urls(self, m):
    """Every URL in the VirusTotal detected-urls section is added as a result."""
    domain = 'domain.com'
    report_url = VT_URL.format(domain=domain)
    detect_urls = ['http://{}/{}'.format(domain, i) for i in range(10)]
    divs = '\n'.join(ABUSE_DIV.format(url=detect_url)
                     for detect_url in detect_urls)
    m.get(report_url,
          text='<html><body><div id="detected-urls">{}</div></body></html>'.format(divs))
    with patch.object(VirusTotal, 'add_result') as mock_add_result:
        VirusTotal(lambda x: x, None).callback(domain)
    mock_add_result.assert_has_calls(
        [call(detect_url) for detect_url in detect_urls])
def test_process(self):
    """Processing a 300 response queues the Location header URL for crawling."""
    with requests_mock.mock() as m:
        m.register_uri('GET', 'http://test.com', text=self.html,
                       headers={'Location': 'http://foo/'}, status_code=300)
        response = requests.get('http://test.com')
        with patch.object(Crawler, 'add_url') as mock_method:
            processor = ProcessRedirect(response, self.get_crawler_url())
            processor.process('')
            # Each call's first positional arg is a CrawlerUrl wrapping the target.
            urls = [args[0][0].url.url for args in mock_method.call_args_list]
            self.assertEqual(urls, ['http://foo/'])
def test_process(self):
    """Links, assets and the index fallback are all queued from a page."""
    html = """
    <a href="dir/">dir</a>
    <script src="myscripts.js"></script>
    """
    with patch.object(Crawler, 'add_url') as mock_method:
        processor = ProcessHtmlRequest(None, self.get_crawler_url())
        document = BeautifulSoup(html, 'html.parser')
        processor.process(html, document)
        urls = {args[0][0].url.url for args in mock_method.call_args_list}
        self.assertEqual(urls, {
            'http://domain.com/path/myscripts.js',
            'http://domain.com/path/dir/',
            'http://domain.com/path/index.php',
        })
def test_links(self):
    """Relative, parent and absolute hrefs resolve against the page URL."""
    html = """
    <a href="..">Top</a>
    <a href="dir/">dir</a>
    <a href="foo.php">foo.php</a>
    <a href="/spam/eggs">Eggs</a>
    """
    with patch.object(Crawler, 'add_url') as mock_method:
        processor = ProcessHtmlRequest(None, self.get_crawler_url())
        document = BeautifulSoup(html, 'html.parser')
        processor.links(document)
        urls = [args[0][0].url for args in mock_method.call_args_list]
        self.assertEqual(urls, [
            'http://domain.com/',
            'http://domain.com/path/dir/',
            'http://domain.com/path/foo.php',
            'http://domain.com/spam/eggs',
        ])
def test_assets(self):
    """Stylesheets, scripts and images are queued; invalid protocols skipped."""
    html = """
    <link rel="stylesheet" type="text/css" href="spam/theme.css">
    <script src="myscripts.js"></script>
    <script src="//cnd.extern.com/script.js"></script>
    <img src="/smiley.gif">
    <img src="proto:invalid:url">
    <!-- Ignore -->
    """
    with patch.object(Crawler, 'add_url') as mock_method:
        processor = ProcessHtmlRequest(None, self.get_crawler_url())
        document = BeautifulSoup(html, 'html.parser')
        processor.assets(document)
        urls = [args[0][0].url for args in mock_method.call_args_list]
        self.assertEqual(urls, [
            'http://domain.com/path/spam/theme.css',
            'http://domain.com/path/myscripts.js',
            'http://cnd.extern.com/script.js',
            'http://domain.com/smiley.gif',
        ])
def test_add_init_urls(self):
    """Seeding the crawler registers the URL once and records its domain."""
    crawler = self.get_crawler()
    with patch.object(Crawler, 'add_url') as mock_add_url:
        crawler.add_init_urls(self.url)
    mock_add_url.assert_called_once()
    self.assertEqual(crawler.domains, {'domain.com'})
def test_add_url(self):
    """add_url on a known domain submits work to the executor without raising."""
    crawler = self.get_crawler()
    crawler.domains.add('domain.com')
    crawler_url = CrawlerUrl(crawler, self.url)
    # NOTE(review): no assertion on the mock here — presumably this test only
    # guards against exceptions; consider asserting submit was called.
    with patch.object(ThreadPoolExecutor, 'submit') as mock_submit:
        crawler.add_url(crawler_url)
def test_start_empty(self):
    """Starting with an empty URL list must not submit any work."""
    with patch.object(UrlsInfo, 'submit') as mock_submit:
        UrlsInfo([], Sessions()).start()
    mock_submit.assert_not_called()
def test_callback(self):
    """callback delegates exactly once to _get_url_info."""
    with patch.object(UrlsInfo, '_get_url_info') as mock_get_url_info:
        UrlsInfo([self.url], Sessions()).callback(
            len(self.url), Url(self.url), 0)
    mock_get_url_info.assert_called_once()