예제 #1
0
    def test_parse(self):
        """Robots.callback should report the disallowed path from robots.txt."""
        def fake_read(parser):
            # Stand-in for DirhuntRobotFileParser.read: feed canned robots data.
            parser.parse(ROBOTS.splitlines())

        with patch.object(Robots, 'add_result') as add_result_mock, \
                patch.object(DirhuntRobotFileParser, 'read',
                             side_effect=fake_read, autospec=True):
            Robots(lambda x: x, None).callback('domain.com')
            add_result_mock.assert_called_once_with('http://domain.com/secret/')
예제 #2
0
    def test_https(self):
        """If plain http robots.txt is unreachable, the https variant is used."""
        def fake_read(parser):
            # Simulate robots.txt being served only over https.
            if parser.url.startswith('http:'):
                raise IOError
            parser.parse(ROBOTS.splitlines())

        with patch.object(Robots, 'add_result') as add_result_mock, \
                patch.object(RobotFileParser, 'read',
                             side_effect=fake_read, autospec=True):
            Robots(lambda x: x).callback('domain.com')
            add_result_mock.assert_called_once_with('https://domain.com/secret/')
예제 #3
0
 def test_abuse(self, m):
     """A VirusTotal abuse page should be reported through add_error."""
     target = 'domain.com'
     m.get(VT_URL.format(domain=target), text=ABUSE)
     with patch.object(VirusTotal, 'add_error') as add_error_mock:
         VirusTotal(lambda x: x, lambda x: x).callback(target)
         add_error_mock.assert_called()
예제 #4
0
 def test_use_one_line(self):
     """line() should delegate to one_line for this width/url combination."""
     markup = '<html><title>Foo</title></html>'
     info = self._get_url_info()
     self._test_get_data(markup, info)
     with patch.object(UrlInfo, 'one_line') as one_line_mock:
         info.line(300, len(self.url), 0)
         one_line_mock.assert_called_once()
예제 #5
0
 def test_str(self):
     """The redirect target must appear in the textual representation."""
     with requests_mock.mock() as m:
         m.register_uri('GET', 'http://test.com', text=self.html,
                        headers={'Location': 'http://foo/'}, status_code=300)
         response = requests.get('http://test.com')
     with patch.object(Crawler, 'add_url'):
         processor = ProcessRedirect(response, self.get_crawler_url())
         processor.process('')
     self.assertIn('http://foo/', str(processor))
예제 #6
0
 def test_wordpress(self):
     """An asset under wp-content should flag the crawl target as wordpress."""
     markup = """
     <script src="wp-content/myscripts.js"></script>
     """
     with patch.object(Crawler, 'add_url'):
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.assets(BeautifulSoup(markup, 'html.parser'))
         self.assertIn('wordpress', handler.crawler_url.flags)
예제 #7
0
 def test_urls(self, m):
     """Every detected url in the VT report must be forwarded to add_result."""
     domain = 'domain.com'
     detected = ['http://{}/{}'.format(domain, i) for i in range(10)]
     divs = '\n'.join([ABUSE_DIV.format(url=detect_url) for detect_url in detected])
     m.get(VT_URL.format(domain=domain),
           text='<html><body><div id="detected-urls">{}</div></body></html>'.format(divs))
     with patch.object(VirusTotal, 'add_result') as add_result_mock:
         VirusTotal(lambda x: x, None).callback(domain)
         add_result_mock.assert_has_calls([call(detect_url) for detect_url in detected])
예제 #8
0
 def test_process(self):
     """ProcessRedirect should enqueue the Location header target."""
     with requests_mock.mock() as m:
         m.register_uri('GET', 'http://test.com', text=self.html,
                        headers={'Location': 'http://foo/'}, status_code=300)
         response = requests.get('http://test.com')
     with patch.object(Crawler, 'add_url') as add_url_mock:
         ProcessRedirect(response, self.get_crawler_url()).process('')
         queued = [args[0][0].url.url for args in add_url_mock.call_args_list]
         self.assertEqual(queued, ['http://foo/'])
예제 #9
0
 def test_process(self):
     """process() should queue links, assets and index guesses from the page."""
     markup = """
     <a href="dir/">dir</a>
     <script src="myscripts.js"></script>        
     """
     with patch.object(Crawler, 'add_url') as add_url_mock:
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.process(markup, BeautifulSoup(markup, 'html.parser'))
         queued = {args[0][0].url.url for args in add_url_mock.call_args_list}
         self.assertEqual(queued, {
             'http://domain.com/path/myscripts.js',
             'http://domain.com/path/dir/',
             'http://domain.com/path/index.php',
         })
예제 #10
0
 def test_links(self):
     """Relative and absolute anchors should resolve against the page url."""
     markup = """
     <a href="..">Top</a>
     <a href="dir/">dir</a>
     <a href="foo.php">foo.php</a>
     <a href="/spam/eggs">Eggs</a>
     """
     with patch.object(Crawler, 'add_url') as add_url_mock:
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.links(BeautifulSoup(markup, 'html.parser'))
         found = [args[0][0].url for args in add_url_mock.call_args_list]
         self.assertEqual(found, [
             'http://domain.com/',
             'http://domain.com/path/dir/',
             'http://domain.com/path/foo.php',
             'http://domain.com/spam/eggs',
         ])
예제 #11
0
 def test_assets(self):
     """Stylesheets, scripts and images are queued; invalid urls are skipped."""
     markup = """
     <link rel="stylesheet" type="text/css" href="spam/theme.css">
     <script src="myscripts.js"></script>
     <script src="//cnd.extern.com/script.js"></script>        
     <img src="/smiley.gif"> 
     <img src="proto:invalid:url"> <!-- Ignore -->
     """
     with patch.object(Crawler, 'add_url') as add_url_mock:
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.assets(BeautifulSoup(markup, 'html.parser'))
         found = [args[0][0].url for args in add_url_mock.call_args_list]
         self.assertEqual(found, [
             'http://domain.com/path/spam/theme.css',
             'http://domain.com/path/myscripts.js',
             'http://cnd.extern.com/script.js',
             'http://domain.com/smiley.gif',
         ])
예제 #12
0
 def test_add_init_urls(self):
     """add_init_urls must queue the url and register its domain."""
     crawler = self.get_crawler()
     with patch.object(Crawler, 'add_url') as add_url_mock:
         crawler.add_init_urls(self.url)
         add_url_mock.assert_called_once()
         self.assertEqual(crawler.domains, {'domain.com'})
예제 #13
0
 def test_add_url(self):
     """add_url should hand an in-domain url to the executor for processing.

     NOTE(review): the original test bound the patched ``submit`` to a name
     but never asserted on it, so it only proved add_url does not raise.
     Assert the submit call so a silently-dropped url fails the test.
     """
     crawler = self.get_crawler()
     # The url's domain must be registered, otherwise add_url ignores it.
     crawler.domains.add('domain.com')
     crawler_url = CrawlerUrl(crawler, self.url)
     with patch.object(ThreadPoolExecutor, 'submit') as submit_mock:
         crawler.add_url(crawler_url)
         submit_mock.assert_called()
예제 #14
0
 def test_start_empty(self):
     """Starting with no urls must not submit any work to the pool."""
     with patch.object(UrlsInfo, 'submit') as submit_mock:
         UrlsInfo([], Sessions()).start()
         submit_mock.assert_not_called()
예제 #15
0
 def test_callback(self):
     """callback should look up the url info exactly once."""
     with patch.object(UrlsInfo, '_get_url_info') as get_info_mock:
         urls_info = UrlsInfo([self.url], Sessions())
         urls_info.callback(len(self.url), Url(self.url), 0)
         get_info_mock.assert_called_once()