Пример #1
0
 def test_wordpress(self):
     """A script loaded from ``wp-content/`` must flag the crawler URL as wordpress.

     ``Crawler.add_url`` is patched out so that only the flag side effect of
     ``assets`` is observed.
     """
     markup = """
     <script src="wp-content/myscripts.js"></script>
     """
     document = BeautifulSoup(markup, 'html.parser')
     with patch.object(Crawler, 'add_url'):
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.assets(document)
     # The flags live on the crawler URL, so they can be inspected once the
     # patch has been lifted.
     flags = handler.crawler_url.flags
     self.assertIn('wordpress', flags)
Пример #2
0
 def test_process(self):
     """Check which URLs are queued when a page goes through ``process``.

     ``Crawler.add_url`` is mocked and the queued URLs are read back from its
     recorded calls. They are compared as a set, so order is not checked.
     The expected set also holds ``index.php``, which is absent from the
     markup; presumably ``process`` adds it itself (not visible from here).
     """
     markup = """
     <a href="dir/">dir</a>
     <script src="myscripts.js"></script>        
     """
     document = BeautifulSoup(markup, 'html.parser')
     with patch.object(Crawler, 'add_url') as add_url_mock:
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.process(markup, document)
     # Each recorded call is an (args, kwargs) pair; the first positional
     # argument is the queued object, whose ``url.url`` is the hashable value
     # collected for the set comparison.
     queued = set()
     for args, _kwargs in add_url_mock.call_args_list:
         queued.add(args[0].url.url)
     expected = {
         'http://domain.com/path/myscripts.js',
         'http://domain.com/path/dir/',
         'http://domain.com/path/index.php',
     }
     self.assertEqual(queued, expected)
Пример #3
0
 def test_links(self):
     """Anchors must be queued as absolute URLs, in document order.

     The markup covers a parent link (``..``), a relative directory, a
     relative file and a root-relative path. Judging by the expected values,
     ``get_crawler_url()`` points below ``http://domain.com/path/``.
     """
     markup = """
     <a href="..">Top</a>
     <a href="dir/">dir</a>
     <a href="foo.php">foo.php</a>
     <a href="/spam/eggs">Eggs</a>
     """
     document = BeautifulSoup(markup, 'html.parser')
     with patch.object(Crawler, 'add_url') as add_url_mock:
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.links(document)
     # Each recorded call is an (args, kwargs) pair; keep the ``url`` of the
     # first positional argument, preserving call order.
     queued = []
     for args, _kwargs in add_url_mock.call_args_list:
         queued.append(args[0].url)
     expected = [
         'http://domain.com/',
         'http://domain.com/path/dir/',
         'http://domain.com/path/foo.php',
         'http://domain.com/spam/eggs',
     ]
     self.assertEqual(queued, expected)
Пример #4
0
 def test_assets(self):
     """Stylesheets, scripts and images must be queued as absolute URLs.

     The markup mixes a relative stylesheet, a relative script, a
     protocol-relative script on another host and a root-relative image.
     The ``proto:invalid:url`` image is not in the expected list, i.e. it
     must not be queued.
     """
     markup = """
     <link rel="stylesheet" type="text/css" href="spam/theme.css">
     <script src="myscripts.js"></script>
     <script src="//cnd.extern.com/script.js"></script>        
     <img src="/smiley.gif"> 
     <img src="proto:invalid:url"> <!-- Ignore -->
     """
     document = BeautifulSoup(markup, 'html.parser')
     with patch.object(Crawler, 'add_url') as add_url_mock:
         handler = ProcessHtmlRequest(None, self.get_crawler_url())
         handler.assets(document)
     # Each recorded call is an (args, kwargs) pair; keep the ``url`` of the
     # first positional argument, preserving call order.
     queued = []
     for args, _kwargs in add_url_mock.call_args_list:
         queued.append(args[0].url)
     expected = [
         'http://domain.com/path/spam/theme.css',
         'http://domain.com/path/myscripts.js',
         'http://cnd.extern.com/script.js',
         'http://domain.com/smiley.gif',
     ]
     self.assertEqual(queued, expected)