def test_should_extract_urls_from_video(self):
    """A <video> element yields one asset for ``src`` and one for ``poster``."""
    video_url = (
        'https://archive.org/download/BigBuckBunny_124/Content/'
        'big_buck_bunny_720p_surround.mp4'
    )
    poster_url = (
        'https://peach.blender.org/wp-content/uploads/'
        'title_anouncement.jpg?x11217'
    )
    markup = (
        '<html><body> <video controls src="' + video_url + '"'
        ' poster="' + poster_url + '"'
        ' width="620"></body></html> '
    )
    document = BeautifulSoup(markup, "lxml")

    # The expected order is the video source first, then the poster image.
    expected = [
        Asset(resource=video_url, kind='video-src', initiator=self.test_url),
        Asset(resource=poster_url, kind='video-poster', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_images_from_srcset(self):
    """Every candidate in an <img> ``srcset`` becomes its own asset."""
    markup = (
        ' <html><body><img srcset="image-320w.jpg 320w, '
        'image-480w.jpg 480w, image-800w.jpg 800w"></body></html> '
    )
    document = BeautifulSoup(markup, "lxml")

    # Width descriptors (320w, ...) are not part of the expected resource.
    candidates = ('image-320w.jpg', 'image-480w.jpg', 'image-800w.jpg')
    expected = [
        Asset(resource=name, kind='img-srcset', initiator=self.test_url)
        for name in candidates
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_skip_assets_from_non_domains(self):
    """An asset whose resource is not a URL produces an empty report."""
    bogus_asset = Asset(
        resource='not-a-domain',
        kind='img-src',
        initiator='z.com',
    )
    report = scanner.parse_assets([bogus_asset], ['z.com', 'b.com'])

    # Nothing is ignored, nothing is flagged, and the summary stays empty.
    expected_report = {
        'ignored_cross_domain_assets': '',
        'no_cross_domain_assets': True,
        'cross_domain_asset_summary': '',
    }
    self.assertEqual(report, expected_report)
def test_should_extract_urls_from_embeds(self):
    """The ``src`` of an <embed> element is reported as an ``embed-src`` asset."""
    markup = (
        '<html><body><embed type="video/quicktime" src="movie.mov"'
        ' width="640" height="480"></body></html> '
    )
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(resource='movie.mov', kind='embed-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_from_audio(self):
    """The ``src`` of an <audio> element is reported as an ``audio-src`` asset."""
    markup = '<html><body><audio src="audio.wav"></body></html> '
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(resource='audio.wav', kind='audio-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_skip_assets_on_permitted_domains(self):
    """Only the asset hosted outside the given domain list is summarised."""
    image_urls = (
        'http://a.com/a.gif',
        'http://b.com/b.gif',
        'http://z.com/z.gif',
    )
    assets = [
        Asset(resource=url, kind='img-src', initiator='z.com')
        for url in image_urls
    ]
    report = scanner.parse_assets(assets, ['z.com', 'b.com'])

    # b.com and z.com are in the list passed above, so only a.com is listed.
    expected_report = {
        'ignored_cross_domain_assets': '',
        'no_cross_domain_assets': False,
        'cross_domain_asset_summary': 'z.com\n * (img-src) http://a.com/a.gif\n',
    }
    self.assertEqual(report, expected_report)
def test_should_extract_urls_from_sources(self):
    """Each <source> child of a <video> yields a ``source-src`` asset, in order."""
    markup = (
        '<html><body> <video>'
        ' <source src="video.webm" type="video/webm">'
        ' <source src="video.ogg" type="video/ogg">'
        ' <source src="video.mov" type="video/quicktime">'
        ' </video></body></html> '
    )
    document = BeautifulSoup(markup, "lxml")

    source_files = ('video.webm', 'video.ogg', 'video.mov')
    expected = [
        Asset(resource=name, kind='source-src', initiator=self.test_url)
        for name in source_files
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_in_external_js(self, mock_requests):
    """URLs found inside a fetched external script are reported as assets.

    The script tag itself is expected as a ``script-src`` asset, and the URL
    inside the (mocked) script body as a ``script-resource`` asset whose
    initiator is the script file.

    ``mock_requests`` is presumably injected by a patch decorator that is
    not visible here; its ``get`` is stubbed to return the script body.
    """
    script_body = (
        "function makeRequest() { "
        "$.getJSON('http://example.org/', function(data) {}); }"
    )
    mock_requests.get.return_value = mock.Mock(text=script_body)

    # The fixture previously read src="file.js"" (stray quote), which made
    # the test depend on the parser's recovery from a malformed attribute.
    markup = ' <html><head><script src="file.js"></head><body></body></html> '
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(resource='file.js', kind='script-src', initiator=self.test_url),
        Asset(
            resource='http://example.org/',
            kind='script-resource',
            initiator='file.js',
        ),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_in_linked_css(self, requests_mock):
    """A linked stylesheet and the ``url()`` inside it are both reported.

    ``requests_mock`` is presumably injected by a patch decorator that is
    not visible here; its ``get`` is stubbed to return the stylesheet text.
    """
    stylesheet_text = (
        'selector { background-image: url("https://example.org/example.png") }'
    )
    requests_mock.get.return_value = mock.Mock(text=stylesheet_text)

    markup = (
        ' <html><head><link href="https://example.org/styles.css"'
        ' rel="stylesheet"></head><body></body></html>'
    )
    document = BeautifulSoup(markup, "lxml")

    # Compared as sets, so the order of the extracted assets is not checked.
    found = set(extract_assets(document, self.test_url))
    expected = {
        Asset(
            resource='https://example.org/styles.css',
            kind='style-href',
            initiator=self.test_url,
        ),
        Asset(
            resource='https://example.org/example.png',
            kind='style-resource',
            initiator='https://example.org/styles.css',
        ),
    }
    self.assertEqual(found, expected)
def test_should_extract_urls_in_inline_css(self):
    """A ``url()`` in a ``style`` attribute yields a ``style-resource-inline`` asset."""
    markup = (
        '<html> <body style="background-image: '
        "url('https://example.org/files/example.png')"
        '"></body></html>'
    )
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(
            resource='https://example.org/files/example.png',
            kind='style-resource-inline',
            initiator=self.test_url,
        ),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_images(self):
    """The ``src`` of an <img> element is reported as an ``img-src`` asset."""
    markup = ' <html><body><img src="image.jpg"></body></html> '
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(resource='image.jpg', kind='img-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_embedded_scripts_with_urls(self):
    """A URL literal inside an inline <script> yields a ``script-embed`` asset."""
    markup = (
        " <html><head><script>var url = 'http://www.example.org';</script>"
        "</head><body></body></html> "
    )
    document = BeautifulSoup(markup, "lxml")

    found = extract_assets(document, self.test_url)
    expected = [
        Asset(
            resource='http://www.example.org',
            kind='script-embed',
            initiator=self.test_url,
        ),
    ]
    self.assertEqual(found, expected)
def test_should_extract_external_scripts(self, mock_requests):
    """A <script src=...> tag is reported as a single ``script-src`` asset.

    ``mock_requests`` is presumably injected by a patch decorator that is
    not visible here; its ``get`` is stubbed to return an empty script body.
    """
    empty_response = mock.Mock(text='')
    mock_requests.get.return_value = empty_response

    markup = ' <html><head><script src="script.js"></head><body></body></html> '
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(resource='script.js', kind='script-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_from_iframes(self):
    """The ``src`` of an <iframe> is reported as an ``iframe-src`` asset."""
    markup = (
        ' <html><body><iframe src="https://www.example.org/embed.html">'
        '</iframe></body></html> '
    )
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(
            resource='https://www.example.org/embed.html',
            kind='iframe-src',
            initiator=self.test_url,
        ),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_links_to_stylesheets(self, mock_requests):
    """A stylesheet <link> is reported as a ``style-href`` asset.

    ``mock_requests`` is accepted but never configured or inspected in this
    test; it is presumably injected by a patch decorator not visible here.
    """
    markup = (
        ' <html><head><link href="/media/example.css" rel="stylesheet">'
        '</head><body></body></html> '
    )
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(
            resource='/media/example.css',
            kind='style-href',
            initiator=self.test_url,
        ),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_in_embedded_css(self):
    """A ``url()`` inside a <style> element yields a ``style-embed`` asset."""
    markup = (
        '<html><head><style> div { background-image: '
        'url("https://example.org/files/example.png"); } </style>'
        '</head><body></body></html>'
    )
    document = BeautifulSoup(markup, "lxml")

    expected = [
        Asset(
            resource='https://example.org/files/example.png',
            kind='style-embed',
            initiator=self.test_url,
        ),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))