def test_should_extract_images_from_srcset(self):
    # Every candidate URL in an <img srcset> attribute should be reported as
    # its own 'img-srcset' asset, in the order written, without the width
    # descriptor, and attributed to the page URL under test.
    markup = """ <html><body><img srcset="image-320w.jpg 320w, image-480w.jpg 480w, image-800w.jpg 800w"></body></html> """
    document = BeautifulSoup(markup, "lxml")
    expected = [
        Asset(resource=filename, kind='img-srcset', initiator=self.test_url)
        for filename in ('image-320w.jpg', 'image-480w.jpg', 'image-800w.jpg')
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_from_video(self):
    # A <video> element carrying both `src` and `poster` should produce two
    # assets: the media file ('video-src') followed by the poster image
    # ('video-poster'), both attributed to the page URL under test.
    markup = """<html><body> <video controls src="https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4" poster="https://peach.blender.org/wp-content/uploads/title_anouncement.jpg?x11217" width="620"></body></html> """
    document = BeautifulSoup(markup, "lxml")
    media_asset = Asset(
        resource='https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4',
        kind='video-src',
        initiator=self.test_url,
    )
    poster_asset = Asset(
        resource='https://peach.blender.org/wp-content/uploads/title_anouncement.jpg?x11217',
        kind='video-poster',
        initiator=self.test_url,
    )
    self.assertEqual(
        [media_asset, poster_asset],
        extract_assets(document, self.test_url),
    )
def perform_scan(url: str, permitted_domains: List[str]) -> ScanResult:
    """Scan the landing page at *url* and collect the findings in a ScanResult.

    The result fields are accumulated in a dict seeded with ``live=False`` and
    ``landing_page_url=url``; each later parsing step merges its own fields
    into that dict (and may therefore overwrite earlier ones).

    Args:
        url: Address of the landing page to fetch.
        permitted_domains: Domains passed to ``parse_assets`` in addition to
            the registered domain of the page that was actually fetched.

    Returns:
        A ``ScanResult`` built from the accumulated fields. If fetching the
        page raises ``requests.exceptions.RequestException``, the result
        contains only the seed fields plus ``http_status_200_ok=False``.
    """
    fields = {
        'live': False,
        'landing_page_url': url,
    }

    try:
        response, document = request_and_scrape_page(url)
    except requests.exceptions.RequestException:
        # The base class covers timeouts, invalid HTTP responses and other
        # network problems; in all of those cases no further checks can run.
        fields['http_status_200_ok'] = False
        return ScanResult(**fields)

    # HTTP-level findings first, then findings from the parsed markup.
    fields.update(parse_page_data(response))
    fields.update(parse_soup_data(document))

    # Assets are judged against the fetched page's own registered domain
    # followed by the caller-supplied permitted domains.
    found_assets = extract_assets(document, response.url)
    allowed_domains = (
        [tldextract.extract(response.url).registered_domain] + permitted_domains
    )
    fields.update(parse_assets(found_assets, allowed_domains))

    # HTTPS findings come from the first (only) inspected domain.
    inspected = inspect_domains([url_to_domain(response.url)], {'timeout': 10})
    fields.update(parse_pshtt_data(inspected[0]))

    return ScanResult(**fields)
def test_should_extract_urls_from_embeds(self):
    # The `src` of an <embed> element should be reported as a single
    # 'embed-src' asset attributed to the page URL under test.
    markup = """<html><body><embed type="video/quicktime" src="movie.mov" width="640" height="480"></body></html> """
    document = BeautifulSoup(markup, "lxml")
    expected = [
        Asset(resource='movie.mov', kind='embed-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_from_audio(self):
    # The `src` of an <audio> element should be reported as a single
    # 'audio-src' asset attributed to the page URL under test.
    markup = """<html><body><audio src="audio.wav"></body></html> """
    document = BeautifulSoup(markup, "lxml")
    expected = [
        Asset(resource='audio.wav', kind='audio-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_in_inline_css(self):
    # A url(...) reference inside an element's `style` attribute should be
    # reported as a 'style-resource-inline' asset attributed to the page URL
    # under test.
    markup = """<html> <body style="background-image: url('https://example.org/files/example.png')"></body></html>"""
    document = BeautifulSoup(markup, "lxml")
    background_image = Asset(
        resource='https://example.org/files/example.png',
        kind='style-resource-inline',
        initiator=self.test_url,
    )
    self.assertEqual(
        [background_image],
        extract_assets(document, self.test_url),
    )
def test_should_extract_images(self):
    # The `src` of an <img> element should be reported as a single 'img-src'
    # asset attributed to the page URL under test.
    markup = """ <html><body><img src="image.jpg"></body></html> """
    document = BeautifulSoup(markup, "lxml")
    expected = [
        Asset(resource='image.jpg', kind='img-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_embedded_scripts_with_urls(self):
    # A URL appearing in the text of an inline <script> should be reported as
    # a 'script-embed' asset attributed to the page URL under test.
    html = """ <html><head><script>var url = 'http://www.example.org';</script></head><body></body></html> """
    soup = BeautifulSoup(html, "lxml")
    # Expected value goes first, matching the other extraction tests in this
    # file, so a failure diff reads the same way round everywhere.
    self.assertEqual(
        [
            Asset(
                resource='http://www.example.org',
                kind='script-embed',
                initiator=self.test_url,
            )
        ],
        extract_assets(soup, self.test_url),
    )
def test_should_extract_external_scripts(self, mock_requests):
    # A <script src=...> should be reported as a single 'script-src' asset.
    # The mocked HTTP fetch returns an empty body, so the script's content
    # contributes no further assets.
    empty_response = mock.Mock(text='')
    mock_requests.get.return_value = empty_response

    markup = """ <html><head><script src="script.js"></head><body></body></html> """
    document = BeautifulSoup(markup, "lxml")
    expected = [
        Asset(resource='script.js', kind='script-src', initiator=self.test_url),
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_from_iframes(self):
    # The `src` of an <iframe> should be reported as a single 'iframe-src'
    # asset attributed to the page URL under test.
    markup = """ <html><body><iframe src="https://www.example.org/embed.html"></iframe></body></html> """
    document = BeautifulSoup(markup, "lxml")
    framed_page = Asset(
        resource='https://www.example.org/embed.html',
        kind='iframe-src',
        initiator=self.test_url,
    )
    self.assertEqual(
        [framed_page],
        extract_assets(document, self.test_url),
    )
def test_should_extract_links_to_stylesheets(self, mock_requests):
    # A <link rel="stylesheet"> should be reported as a single 'style-href'
    # asset attributed to the page URL under test. `mock_requests` is
    # accepted but not configured by this test.
    markup = """ <html><head><link href="/media/example.css" rel="stylesheet"></head><body></body></html> """
    document = BeautifulSoup(markup, "lxml")
    stylesheet = Asset(
        resource='/media/example.css',
        kind='style-href',
        initiator=self.test_url,
    )
    self.assertEqual(
        [stylesheet],
        extract_assets(document, self.test_url),
    )
def test_should_extract_urls_in_embedded_css(self):
    # A url(...) reference inside a <style> element should be reported as a
    # 'style-embed' asset attributed to the page URL under test.
    markup = """<html><head><style> div { background-image: url("https://example.org/files/example.png"); } </style></head><body></body></html>"""
    document = BeautifulSoup(markup, "lxml")
    background_image = Asset(
        resource='https://example.org/files/example.png',
        kind='style-embed',
        initiator=self.test_url,
    )
    self.assertEqual(
        [background_image],
        extract_assets(document, self.test_url),
    )
def test_should_extract_urls_from_sources(self):
    # Each <source src=...> nested in a <video> should be reported as its own
    # 'source-src' asset, in document order, attributed to the page URL under
    # test.
    markup = """<html><body> <video> <source src="video.webm" type="video/webm"> <source src="video.ogg" type="video/ogg"> <source src="video.mov" type="video/quicktime"> </video></body></html> """
    document = BeautifulSoup(markup, "lxml")
    expected = [
        Asset(resource=filename, kind='source-src', initiator=self.test_url)
        for filename in ('video.webm', 'video.ogg', 'video.mov')
    ]
    self.assertEqual(expected, extract_assets(document, self.test_url))
def test_should_extract_urls_in_external_js(self, mock_requests):
    # The mocked HTTP fetch returns JavaScript that contains a URL. The
    # script itself should be reported as 'script-src' (initiated by the
    # page), followed by the URL found in its body as 'script-resource'
    # (initiated by the script file).
    mock_requests.get.return_value = mock.Mock(
        text="""function makeRequest() { $.getJSON('http://example.org/', function(data) {}); }"""
    )
    # The `src` attribute is well-formed on purpose: this test is about
    # following external scripts, not about the parser recovering from a
    # stray quote in the markup.
    html = """ <html><head><script src="file.js"></head><body></body></html> """
    soup = BeautifulSoup(html, "lxml")
    self.assertEqual(
        [
            Asset(resource='file.js', kind='script-src', initiator=self.test_url),
            Asset(
                resource='http://example.org/',
                kind='script-resource',
                initiator='file.js',
            ),
        ],
        extract_assets(soup, self.test_url),
    )
def test_should_extract_urls_in_linked_css(self, requests_mock):
    # The mocked HTTP fetch returns CSS that contains a url(...) reference.
    # The stylesheet link should be reported as 'style-href' (initiated by
    # the page) and the referenced image as 'style-resource' (initiated by
    # the stylesheet).
    requests_mock.get.return_value = mock.Mock(
        text='selector { background-image: url("https://example.org/example.png") }'
    )
    html = """ <html><head><link href="https://example.org/styles.css" rel="stylesheet"></head><body></body></html>"""
    soup = BeautifulSoup(html, "lxml")
    # Compared as sets, so the order of the extracted assets is not checked.
    # Expected value goes first, matching the other extraction tests in this
    # file, so a failure diff reads the same way round everywhere.
    self.assertEqual(
        {
            Asset(
                resource='https://example.org/styles.css',
                kind='style-href',
                initiator=self.test_url,
            ),
            Asset(
                resource='https://example.org/example.png',
                kind='style-resource',
                initiator='https://example.org/styles.css',
            ),
        },
        set(extract_assets(soup, self.test_url)),
    )