Пример #1
0
 def test_should_extract_urls_from_video(self):
     # A <video> tag must yield its ``src`` asset followed by its ``poster``
     # asset, both attributed to the page under test.
     markup = """<html><body>
     <video controls
            src="https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4"
            poster="https://peach.blender.org/wp-content/uploads/title_anouncement.jpg?x11217"
            width="620"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")

     video_asset = Asset(
         resource='https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4',
         kind='video-src',
         initiator=self.test_url,
     )
     poster_asset = Asset(
         resource='https://peach.blender.org/wp-content/uploads/title_anouncement.jpg?x11217',
         kind='video-poster',
         initiator=self.test_url,
     )
     expected = [video_asset, poster_asset]

     self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #2
0
 def test_should_extract_images_from_srcset(self):
     # Every candidate listed in an <img srcset> becomes its own 'img-srcset'
     # asset; the width descriptors (320w, ...) are not part of the resource.
     markup = """
     <html><body><img srcset="image-320w.jpg 320w,
          image-480w.jpg 480w,
          image-800w.jpg 800w"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")

     candidates = ('image-320w.jpg', 'image-480w.jpg', 'image-800w.jpg')
     expected = [
         Asset(resource=name, kind='img-srcset', initiator=self.test_url)
         for name in candidates
     ]

     self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #3
0
    def test_should_skip_assets_from_non_domains(self):
        # An asset whose resource is the bare string 'not-a-domain' must
        # leave the cross-domain report empty.
        bogus_asset = Asset(
            resource='not-a-domain',
            kind='img-src',
            initiator='z.com',
        )

        report = scanner.parse_assets([bogus_asset], ['z.com', 'b.com'])

        expected_report = {
            'ignored_cross_domain_assets': '',
            'no_cross_domain_assets': True,
            'cross_domain_asset_summary': '',
        }
        self.assertEqual(report, expected_report)
Пример #4
0
 def test_should_extract_urls_from_embeds(self):
     # The ``src`` of an <embed> tag is reported as an 'embed-src' asset.
     markup = """<html><body><embed type="video/quicktime" src="movie.mov" width="640" height="480"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")

     expected = [
         Asset(
             resource='movie.mov',
             kind='embed-src',
             initiator=self.test_url,
         ),
     ]

     self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #5
0
 def test_should_extract_urls_from_audio(self):
     # The ``src`` of an <audio> tag is reported as an 'audio-src' asset.
     markup = """<html><body><audio src="audio.wav"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")

     expected = [
         Asset(
             resource='audio.wav',
             kind='audio-src',
             initiator=self.test_url,
         ),
     ]

     self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #6
0
    def test_should_skip_assets_on_permitted_domains(self):
        # Of three images hosted on a.com, b.com and z.com, only the a.com
        # one (the host missing from the ['z.com', 'b.com'] list) may show
        # up in the cross-domain summary.
        image_urls = (
            'http://a.com/a.gif',
            'http://b.com/b.gif',
            'http://z.com/z.gif',
        )
        assets = [
            Asset(resource=url, kind='img-src', initiator='z.com')
            for url in image_urls
        ]

        report = scanner.parse_assets(assets, ['z.com', 'b.com'])

        expected_report = {
            'ignored_cross_domain_assets': '',
            'no_cross_domain_assets': False,
            'cross_domain_asset_summary': 'z.com\n  * (img-src) http://a.com/a.gif\n',
        }
        self.assertEqual(report, expected_report)
Пример #7
0
 def test_should_extract_urls_from_sources(self):
     # Each <source> child of a <video> contributes one 'source-src' asset,
     # in document order.
     markup = """<html><body>
     <video>
     <source src="video.webm" type="video/webm">
     <source src="video.ogg" type="video/ogg">
     <source src="video.mov" type="video/quicktime">
     </video></body></html>
     """
     document = BeautifulSoup(markup, "lxml")

     source_files = ('video.webm', 'video.ogg', 'video.mov')
     expected = [
         Asset(resource=name, kind='source-src', initiator=self.test_url)
         for name in source_files
     ]

     self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #8
0
    def test_should_extract_urls_in_external_js(self, mock_requests):
        # A URL found in the text returned for an external script must be
        # reported as a 'script-resource' whose initiator is that script.
        # mock_requests is presumably injected by a patch decorator that is
        # not visible here -- TODO confirm.
        script_body = "function makeRequest() { $.getJSON('http://example.org/', function(data) {}); }"
        mock_requests.get.return_value = mock.Mock(text=script_body)

        # NOTE(review): the fixture below has a doubled quote after file.js
        # and no closing </script>; kept verbatim -- confirm whether that is
        # intentional before tidying it.
        markup = """
        <html><head><script src="file.js""></head><body></body></html>
        """
        document = BeautifulSoup(markup, "lxml")

        script_asset = Asset(
            resource='file.js',
            kind='script-src',
            initiator=self.test_url,
        )
        nested_asset = Asset(
            resource='http://example.org/',
            kind='script-resource',
            initiator='file.js',
        )
        expected = [script_asset, nested_asset]

        self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #9
0
 def test_should_extract_urls_in_linked_css(self, requests_mock):
     # A url(...) found in the text returned for a linked stylesheet must be
     # reported as a 'style-resource' whose initiator is that stylesheet.
     # Results are compared as sets, so ordering is not checked here.
     stylesheet_body = 'selector { background-image: url("https://example.org/example.png") }'
     requests_mock.get.return_value = mock.Mock(text=stylesheet_body)

     markup = """
     <html><head><link href="https://example.org/styles.css" rel="stylesheet"></head><body></body></html>"""
     document = BeautifulSoup(markup, "lxml")

     found = set(extract_assets(document, self.test_url))

     stylesheet_asset = Asset(
         resource='https://example.org/styles.css',
         kind='style-href',
         initiator=self.test_url,
     )
     image_asset = Asset(
         resource='https://example.org/example.png',
         kind='style-resource',
         initiator='https://example.org/styles.css',
     )
     self.assertEqual(found, {stylesheet_asset, image_asset})
Пример #10
0
 def test_should_extract_urls_in_inline_css(self):
     # A url(...) inside a ``style`` attribute is reported as a
     # 'style-resource-inline' asset.
     markup = """<html>
     <body style="background-image: url('https://example.org/files/example.png')"></body></html>"""
     document = BeautifulSoup(markup, "lxml")

     expected = [
         Asset(
             resource='https://example.org/files/example.png',
             kind='style-resource-inline',
             initiator=self.test_url,
         ),
     ]

     self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #11
0
    def test_should_extract_images(self):
        # The ``src`` of an <img> tag is reported as an 'img-src' asset.
        markup = """
        <html><body><img src="image.jpg"></body></html>
        """
        document = BeautifulSoup(markup, "lxml")

        expected = [
            Asset(
                resource='image.jpg',
                kind='img-src',
                initiator=self.test_url,
            ),
        ]

        self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #12
0
 def test_should_extract_embedded_scripts_with_urls(self):
     # A URL literal inside an inline <script> is reported as a
     # 'script-embed' asset attributed to the page under test.
     markup = """
     <html><head><script>var url = 'http://www.example.org';</script></head><body></body></html>
     """
     document = BeautifulSoup(markup, "lxml")

     found = extract_assets(document, self.test_url)

     expected = [
         Asset(
             resource='http://www.example.org',
             kind='script-embed',
             initiator=self.test_url,
         ),
     ]
     self.assertEqual(found, expected)
Пример #13
0
    def test_should_extract_external_scripts(self, mock_requests):
        # With an empty script body returned by the mock, only the
        # 'script-src' asset for the <script> tag itself is expected.
        empty_response = mock.Mock(text='')
        mock_requests.get.return_value = empty_response

        markup = """
        <html><head><script src="script.js"></head><body></body></html>
        """
        document = BeautifulSoup(markup, "lxml")

        expected = [
            Asset(
                resource='script.js',
                kind='script-src',
                initiator=self.test_url,
            ),
        ]

        self.assertEqual(expected, extract_assets(document, self.test_url))
Пример #14
0
    def test_should_extract_urls_from_iframes(self):
        # The ``src`` of an <iframe> is reported as an 'iframe-src' asset.
        markup = """
        <html><body><iframe src="https://www.example.org/embed.html"></iframe></body></html>
        """
        document = BeautifulSoup(markup, "lxml")

        iframe_asset = Asset(
            resource='https://www.example.org/embed.html',
            kind='iframe-src',
            initiator=self.test_url,
        )

        self.assertEqual([iframe_asset], extract_assets(document, self.test_url))
Пример #15
0
    def test_should_extract_links_to_stylesheets(self, mock_requests):
        # A <link rel="stylesheet"> is reported as a 'style-href' asset.
        # mock_requests is accepted but never configured in this test; it is
        # presumably injected by a patch decorator outside this view.
        markup = """
        <html><head><link href="/media/example.css" rel="stylesheet"></head><body></body></html>
        """
        document = BeautifulSoup(markup, "lxml")

        stylesheet_asset = Asset(
            resource='/media/example.css',
            kind='style-href',
            initiator=self.test_url,
        )

        self.assertEqual(
            [stylesheet_asset],
            extract_assets(document, self.test_url),
        )
Пример #16
0
    def test_should_extract_urls_in_embedded_css(self):
        # A url(...) inside a <style> element is reported as a 'style-embed'
        # asset attributed to the page under test.
        markup = """<html><head><style>
        div {
          background-image: url("https://example.org/files/example.png");
        }
        </style></head><body></body></html>"""
        document = BeautifulSoup(markup, "lxml")

        background_asset = Asset(
            resource='https://example.org/files/example.png',
            kind='style-embed',
            initiator=self.test_url,
        )

        self.assertEqual(
            [background_asset],
            extract_assets(document, self.test_url),
        )